go/doc: simplify and robustify link detection logic

To fix #5043, we added logic to allow balanced pairs of parenthesis so that we could match URLs like: http://example.com/some_resource(foo) Howewer, such logic breaks when parsing something like the following: art by [https://example.com/person][Person Name]]. such that the following is considered the link: https://example.com/person][Person Since the logic added in #5043 was just a heuristic, we adjust the heuristic that in addition to requiring balanced pairs, the first parenthesis must be an opening one. For further robustness, we apply this heuristic to parenthesis, braces, and brackets. Fixes #22285 Change-Id: I23b728a644e35ce3995b05a79129cad2c1e3b1ce Reviewed-on: https://go-review.googlesource.com/c/94876 Run-TryBot: Robert Griesemer <gri@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Robert Griesemer <gri@golang.org>
2024-11-26 19:51:17 -07:00 · 2018-02-16 14:52:10 -08:00 · 2018-02-16 14:52:10 -08:00 · 190a5f8fd2
commit 190a5f8fd2
parent a6293995c5
2 changed files with 28 additions and 61 deletions
--- a/src/go/doc/comment.go
+++ b/src/go/doc/comment.go
@ -54,7 +54,7 @@ const (
 	identRx = `[\pL_][\pL_0-9]*`

 	// Regexp for URLs
-	// Match parens, and check in pairedParensPrefixLen for balance - see #5043
+	// Match parens, and check later for balance - see #5043, #22285
 	// Match .,:;?! within path, but not at end - see #18139, #16565
 	// This excludes some rare yet valid urls ending in common punctuation
 	// in order to allow sentences ending in URLs.
@ -86,29 +86,6 @@ var (
 	html_endh   = []byte("</h3>\n")
 )

-// pairedParensPrefixLen returns the length of the longest prefix of s containing paired parentheses.
-func pairedParensPrefixLen(s string) int {
-	parens := 0
-	l := len(s)
-	for i, ch := range s {
-		switch ch {
-		case '(':
-			if parens == 0 {
-				l = i
-			}
-			parens++
-		case ')':
-			parens--
-			if parens == 0 {
-				l = len(s)
-			} else if parens < 0 {
-				return i
-			}
-		}
-	}
-	return l
-}
-
 // Emphasize and escape a line of text for HTML. URLs are converted into links;
 // if the URL also appears in the words map, the link is taken from the map (if
 // the corresponding map value is the empty string, the URL is not converted
@ -128,13 +105,27 @@ func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
 		// write text before match
 		commentEscape(w, line[0:m[0]], nice)

-		// adjust match if necessary
+		// adjust match for URLs
 		match := line[m[0]:m[1]]
-		if n := pairedParensPrefixLen(match); n < len(match) {
-			// match contains unpaired parentheses (rare);
-			// redo matching with shortened line for correct indices
-			m = matchRx.FindStringSubmatchIndex(line[:m[0]+n])
-			match = match[:n]
+		if strings.Contains(match, "://") {
+			m0, m1 := m[0], m[1]
+			for _, s := range []string{"()", "{}", "[]"} {
+				open, close := s[:1], s[1:] // E.g., "(" and ")"
+				// require opening parentheses before closing parentheses (#22285)
+				if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
+					m1 = m0 + i
+					match = line[m0:m1]
+				}
+				// require balanced pairs of parentheses (#5043)
+				for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
+					m1 = strings.LastIndexAny(line[:m1], s)
+					match = line[m0:m1]
+				}
+			}
+			if m1 != m[1] {
+				// redo matching with shortened line for correct indices
+				m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
+			}
 		}

 		// analyze match
--- a/src/go/doc/comment_test.go
+++ b/src/go/doc/comment_test.go
@ -151,6 +151,7 @@ func TestToText(t *testing.T) {
 var emphasizeTests = []struct {
 	in, out string
 }{
+	{"", ""},
 	{"http://[::1]:8080/foo.txt", `<a href="http://[::1]:8080/foo.txt">http://[::1]:8080/foo.txt</a>`},
 	{"before (https://www.google.com) after", `before (<a href="https://www.google.com">https://www.google.com</a>) after`},
 	{"before https://www.google.com:30/x/y/z:b::c. After", `before <a href="https://www.google.com:30/x/y/z:b::c">https://www.google.com:30/x/y/z:b::c</a>. After`},
@ -169,7 +170,13 @@ var emphasizeTests = []struct {
 	{"Hello http://example.com/%2f/ /world.", `Hello <a href="http://example.com/%2f/">http://example.com/%2f/</a> /world.`},
 	{"Lorem http: ipsum //host/path", "Lorem http: ipsum //host/path"},
 	{"javascript://is/not/linked", "javascript://is/not/linked"},
+	{"http://foo", `<a href="http://foo">http://foo</a>`},
+	{"art by [[https://www.example.com/person/][Person Name]]", `art by [[<a href="https://www.example.com/person/">https://www.example.com/person/</a>][Person Name]]`},
+	{"please visit (http://golang.org/)", `please visit (<a href="http://golang.org/">http://golang.org/</a>)`},
+	{"please visit http://golang.org/hello())", `please visit <a href="http://golang.org/hello()">http://golang.org/hello()</a>)`},
 	{"http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD", `<a href="http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD">http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD</a>`},
+	{"https://foo.bar/bal/x(])", `<a href="https://foo.bar/bal/x(">https://foo.bar/bal/x(</a>])`}, // inner ] causes (]) to be cut off from URL
+	{"foo [ http://bar(])", `foo [ <a href="http://bar(">http://bar(</a>])`},                      // outer [ causes ]) to be cut off from URL
 }

 func TestEmphasize(t *testing.T) {
@ -183,37 +190,6 @@ func TestEmphasize(t *testing.T) {
 	}
 }

-var pairedParensPrefixLenTests = []struct {
-	in, out string
-}{
-	{"", ""},
-	{"foo", "foo"},
-	{"()", "()"},
-	{"foo()", "foo()"},
-	{"foo()()()", "foo()()()"},
-	{"foo()((()()))", "foo()((()()))"},
-	{"foo()((()()))bar", "foo()((()()))bar"},
-	{"foo)", "foo"},
-	{"foo))", "foo"},
-	{"foo)))))", "foo"},
-	{"(foo", ""},
-	{"((foo", ""},
-	{"(((((foo", ""},
-	{"(foo)", "(foo)"},
-	{"((((foo))))", "((((foo))))"},
-	{"foo()())", "foo()()"},
-	{"foo((()())", "foo"},
-	{"foo((()())) (() foo ", "foo((()())) "},
-}
-
-func TestPairedParensPrefixLen(t *testing.T) {
-	for i, tt := range pairedParensPrefixLenTests {
-		if out := tt.in[:pairedParensPrefixLen(tt.in)]; out != tt.out {
-			t.Errorf("#%d: mismatch\nhave: %q\nwant: %q", i, out, tt.out)
-		}
-	}
-}
-
 func TestCommentEscape(t *testing.T) {
 	commentTests := []struct {
 		in, out string