1
0
mirror of https://github.com/golang/go synced 2024-11-26 19:51:17 -07:00

go/doc: simplify and robustify link detection logic

To fix #5043, we added logic to allow balanced pairs of parenthesis
so that we could match URLs like:
	http://example.com/some_resource(foo)

Howewer, such logic breaks when parsing something like the following:
	art by [https://example.com/person][Person Name]].
such that the following is considered the link:
	https://example.com/person][Person

Since the logic added in #5043 was just a heuristic, we adjust
the heuristic that in addition to requiring balanced pairs,
the first parenthesis must be an opening one.

For further robustness, we apply this heuristic to
parenthesis, braces, and brackets.

Fixes #22285

Change-Id: I23b728a644e35ce3995b05a79129cad2c1e3b1ce
Reviewed-on: https://go-review.googlesource.com/c/94876
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
This commit is contained in:
Joe Tsai 2018-02-16 14:52:10 -08:00 committed by Robert Griesemer
parent a6293995c5
commit 190a5f8fd2
2 changed files with 28 additions and 61 deletions

View File

@ -54,7 +54,7 @@ const (
identRx = `[\pL_][\pL_0-9]*`
// Regexp for URLs
// Match parens, and check in pairedParensPrefixLen for balance - see #5043
// Match parens, and check later for balance - see #5043, #22285
// Match .,:;?! within path, but not at end - see #18139, #16565
// This excludes some rare yet valid urls ending in common punctuation
// in order to allow sentences ending in URLs.
@ -86,29 +86,6 @@ var (
html_endh = []byte("</h3>\n")
)
// pairedParensPrefixLen returns the length of the longest prefix of s containing paired parentheses.
func pairedParensPrefixLen(s string) int {
parens := 0
l := len(s)
for i, ch := range s {
switch ch {
case '(':
if parens == 0 {
l = i
}
parens++
case ')':
parens--
if parens == 0 {
l = len(s)
} else if parens < 0 {
return i
}
}
}
return l
}
// Emphasize and escape a line of text for HTML. URLs are converted into links;
// if the URL also appears in the words map, the link is taken from the map (if
// the corresponding map value is the empty string, the URL is not converted
@ -128,13 +105,27 @@ func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
// write text before match
commentEscape(w, line[0:m[0]], nice)
// adjust match if necessary
// adjust match for URLs
match := line[m[0]:m[1]]
if n := pairedParensPrefixLen(match); n < len(match) {
// match contains unpaired parentheses (rare);
// redo matching with shortened line for correct indices
m = matchRx.FindStringSubmatchIndex(line[:m[0]+n])
match = match[:n]
if strings.Contains(match, "://") {
m0, m1 := m[0], m[1]
for _, s := range []string{"()", "{}", "[]"} {
open, close := s[:1], s[1:] // E.g., "(" and ")"
// require opening parentheses before closing parentheses (#22285)
if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
m1 = m0 + i
match = line[m0:m1]
}
// require balanced pairs of parentheses (#5043)
for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
m1 = strings.LastIndexAny(line[:m1], s)
match = line[m0:m1]
}
}
if m1 != m[1] {
// redo matching with shortened line for correct indices
m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
}
}
// analyze match

View File

@ -151,6 +151,7 @@ func TestToText(t *testing.T) {
var emphasizeTests = []struct {
in, out string
}{
{"", ""},
{"http://[::1]:8080/foo.txt", `<a href="http://[::1]:8080/foo.txt">http://[::1]:8080/foo.txt</a>`},
{"before (https://www.google.com) after", `before (<a href="https://www.google.com">https://www.google.com</a>) after`},
{"before https://www.google.com:30/x/y/z:b::c. After", `before <a href="https://www.google.com:30/x/y/z:b::c">https://www.google.com:30/x/y/z:b::c</a>. After`},
@ -169,7 +170,13 @@ var emphasizeTests = []struct {
{"Hello http://example.com/%2f/ /world.", `Hello <a href="http://example.com/%2f/">http://example.com/%2f/</a> /world.`},
{"Lorem http: ipsum //host/path", "Lorem http: ipsum //host/path"},
{"javascript://is/not/linked", "javascript://is/not/linked"},
{"http://foo", `<a href="http://foo">http://foo</a>`},
{"art by [[https://www.example.com/person/][Person Name]]", `art by [[<a href="https://www.example.com/person/">https://www.example.com/person/</a>][Person Name]]`},
{"please visit (http://golang.org/)", `please visit (<a href="http://golang.org/">http://golang.org/</a>)`},
{"please visit http://golang.org/hello())", `please visit <a href="http://golang.org/hello()">http://golang.org/hello()</a>)`},
{"http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD", `<a href="http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD">http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD</a>`},
{"https://foo.bar/bal/x(])", `<a href="https://foo.bar/bal/x(">https://foo.bar/bal/x(</a>])`}, // inner ] causes (]) to be cut off from URL
{"foo [ http://bar(])", `foo [ <a href="http://bar(">http://bar(</a>])`}, // outer [ causes ]) to be cut off from URL
}
func TestEmphasize(t *testing.T) {
@ -183,37 +190,6 @@ func TestEmphasize(t *testing.T) {
}
}
var pairedParensPrefixLenTests = []struct {
in, out string
}{
{"", ""},
{"foo", "foo"},
{"()", "()"},
{"foo()", "foo()"},
{"foo()()()", "foo()()()"},
{"foo()((()()))", "foo()((()()))"},
{"foo()((()()))bar", "foo()((()()))bar"},
{"foo)", "foo"},
{"foo))", "foo"},
{"foo)))))", "foo"},
{"(foo", ""},
{"((foo", ""},
{"(((((foo", ""},
{"(foo)", "(foo)"},
{"((((foo))))", "((((foo))))"},
{"foo()())", "foo()()"},
{"foo((()())", "foo"},
{"foo((()())) (() foo ", "foo((()())) "},
}
func TestPairedParensPrefixLen(t *testing.T) {
for i, tt := range pairedParensPrefixLenTests {
if out := tt.in[:pairedParensPrefixLen(tt.in)]; out != tt.out {
t.Errorf("#%d: mismatch\nhave: %q\nwant: %q", i, out, tt.out)
}
}
}
func TestCommentEscape(t *testing.T) {
commentTests := []struct {
in, out string