1
0
mirror of https://github.com/golang/go synced 2024-11-22 03:24:41 -07:00

html: tokenize "a < b" as one whole text token.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5284042
This commit is contained in:
Nigel Tao 2011-10-16 20:50:11 +11:00
parent 1135fc3978
commit 1887907fee
2 changed files with 128 additions and 23 deletions

View File

@@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
z.nextBogusComment() z.nextBogusComment()
return return
default: default:
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") z.nextText()
return return
} }
// Read the tag name and attribute key/value pairs. // Read the tag name and attribute key/value pairs.
z.readTagName() z.readTagName()
if z.skipWhiteSpace(); z.err != nil {
z.tt = ErrorToken
return
}
for { for {
if z.skipWhiteSpace(); z.err != nil {
break
}
c := z.readByte() c := z.readByte()
if z.err != nil || c == '>' { if z.err != nil || c == '>' {
break break
@@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
if z.pendingAttr[0].start != z.pendingAttr[0].end { if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr) z.attr = append(z.attr, z.pendingAttr)
} }
if z.skipWhiteSpace(); z.err != nil {
break
}
} }
// Check for a self-closing token. // Check for a self-closing token.
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' { if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
@@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
} }
} }
// nextText reads all text up until an '<'. // nextText reads all text up until a start tag "<a", end tag "</a", comment
// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end. // "<!" or XML processing instruction "<?".
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() { func (z *Tokenizer) nextText() {
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw break
return
} }
if c == '<' { if c != '<' {
z.raw.end-- continue
z.data = z.raw }
return c = z.readByte()
if z.err != nil {
break
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
z.raw.end -= 2
break
}
if c != '/' {
continue
}
c = z.readByte()
if z.err != nil {
break
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
z.raw.end -= 3
break
} }
} }
z.data = z.raw
} }
// Next scans the next token and returns its type. // Next scans the next token and returns its type.

View File

@@ -21,6 +21,11 @@ type tokenTest struct {
} }
var tokenTests = []tokenTest{ var tokenTests = []tokenTest{
{
"empty",
"",
"",
},
// A single text node. The tokenizer should not break text nodes on whitespace, // A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node. // nor should it normalize whitespace within a text node.
{ {
@@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
"<a>b<c/>d</e>", "<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>", "<a>$b$<c/>$d$</e>",
}, },
// Angle brackets that aren't a tag.
{
"not a tag #0",
"<",
"&lt;",
},
{
"not a tag #1",
"</",
"&lt;/",
},
/*
// TODO: re-enable these tests when we tokenize them correctly.
{
"not a tag #2",
"</>",
"",
},
{
"not a tag #3",
"a</>b",
"a$b",
},
*/
{
"not a tag #4",
"</ >",
"<!-- -->",
},
{
"not a tag #5",
"a < b",
"a &lt; b",
},
{
"not a tag #6",
"<.>",
"&lt;.&gt;",
},
{
"not a tag #7",
"a<<<b>>>c",
"a&lt;&lt;$<b>$&gt;&gt;c",
},
{
"not a tag #8",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
},
// EOF in a tag name.
{
"tag name eof #0",
"<a",
"",
},
{
"tag name eof #1",
"<a ",
"",
},
{
"tag name eof #2",
"a<b",
"a",
},
{
"tag name eof #3",
"<a><b",
"<a>",
},
{
"tag name eof #4",
`<a x`,
`<a x="">`,
},
// Some malformed tags that are missing a '>'. // Some malformed tags that are missing a '>'.
{ {
"malformed tag #0", "malformed tag #0",
@@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
}, },
{ {
"Attributes with a solitary single quote", "Attributes with a solitary single quote",
"<p id=can't><p id=won't>", `<p id=can't><p id=won't>`,
"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">", `<p id="can&apos;t">$<p id="won&apos;t">`,
}, },
} }
@@ -267,15 +347,17 @@ loop:
for _, tt := range tokenTests { for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true z.ReturnComments = true
for i, s := range strings.Split(tt.golden, "$") { if tt.golden != "" {
if z.Next() == ErrorToken { for i, s := range strings.Split(tt.golden, "$") {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) if z.Next() == ErrorToken {
continue loop t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
} continue loop
actual := z.Token().String() }
if s != actual { actual := z.Token().String()
t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) if s != actual {
continue loop t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
continue loop
}
} }
} }
z.Next() z.Next()