1
0
mirror of https://github.com/golang/go synced 2024-11-25 07:07:57 -07:00

html: tokenize "a < b" as one whole text token.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5284042
This commit is contained in:
Nigel Tao 2011-10-16 20:50:11 +11:00
parent 1135fc3978
commit 1887907fee
2 changed files with 128 additions and 23 deletions

View File

@@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
z.nextBogusComment()
return
default:
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
z.nextText()
return
}
// Read the tag name and attribute key/value pairs.
z.readTagName()
for {
if z.skipWhiteSpace(); z.err != nil {
break
z.tt = ErrorToken
return
}
for {
c := z.readByte()
if z.err != nil || c == '>' {
break
@@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
if z.skipWhiteSpace(); z.err != nil {
break
}
}
// Check for a self-closing token.
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
@@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
}
}
// nextText reads all text up until an '<'.
// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
// nextText reads all text up until a start tag "<a", end tag "</a", comment
// "<!" or XML processing instruction "<?".
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() {
for {
c := z.readByte()
if z.err != nil {
break
}
if c != '<' {
continue
}
c = z.readByte()
if z.err != nil {
break
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
z.raw.end -= 2
break
}
if c != '/' {
continue
}
c = z.readByte()
if z.err != nil {
break
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
z.raw.end -= 3
break
}
}
z.data = z.raw
return
}
if c == '<' {
z.raw.end--
z.data = z.raw
return
}
}
}
// Next scans the next token and returns its type.

View File

@@ -21,6 +21,11 @@ type tokenTest struct {
}
var tokenTests = []tokenTest{
{
"empty",
"",
"",
},
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
@@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
// Angle brackets that aren't a tag.
{
"not a tag #0",
"<",
"&lt;",
},
{
"not a tag #1",
"</",
"&lt;/",
},
/*
// TODO: re-enable these tests when we tokenize them correctly.
{
"not a tag #2",
"</>",
"",
},
{
"not a tag #3",
"a</>b",
"a$b",
},
*/
{
"not a tag #4",
"</ >",
"<!-- -->",
},
{
"not a tag #5",
"a < b",
"a &lt; b",
},
{
"not a tag #6",
"<.>",
"&lt;.&gt;",
},
{
"not a tag #7",
"a<<<b>>>c",
"a&lt;&lt;$<b>$&gt;&gt;c",
},
{
"not a tag #8",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
},
// EOF in a tag name.
{
"tag name eof #0",
"<a",
"",
},
{
"tag name eof #1",
"<a ",
"",
},
{
"tag name eof #2",
"a<b",
"a",
},
{
"tag name eof #3",
"<a><b",
"<a>",
},
{
"tag name eof #4",
`<a x`,
`<a x="">`,
},
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
@@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
},
{
"Attributes with a solitary single quote",
"<p id=can't><p id=won't>",
"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
`<p id=can't><p id=won't>`,
`<p id="can&apos;t">$<p id="won&apos;t">`,
},
}
@@ -267,6 +347,7 @@ loop:
for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true
if tt.golden != "" {
for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
@@ -278,6 +359,7 @@ loop:
continue loop
}
}
}
z.Next()
if z.Error() != os.EOF {
t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())