mirror of
https://github.com/golang/go
synced 2024-11-22 02:54:39 -07:00
html: tokenize "a < b" as one whole text token.
R=andybalholm CC=golang-dev https://golang.org/cl/5284042
This commit is contained in:
parent
1135fc3978
commit
1887907fee
@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
|
||||
z.nextBogusComment()
|
||||
return
|
||||
default:
|
||||
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
|
||||
z.nextText()
|
||||
return
|
||||
}
|
||||
// Read the tag name and attribute key/value pairs.
|
||||
z.readTagName()
|
||||
if z.skipWhiteSpace(); z.err != nil {
|
||||
z.tt = ErrorToken
|
||||
return
|
||||
}
|
||||
for {
|
||||
if z.skipWhiteSpace(); z.err != nil {
|
||||
break
|
||||
}
|
||||
c := z.readByte()
|
||||
if z.err != nil || c == '>' {
|
||||
break
|
||||
@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
|
||||
if z.pendingAttr[0].start != z.pendingAttr[0].end {
|
||||
z.attr = append(z.attr, z.pendingAttr)
|
||||
}
|
||||
if z.skipWhiteSpace(); z.err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
// Check for a self-closing token.
|
||||
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
|
||||
@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
|
||||
}
|
||||
}
|
||||
|
||||
// nextText reads all text up until an '<'.
|
||||
// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
|
||||
// nextText reads all text up until a start tag "<a", end tag "</a", comment
|
||||
// "<!" or XML processing instruction "<?".
|
||||
// Pre-condition: z.tt == TextToken && z.err == nil &&
|
||||
// z.raw.start + 1 <= z.raw.end.
|
||||
func (z *Tokenizer) nextText() {
|
||||
for {
|
||||
c := z.readByte()
|
||||
if z.err != nil {
|
||||
z.data = z.raw
|
||||
return
|
||||
break
|
||||
}
|
||||
if c == '<' {
|
||||
z.raw.end--
|
||||
z.data = z.raw
|
||||
return
|
||||
if c != '<' {
|
||||
continue
|
||||
}
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
break
|
||||
}
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
|
||||
z.raw.end -= 2
|
||||
break
|
||||
}
|
||||
if c != '/' {
|
||||
continue
|
||||
}
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
break
|
||||
}
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
|
||||
z.raw.end -= 3
|
||||
break
|
||||
}
|
||||
}
|
||||
z.data = z.raw
|
||||
}
|
||||
|
||||
// Next scans the next token and returns its type.
|
||||
|
@ -21,6 +21,11 @@ type tokenTest struct {
|
||||
}
|
||||
|
||||
var tokenTests = []tokenTest{
|
||||
{
|
||||
"empty",
|
||||
"",
|
||||
"",
|
||||
},
|
||||
// A single text node. The tokenizer should not break text nodes on whitespace,
|
||||
// nor should it normalize whitespace within a text node.
|
||||
{
|
||||
@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
|
||||
"<a>b<c/>d</e>",
|
||||
"<a>$b$<c/>$d$</e>",
|
||||
},
|
||||
// Angle brackets that aren't a tag.
|
||||
{
|
||||
"not a tag #0",
|
||||
"<",
|
||||
"<",
|
||||
},
|
||||
{
|
||||
"not a tag #1",
|
||||
"</",
|
||||
"</",
|
||||
},
|
||||
/*
|
||||
// TODO: re-enable these tests when we tokenize them correctly.
|
||||
{
|
||||
"not a tag #2",
|
||||
"</>",
|
||||
"",
|
||||
},
|
||||
{
|
||||
"not a tag #3",
|
||||
"a</>b",
|
||||
"a$b",
|
||||
},
|
||||
*/
|
||||
{
|
||||
"not a tag #4",
|
||||
"</ >",
|
||||
"<!-- -->",
|
||||
},
|
||||
{
|
||||
"not a tag #5",
|
||||
"a < b",
|
||||
"a < b",
|
||||
},
|
||||
{
|
||||
"not a tag #6",
|
||||
"<.>",
|
||||
"<.>",
|
||||
},
|
||||
{
|
||||
"not a tag #7",
|
||||
"a<<<b>>>c",
|
||||
"a<<$<b>$>>c",
|
||||
},
|
||||
{
|
||||
"not a tag #8",
|
||||
"if x<0 and y < 0 then x*y>0",
|
||||
"if x<0 and y < 0 then x*y>0",
|
||||
},
|
||||
// EOF in a tag name.
|
||||
{
|
||||
"tag name eof #0",
|
||||
"<a",
|
||||
"",
|
||||
},
|
||||
{
|
||||
"tag name eof #1",
|
||||
"<a ",
|
||||
"",
|
||||
},
|
||||
{
|
||||
"tag name eof #2",
|
||||
"a<b",
|
||||
"a",
|
||||
},
|
||||
{
|
||||
"tag name eof #3",
|
||||
"<a><b",
|
||||
"<a>",
|
||||
},
|
||||
{
|
||||
"tag name eof #4",
|
||||
`<a x`,
|
||||
`<a x="">`,
|
||||
},
|
||||
// Some malformed tags that are missing a '>'.
|
||||
{
|
||||
"malformed tag #0",
|
||||
@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
|
||||
},
|
||||
{
|
||||
"Attributes with a solitary single quote",
|
||||
"<p id=can't><p id=won't>",
|
||||
"<p id=\"can't\">$<p id=\"won't\">",
|
||||
`<p id=can't><p id=won't>`,
|
||||
`<p id="can't">$<p id="won't">`,
|
||||
},
|
||||
}
|
||||
|
||||
@ -267,15 +347,17 @@ loop:
|
||||
for _, tt := range tokenTests {
|
||||
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
|
||||
z.ReturnComments = true
|
||||
for i, s := range strings.Split(tt.golden, "$") {
|
||||
if z.Next() == ErrorToken {
|
||||
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
|
||||
continue loop
|
||||
}
|
||||
actual := z.Token().String()
|
||||
if s != actual {
|
||||
t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
|
||||
continue loop
|
||||
if tt.golden != "" {
|
||||
for i, s := range strings.Split(tt.golden, "$") {
|
||||
if z.Next() == ErrorToken {
|
||||
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
|
||||
continue loop
|
||||
}
|
||||
actual := z.Token().String()
|
||||
if s != actual {
|
||||
t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
|
||||
continue loop
|
||||
}
|
||||
}
|
||||
}
|
||||
z.Next()
|
||||
|
Loading…
Reference in New Issue
Block a user