1
0
mirror of https://github.com/golang/go synced 2024-11-25 03:27:58 -07:00

html: fix some tokenizer bugs with attribute key/values.

The relevant spec sections are 13.2.4.38-13.2.4.40.
http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#attribute-value-(double-quoted)-state

R=andybalholm
CC=golang-dev
https://golang.org/cl/5262044
This commit is contained in:
Nigel Tao 2011-10-14 15:22:02 +11:00
parent 3153395ed0
commit b82a8e7c22
2 changed files with 115 additions and 104 deletions

View File

@@ -205,14 +205,11 @@ func (z *Tokenizer) readByte() byte {
return x return x
} }
func (z *Tokenizer) savePendingAttr() {
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
}
// skipWhiteSpace skips past any white space. // skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() { func (z *Tokenizer) skipWhiteSpace() {
if z.err != nil {
return
}
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
@@ -332,135 +329,132 @@ func (z *Tokenizer) nextTag() {
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
return return
} }
// Read the tag name, and attribute key/value pairs. // Read the tag name and attribute key/value pairs.
if z.readTagName() { z.readTagName()
for z.readTagAttrKey() && z.readTagAttrVal() { for {
z.savePendingAttr() if z.skipWhiteSpace(); z.err != nil {
break
} }
c := z.readByte()
if z.err != nil || c == '>' {
break
}
z.raw.end--
z.readTagAttrKey()
z.readTagAttrVal()
// Save pendingAttr if it has a non-empty key.
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
} }
// If we didn't get a final ">", assume that it's a text token.
// TODO: this isn't right: html5lib treats "<p x=1" as a tag with one attribute.
if z.err != nil {
z.tt = TextToken
z.data = z.raw
z.attr = z.attr[:0]
return
} }
// Check for a self-closing token. // Check for a self-closing token.
if z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' { if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
z.tt = SelfClosingTagToken z.tt = SelfClosingTagToken
} }
} }
// readTagName sets z.data to the "p" in "<p a=1>" and returns whether the tag // readTagName sets z.data to the "p" in "<p k=v>".
// may have attributes. func (z *Tokenizer) readTagName() {
func (z *Tokenizer) readTagName() (more bool) {
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
return false z.data.end = z.raw.end
return
} }
switch c { switch c {
case ' ', '\n', '\t', '\f', '/': case ' ', '\n', '\r', '\t', '\f':
z.data.end = z.raw.end - 1 z.data.end = z.raw.end - 1
return true return
case '>': case '/', '>':
// We cannot have a self-closing token, since the case above catches z.raw.end--
// the "/" in "<p/>". z.data.end = z.raw.end
z.data.end = z.raw.end - len(">") return
return false
} }
} }
panic("unreachable")
} }
// readTagAttrKey sets z.pendingAttr[0] to the "a" in "<p a=1>" and returns // readTagAttrKey sets z.pendingAttr[0] to the "k" in "<p k=v>".
// whether the tag may have an attribute value. // Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() (more bool) { func (z *Tokenizer) readTagAttrKey() {
if z.skipWhiteSpace(); z.err != nil {
return false
}
z.pendingAttr[0].start = z.raw.end z.pendingAttr[0].start = z.raw.end
z.pendingAttr[0].end = z.raw.end
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
return false z.pendingAttr[0].end = z.raw.end
return
} }
switch c { switch c {
case ' ', '\n', '\r', '\t', '\f', '/': case ' ', '\n', '\r', '\t', '\f', '/':
z.pendingAttr[0].end = z.raw.end - 1 z.pendingAttr[0].end = z.raw.end - 1
return true return
case '=': case '=', '>':
z.raw.end-- z.raw.end--
z.pendingAttr[0].end = z.raw.end z.pendingAttr[0].end = z.raw.end
return true return
case '>':
z.pendingAttr[0].end = z.raw.end - 1
z.savePendingAttr()
return false
} }
} }
panic("unreachable")
} }
// readTagAttrVal sets z.pendingAttr[1] to the "1" in "<p a=1>" and returns // readTagAttrVal sets z.pendingAttr[1] to the "v" in "<p k=v>".
// whether the tag may have more attributes. func (z *Tokenizer) readTagAttrVal() {
func (z *Tokenizer) readTagAttrVal() (more bool) { z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
if z.skipWhiteSpace(); z.err != nil { if z.skipWhiteSpace(); z.err != nil {
return false return
} }
c := z.readByte()
if z.err != nil {
return
}
if c != '=' {
z.raw.end--
return
}
if z.skipWhiteSpace(); z.err != nil {
return
}
quote := z.readByte()
if z.err != nil {
return
}
switch quote {
case '>':
z.raw.end--
return
case '\'', '"':
z.pendingAttr[1].start = z.raw.end
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
return false z.pendingAttr[1].end = z.raw.end
return
} }
if c == '=' { if c == quote {
break z.pendingAttr[1].end = z.raw.end - 1
return
} }
z.raw.end--
return true
}
if z.skipWhiteSpace(); z.err != nil {
return false
} }
const delimAnyWhiteSpace = 1 default:
loop: z.pendingAttr[1].start = z.raw.end - 1
for delim := byte(0); ; { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
return false z.pendingAttr[1].end = z.raw.end
} return
if delim == 0 {
switch c {
case '\'', '"':
delim = c
default:
delim = delimAnyWhiteSpace
z.raw.end--
}
z.pendingAttr[1].start = z.raw.end
continue
} }
switch c { switch c {
case '/', '>': case ' ', '\n', '\r', '\t', '\f':
z.pendingAttr[1].end = z.raw.end - 1
return
case '>':
z.raw.end-- z.raw.end--
z.pendingAttr[1].end = z.raw.end z.pendingAttr[1].end = z.raw.end
break loop return
case ' ', '\n', '\r', '\t', '\f': }
if delim != delimAnyWhiteSpace {
continue
}
fallthrough
case delim:
z.pendingAttr[1].end = z.raw.end - 1
break loop
} }
} }
return true
} }
// nextText reads all text up until an '<'. // nextText reads all text up until an '<'.

View File

@@ -52,21 +52,38 @@ var tokenTests = []tokenTest{
`<p </p>`, `<p </p>`,
`<p <="" p="">`, `<p <="" p="">`,
}, },
/*
// TODO: re-enable these tests when they work. This input/output matches html5lib's behavior.
{ {
"malformed tag #2", "malformed tag #2",
`<p id`,
`<p id="">`,
},
{
"malformed tag #3",
`<p id=`,
`<p id="">`,
},
{
"malformed tag #4",
`<p id=>`,
`<p id="">`,
},
{
"malformed tag #5",
`<p id=0`,
`<p id="0">`,
},
{
"malformed tag #6",
`<p id=0</p>`, `<p id=0</p>`,
`<p id="0&lt;/p">`, `<p id="0&lt;/p">`,
}, },
{ {
"malformed tag #3", "malformed tag #7",
`<p id="0</p>`, `<p id="0</p>`,
`<p id="0&lt;/p&gt;">`, `<p id="0&lt;/p&gt;">`,
}, },
*/
{ {
"malformed tag #4", "malformed tag #8",
`<p id="0"</p>`, `<p id="0"</p>`,
`<p id="0" <="" p="">`, `<p id="0" <="" p="">`,
}, },