1
0
mirror of https://github.com/golang/go synced 2024-11-25 01:17:56 -07:00

html: fix some tokenizer bugs with attribute key/values.

The relevant spec sections are 13.2.4.38-13.2.4.40.
http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#attribute-value-(double-quoted)-state

R=andybalholm
CC=golang-dev
https://golang.org/cl/5262044
This commit is contained in:
Nigel Tao 2011-10-14 15:22:02 +11:00
parent 3153395ed0
commit b82a8e7c22
2 changed files with 115 additions and 104 deletions

View File

@ -205,14 +205,11 @@ func (z *Tokenizer) readByte() byte {
return x
}
// savePendingAttr appends z.pendingAttr to z.attr, but only when the pending
// attribute's key span (z.pendingAttr[0]) is non-empty, i.e. start != end.
func (z *Tokenizer) savePendingAttr() {
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
}
// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
if z.err != nil {
return
}
for {
c := z.readByte()
if z.err != nil {
@ -332,135 +329,132 @@ func (z *Tokenizer) nextTag() {
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
return
}
// Read the tag name, and attribute key/value pairs.
if z.readTagName() {
for z.readTagAttrKey() && z.readTagAttrVal() {
z.savePendingAttr()
// Read the tag name and attribute key/value pairs.
z.readTagName()
for {
if z.skipWhiteSpace(); z.err != nil {
break
}
c := z.readByte()
if z.err != nil || c == '>' {
break
}
z.raw.end--
z.readTagAttrKey()
z.readTagAttrVal()
// Save pendingAttr if it has a non-empty key.
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
// If we didn't get a final ">", assume that it's a text token.
// TODO: this isn't right: html5lib treats "<p x=1" as a tag with one attribute.
if z.err != nil {
z.tt = TextToken
z.data = z.raw
z.attr = z.attr[:0]
return
}
// Check for a self-closing token.
if z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
z.tt = SelfClosingTagToken
}
}
// readTagName sets z.data to the "p" in "<p a=1>" and returns whether the tag
// may have attributes.
func (z *Tokenizer) readTagName() (more bool) {
// readTagName sets z.data to the "p" in "<p k=v>".
func (z *Tokenizer) readTagName() {
for {
c := z.readByte()
if z.err != nil {
return false
z.data.end = z.raw.end
return
}
switch c {
case ' ', '\n', '\t', '\f', '/':
case ' ', '\n', '\r', '\t', '\f':
z.data.end = z.raw.end - 1
return true
case '>':
// We cannot have a self-closing token, since the case above catches
// the "/" in "<p/>".
z.data.end = z.raw.end - len(">")
return false
return
case '/', '>':
z.raw.end--
z.data.end = z.raw.end
return
}
}
panic("unreachable")
}
// readTagAttrKey sets z.pendingAttr[0] to the "a" in "<p a=1>" and returns
// whether the tag may have an attribute value.
func (z *Tokenizer) readTagAttrKey() (more bool) {
if z.skipWhiteSpace(); z.err != nil {
return false
}
// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<p k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
z.pendingAttr[0].start = z.raw.end
z.pendingAttr[0].end = z.raw.end
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
for {
c := z.readByte()
if z.err != nil {
return false
z.pendingAttr[0].end = z.raw.end
return
}
switch c {
case ' ', '\n', '\r', '\t', '\f', '/':
z.pendingAttr[0].end = z.raw.end - 1
return true
case '=':
return
case '=', '>':
z.raw.end--
z.pendingAttr[0].end = z.raw.end
return true
case '>':
z.pendingAttr[0].end = z.raw.end - 1
z.savePendingAttr()
return false
return
}
}
panic("unreachable")
}
// readTagAttrVal sets z.pendingAttr[1] to the "1" in "<p a=1>" and returns
// whether the tag may have more attributes.
func (z *Tokenizer) readTagAttrVal() (more bool) {
// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<p k=v>".
func (z *Tokenizer) readTagAttrVal() {
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
if z.skipWhiteSpace(); z.err != nil {
return false
return
}
c := z.readByte()
if z.err != nil {
return
}
if c != '=' {
z.raw.end--
return
}
if z.skipWhiteSpace(); z.err != nil {
return
}
quote := z.readByte()
if z.err != nil {
return
}
switch quote {
case '>':
z.raw.end--
return
case '\'', '"':
z.pendingAttr[1].start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
return false
z.pendingAttr[1].end = z.raw.end
return
}
if c == '=' {
break
if c == quote {
z.pendingAttr[1].end = z.raw.end - 1
return
}
z.raw.end--
return true
}
if z.skipWhiteSpace(); z.err != nil {
return false
}
const delimAnyWhiteSpace = 1
loop:
for delim := byte(0); ; {
default:
z.pendingAttr[1].start = z.raw.end - 1
for {
c := z.readByte()
if z.err != nil {
return false
}
if delim == 0 {
switch c {
case '\'', '"':
delim = c
default:
delim = delimAnyWhiteSpace
z.raw.end--
}
z.pendingAttr[1].start = z.raw.end
continue
z.pendingAttr[1].end = z.raw.end
return
}
switch c {
case '/', '>':
case ' ', '\n', '\r', '\t', '\f':
z.pendingAttr[1].end = z.raw.end - 1
return
case '>':
z.raw.end--
z.pendingAttr[1].end = z.raw.end
break loop
case ' ', '\n', '\r', '\t', '\f':
if delim != delimAnyWhiteSpace {
continue
}
fallthrough
case delim:
z.pendingAttr[1].end = z.raw.end - 1
break loop
return
}
}
}
return true
}
// nextText reads all text up until an '<'.

View File

@ -52,21 +52,38 @@ var tokenTests = []tokenTest{
`<p </p>`,
`<p <="" p="">`,
},
/*
// TODO: re-enable these tests when they work. This input/output matches html5lib's behavior.
{
"malformed tag #2",
`<p id`,
`<p id="">`,
},
{
"malformed tag #3",
`<p id=`,
`<p id="">`,
},
{
"malformed tag #4",
`<p id=>`,
`<p id="">`,
},
{
"malformed tag #5",
`<p id=0`,
`<p id="0">`,
},
{
"malformed tag #6",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #3",
"malformed tag #7",
`<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
*/
{
"malformed tag #4",
"malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},