mirror of
https://github.com/golang/go
synced 2024-11-21 21:14:47 -07:00
html: parse malformed tags missing a '>', such as <p id=0</p>.
The additional token_test.go cases match html5lib behavior. Fixes #2124. R=gri CC=golang-dev https://golang.org/cl/4844055
This commit is contained in:
parent
1ac7a69701
commit
37afff2978
@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() {
|
|||||||
if z.err != nil {
|
if z.err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
var tt TokenType
|
|
||||||
switch {
|
switch {
|
||||||
case c == '/':
|
case c == '/':
|
||||||
tt = EndTagToken
|
z.tt = EndTagToken
|
||||||
// Lower-cased characters are more common in tag names, so we check for them first.
|
// Lower-cased characters are more common in tag names, so we check for them first.
|
||||||
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
|
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
|
||||||
tt = StartTagToken
|
z.tt = StartTagToken
|
||||||
case c == '!':
|
case c == '!':
|
||||||
z.nextMarkupDeclaration()
|
z.nextMarkupDeclaration()
|
||||||
return
|
return
|
||||||
@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
case '>':
|
case '>':
|
||||||
z.tt = tt
|
if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
|
||||||
if z.buf[z.p1-2] == '/' && tt == StartTagToken {
|
|
||||||
z.tt = SelfClosingTagToken
|
z.tt = SelfClosingTagToken
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int {
|
|||||||
return k
|
return k
|
||||||
}
|
}
|
||||||
|
|
||||||
// word finds the largest alphabetic [0-9A-Za-z]* word at the start
|
// tagName finds the tag name at the start of z.buf[i:] and returns that name
|
||||||
// of z.buf[i:] and returns that word (optionally lower-cased), as
|
// lower-cased, as well as the trimmed cursor location afterwards.
|
||||||
// well as the trimmed cursor location after that word.
|
func (z *Tokenizer) tagName(i int) ([]byte, int) {
|
||||||
func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
|
|
||||||
i0 := i
|
i0 := i
|
||||||
loop:
|
loop:
|
||||||
for ; i < z.p1; i++ {
|
for ; i < z.p1; i++ {
|
||||||
c := z.buf[i]
|
c := z.buf[i]
|
||||||
switch {
|
switch c {
|
||||||
case '0' <= c && c <= '9':
|
case ' ', '\n', '\t', '\f', '/', '>':
|
||||||
// No-op.
|
|
||||||
case 'A' <= c && c <= 'Z':
|
|
||||||
if lower {
|
|
||||||
z.buf[i] = c + 'a' - 'A'
|
|
||||||
}
|
|
||||||
case 'a' <= c && c <= 'z':
|
|
||||||
// No-op.
|
|
||||||
default:
|
|
||||||
break loop
|
break loop
|
||||||
}
|
}
|
||||||
|
if 'A' <= c && c <= 'Z' {
|
||||||
|
z.buf[i] = c + 'a' - 'A'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return z.buf[i0:i], z.trim(i)
|
||||||
|
}
|
||||||
|
|
||||||
|
// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
|
||||||
|
// and returns that value, as well as the trimmed cursor location afterwards.
|
||||||
|
func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
|
||||||
|
i0 := i
|
||||||
|
loop:
|
||||||
|
for ; i < z.p1; i++ {
|
||||||
|
switch z.buf[i] {
|
||||||
|
case ' ', '\n', '\t', '\f', '>':
|
||||||
|
break loop
|
||||||
|
case '&':
|
||||||
|
// TODO: unescape the entity.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return z.buf[i0:i], z.trim(i)
|
return z.buf[i0:i], z.trim(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
// attrName finds the largest attribute name at the start
|
// attrName finds the largest attribute name at the start
|
||||||
// of z.buf[i:] and returns it lower-cased, as well
|
// of z.buf[i:] and returns it lower-cased, as well
|
||||||
// as the trimmed cursor location after that word.
|
// as the trimmed cursor location after that name.
|
||||||
//
|
//
|
||||||
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
|
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
|
||||||
// TODO: unicode characters
|
// TODO: unicode characters
|
||||||
func (z *Tokenizer) attrName(i int) ([]byte, int) {
|
func (z *Tokenizer) attrName(i int) ([]byte, int) {
|
||||||
|
for z.buf[i] == '/' {
|
||||||
|
i++
|
||||||
|
if z.buf[i] == '>' {
|
||||||
|
return nil, z.trim(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
i0 := i
|
i0 := i
|
||||||
loop:
|
loop:
|
||||||
for ; i < z.p1; i++ {
|
for ; i < z.p1; i++ {
|
||||||
@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
|
|||||||
if z.buf[i] == '/' {
|
if z.buf[i] == '/' {
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
name, z.p0 = z.word(i, true)
|
name, z.p0 = z.tagName(i)
|
||||||
hasAttr = z.p0 != z.p1
|
hasAttr = z.p0 != z.p1
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
|
|||||||
}
|
}
|
||||||
closeQuote := z.buf[i]
|
closeQuote := z.buf[i]
|
||||||
if closeQuote != '\'' && closeQuote != '"' {
|
if closeQuote != '\'' && closeQuote != '"' {
|
||||||
val, z.p0 = z.word(i, false)
|
val, z.p0 = z.unquotedAttrVal(i)
|
||||||
moreAttr = z.p0 != z.p1
|
moreAttr = z.p0 != z.p1
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -41,6 +41,22 @@ var tokenTests = []tokenTest{
|
|||||||
"<a>b<c/>d</e>",
|
"<a>b<c/>d</e>",
|
||||||
"<a>$b$<c/>$d$</e>",
|
"<a>$b$<c/>$d$</e>",
|
||||||
},
|
},
|
||||||
|
// Some malformed tags that are missing a '>'.
|
||||||
|
{
|
||||||
|
"malformed tag #0",
|
||||||
|
`<p</p>`,
|
||||||
|
`<p< p="">`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"malformed tag #1",
|
||||||
|
`<p id=0</p>`,
|
||||||
|
`<p id="0</p">`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"malformed tag #2",
|
||||||
|
`<p id="0</p>`,
|
||||||
|
`<p id="0</p>">`,
|
||||||
|
},
|
||||||
// Comments.
|
// Comments.
|
||||||
{
|
{
|
||||||
"comment0",
|
"comment0",
|
||||||
@ -117,7 +133,6 @@ var tokenTests = []tokenTest{
|
|||||||
"½",
|
"½",
|
||||||
"½",
|
"½",
|
||||||
},
|
},
|
||||||
|
|
||||||
// Attribute tests:
|
// Attribute tests:
|
||||||
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
|
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user