1
0
mirror of https://github.com/golang/go synced 2024-11-25 08:57:58 -07:00

html: rewrite the tokenizer to be more consistent.

Previously, the tokenizer made two passes per token. The first pass
established the token boundary. The second pass picked out the tag name
and attributes inside that boundary. This was problematic when the two
passes disagreed. For example, "<p id=can't><p id=won't>" caused an
infinite loop because the first pass skipped everything inside the
single quotes, and recognized only one token, but the second pass never
got past the first '>'.

This change rewrites the tokenizer to use one pass, accumulating the
boundary points of token text, tag names, attribute keys and attribute
values as it looks for the token endpoint.

It should still be reasonably efficient: text, names, keys and values
are not lower-cased or unescaped (and converted from []byte to string)
until asked for.

One of the token_test test cases was fixed to be consistent with
html5lib. Three more test cases were temporarily disabled, and will be
re-enabled in a follow-up CL. All the parse_test test cases pass.

R=andybalholm, gri
CC=golang-dev
https://golang.org/cl/5244061
This commit is contained in:
Nigel Tao 2011-10-14 09:58:39 +11:00
parent 6198336bb5
commit a49b8b9875
3 changed files with 278 additions and 233 deletions

View File

@ -183,6 +183,16 @@ func unescape(b []byte) []byte {
return b return b
} }
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
func lower(b []byte) []byte {
for i, c := range b {
if 'A' <= c && c <= 'Z' {
b[i] = c + 'a' - 'A'
}
}
return b
}
const escapedChars = `&'<>"` const escapedChars = `&'<>"`
func escape(w writer, s string) os.Error { func escape(w writer, s string) os.Error {

View File

@ -107,6 +107,12 @@ func (t Token) String() string {
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
} }
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
start, end int
}
// A Tokenizer returns a stream of HTML Tokens. // A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct { type Tokenizer struct {
// If ReturnComments is set, Next returns comment tokens; // If ReturnComments is set, Next returns comment tokens;
@ -115,7 +121,7 @@ type Tokenizer struct {
// r is the source of the HTML text. // r is the source of the HTML text.
r io.Reader r io.Reader
// tt is the TokenType of the most recently read token. // tt is the TokenType of the current token.
tt TokenType tt TokenType
// err is the first error encountered during tokenization. It is possible // err is the first error encountered during tokenization. It is possible
// for tt != Error && err != nil to hold: this means that Next returned a // for tt != Error && err != nil to hold: this means that Next returned a
@ -125,10 +131,19 @@ type Tokenizer struct {
// subsequent Next calls would return an ErrorToken. // subsequent Next calls would return an ErrorToken.
// err is never reset. Once it becomes non-nil, it stays non-nil. // err is never reset. Once it becomes non-nil, it stays non-nil.
err os.Error err os.Error
// buf[p0:p1] holds the raw data of the most recent token. // buf[raw.start:raw.end] holds the raw bytes of the current token.
// buf[p1:] is buffered input that will yield future tokens. // buf[raw.end:] is buffered input that will yield future tokens.
p0, p1 int raw span
buf []byte buf []byte
// buf[data.start:data.end] holds the raw bytes of the current token's data:
// a text token's text, a tag token's tag name, etc.
data span
// pendingAttr is the attribute key and value currently being tokenized.
// When complete, pendingAttr is pushed onto attr. nAttrReturned is
// incremented on each call to TagAttr.
pendingAttr [2]span
attr [][2]span
nAttrReturned int
} }
// Error returns the error associated with the most recent ErrorToken token. // Error returns the error associated with the most recent ErrorToken token.
@ -140,33 +155,42 @@ func (z *Tokenizer) Error() os.Error {
return z.err return z.err
} }
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
return z.buf[z.p0:z.p1]
}
// readByte returns the next byte from the input stream, doing a buffered read // readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.p0:z.p1] remains a contiguous byte // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token. // slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error. // It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil. // Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte { func (z *Tokenizer) readByte() byte {
if z.p1 >= len(z.buf) { if z.raw.end >= len(z.buf) {
// Our buffer is exhausted and we have to read from z.r. // Our buffer is exhausted and we have to read from z.r.
// We copy z.buf[z.p0:z.p1] to the beginning of z.buf. If the length // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
// z.p1 - z.p0 is more than half the capacity of z.buf, then we // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
// allocate a new buffer before the copy. // allocate a new buffer before the copy.
c := cap(z.buf) c := cap(z.buf)
d := z.p1 - z.p0 d := z.raw.end - z.raw.start
var buf1 []byte var buf1 []byte
if 2*d > c { if 2*d > c {
buf1 = make([]byte, d, 2*c) buf1 = make([]byte, d, 2*c)
} else { } else {
buf1 = z.buf[:d] buf1 = z.buf[:d]
} }
copy(buf1, z.buf[z.p0:z.p1]) copy(buf1, z.buf[z.raw.start:z.raw.end])
z.p0, z.p1, z.buf = 0, d, buf1[:d] if x := z.raw.start; x != 0 {
// Adjust the data/attr spans to refer to the same contents after the copy.
z.data.start -= x
z.data.end -= x
z.pendingAttr[0].start -= x
z.pendingAttr[0].end -= x
z.pendingAttr[1].start -= x
z.pendingAttr[1].end -= x
for i := range z.attr {
z.attr[i][0].start -= x
z.attr[i][0].end -= x
z.attr[i][1].start -= x
z.attr[i][1].end -= x
}
}
z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
// Now that we have copied the live bytes to the start of the buffer, // Now that we have copied the live bytes to the start of the buffer,
// we read from z.r into the remainder. // we read from z.r into the remainder.
n, err := z.r.Read(buf1[d:cap(buf1)]) n, err := z.r.Read(buf1[d:cap(buf1)])
@ -176,40 +200,44 @@ func (z *Tokenizer) readByte() byte {
} }
z.buf = buf1[:d+n] z.buf = buf1[:d+n]
} }
x := z.buf[z.p1] x := z.buf[z.raw.end]
z.p1++ z.raw.end++
return x return x
} }
// readTo keeps reading bytes until x is found or a read error occurs. If an func (z *Tokenizer) savePendingAttr() {
// error does occur, z.err is set to that error. if z.pendingAttr[0].start != z.pendingAttr[0].end {
// Pre-condition: z.err == nil. z.attr = append(z.attr, z.pendingAttr)
func (z *Tokenizer) readTo(x uint8) { }
}
// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
return return
} }
switch c { switch c {
case x: case ' ', '\n', '\r', '\t', '\f':
// No-op.
default:
z.raw.end--
return return
case '\\':
z.readByte()
if z.err != nil {
return
}
} }
} }
} }
// nextComment reads the next token starting with "<!--". // nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed. // The opening "<!--" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 4 <= z.p1. // Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() { func (z *Tokenizer) nextComment() {
// <!--> is a valid comment. // <!--> is a valid comment.
for dashCount := 2; ; { for dashCount := 2; ; {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw
return return
} }
switch c { switch c {
@ -218,6 +246,9 @@ func (z *Tokenizer) nextComment() {
case '>': case '>':
if dashCount >= 2 { if dashCount >= 2 {
z.tt = CommentToken z.tt = CommentToken
// TODO: adjust z.data to be only the "x" in "<!--x-->".
// Note that "<!>" is also a valid HTML5 comment.
z.data = z.raw
return return
} }
dashCount = 0 dashCount = 0
@ -230,7 +261,8 @@ func (z *Tokenizer) nextComment() {
// nextMarkupDeclaration reads the next token starting with "<!". // nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text". // It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed. // The opening "<!" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 2 <= z.p1. // Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() { func (z *Tokenizer) nextMarkupDeclaration() {
var c [2]byte var c [2]byte
for i := 0; i < 2; i++ { for i := 0; i < 2; i++ {
@ -243,11 +275,12 @@ func (z *Tokenizer) nextMarkupDeclaration() {
z.nextComment() z.nextComment()
return return
} }
z.p1 -= 2 z.raw.end -= 2
const s = "DOCTYPE " const s = "DOCTYPE "
for i := 0; ; i++ { for i := 0; ; i++ {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw
return return
} }
// Capitalize c. // Capitalize c.
@ -261,6 +294,8 @@ func (z *Tokenizer) nextMarkupDeclaration() {
if c == '>' { if c == '>' {
if i >= len(s) { if i >= len(s) {
z.tt = DoctypeToken z.tt = DoctypeToken
z.data.start = z.raw.start + len("<!DOCTYPE ")
z.data.end = z.raw.end - len(">")
} }
return return
} }
@ -270,18 +305,23 @@ func (z *Tokenizer) nextMarkupDeclaration() {
// nextTag reads the next token starting with "<". It might be a "<startTag>", // nextTag reads the next token starting with "<". It might be a "<startTag>",
// an "</endTag>", a "<!markup declaration>", or "<malformed text". // an "</endTag>", a "<!markup declaration>", or "<malformed text".
// The opening "<" has already been consumed. // The opening "<" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1. // Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextTag() { func (z *Tokenizer) nextTag() {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw
return return
} }
switch { switch {
// TODO: check that the "</" is followed by something in A-Za-z.
case c == '/': case c == '/':
z.tt = EndTagToken z.tt = EndTagToken
z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first. // Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
z.tt = StartTagToken z.tt = StartTagToken
z.data.start += len("<")
case c == '!': case c == '!':
z.nextMarkupDeclaration() z.nextMarkupDeclaration()
return return
@ -292,36 +332,149 @@ func (z *Tokenizer) nextTag() {
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
return return
} }
for { // Read the tag name, and attribute key/value pairs.
c := z.readByte() if z.readTagName() {
if z.err != nil { for z.readTagAttrKey() && z.readTagAttrVal() {
return z.savePendingAttr()
}
switch c {
case '"', '\'':
z.readTo(c)
if z.err != nil {
return
}
case '>':
if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
z.tt = SelfClosingTagToken
}
return
} }
} }
// If we didn't get a final ">", assume that it's a text token.
// TODO: this isn't right: html5lib treats "<p x=1" as a tag with one attribute.
if z.err != nil {
z.tt = TextToken
z.data = z.raw
z.attr = z.attr[:0]
return
}
// Check for a self-closing token.
if z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
z.tt = SelfClosingTagToken
}
}
// readTagName sets z.data to the "p" in "<p a=1>" and returns whether the tag
// may have attributes.
func (z *Tokenizer) readTagName() (more bool) {
for {
c := z.readByte()
if z.err != nil {
return false
}
switch c {
case ' ', '\n', '\t', '\f', '/':
z.data.end = z.raw.end - 1
return true
case '>':
// We cannot have a self-closing token, since the case above catches
// the "/" in "<p/>".
z.data.end = z.raw.end - len(">")
return false
}
}
panic("unreachable")
}
// readTagAttrKey sets z.pendingAttr[0] to the "a" in "<p a=1>" and returns
// whether the tag may have an attribute value.
func (z *Tokenizer) readTagAttrKey() (more bool) {
if z.skipWhiteSpace(); z.err != nil {
return false
}
z.pendingAttr[0].start = z.raw.end
z.pendingAttr[0].end = z.raw.end
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
for {
c := z.readByte()
if z.err != nil {
return false
}
switch c {
case ' ', '\n', '\r', '\t', '\f', '/':
z.pendingAttr[0].end = z.raw.end - 1
return true
case '=':
z.raw.end--
z.pendingAttr[0].end = z.raw.end
return true
case '>':
z.pendingAttr[0].end = z.raw.end - 1
z.savePendingAttr()
return false
}
}
panic("unreachable")
}
// readTagAttrVal sets z.pendingAttr[1] to the "1" in "<p a=1>" and returns
// whether the tag may have more attributes.
func (z *Tokenizer) readTagAttrVal() (more bool) {
if z.skipWhiteSpace(); z.err != nil {
return false
}
for {
c := z.readByte()
if z.err != nil {
return false
}
if c == '=' {
break
}
z.raw.end--
return true
}
if z.skipWhiteSpace(); z.err != nil {
return false
}
const delimAnyWhiteSpace = 1
loop:
for delim := byte(0); ; {
c := z.readByte()
if z.err != nil {
return false
}
if delim == 0 {
switch c {
case '\'', '"':
delim = c
default:
delim = delimAnyWhiteSpace
z.raw.end--
}
z.pendingAttr[1].start = z.raw.end
continue
}
switch c {
case '/', '>':
z.raw.end--
z.pendingAttr[1].end = z.raw.end
break loop
case ' ', '\n', '\r', '\t', '\f':
if delim != delimAnyWhiteSpace {
continue
}
fallthrough
case delim:
z.pendingAttr[1].end = z.raw.end - 1
break loop
}
}
return true
} }
// nextText reads all text up until an '<'. // nextText reads all text up until an '<'.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1. // Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() { func (z *Tokenizer) nextText() {
for { for {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw
return return
} }
if c == '<' { if c == '<' {
z.p1-- z.raw.end--
z.data = z.raw
return return
} }
} }
@ -334,7 +487,12 @@ func (z *Tokenizer) Next() TokenType {
z.tt = ErrorToken z.tt = ErrorToken
return z.tt return z.tt
} }
z.p0 = z.p1 z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
z.attr = z.attr[:0]
z.nAttrReturned = 0
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.tt = ErrorToken z.tt = ErrorToken
@ -355,118 +513,21 @@ func (z *Tokenizer) Next() TokenType {
panic("unreachable") panic("unreachable")
} }
// trim returns the largest j such that z.buf[i:j] contains only white space, // Raw returns the unmodified text of the current token. Calling Next, Token,
// or only white space plus the final ">" or "/>" of the raw data. // Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) trim(i int) int { func (z *Tokenizer) Raw() []byte {
k := z.p1 return z.buf[z.raw.start:z.raw.end]
for ; i < k; i++ {
switch z.buf[i] {
case ' ', '\n', '\t', '\f':
continue
case '>':
if i == k-1 {
return k
}
case '/':
if i == k-2 {
return k
}
}
return i
}
return k
}
// tagName finds the tag name at the start of z.buf[i:] and returns that name
// lower-cased, as well as the trimmed cursor location afterwards.
func (z *Tokenizer) tagName(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
switch c {
case ' ', '\n', '\t', '\f', '/', '>':
break loop
}
if 'A' <= c && c <= 'Z' {
z.buf[i] = c + 'a' - 'A'
}
}
return z.buf[i0:i], z.trim(i)
}
// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
// and returns that value, as well as the trimmed cursor location afterwards.
func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
switch z.buf[i] {
case ' ', '\n', '\t', '\f', '>':
break loop
case '&':
// TODO: unescape the entity.
}
}
return z.buf[i0:i], z.trim(i)
}
// attrName finds the largest attribute name at the start
// of z.buf[i:] and returns it lower-cased, as well
// as the trimmed cursor location after that name.
//
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
// TODO: unicode characters
func (z *Tokenizer) attrName(i int) ([]byte, int) {
for z.buf[i] == '/' {
i++
if z.buf[i] == '>' {
return nil, z.trim(i)
}
}
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
switch c {
case '>', '/', '=':
break loop
}
switch {
case 'A' <= c && c <= 'Z':
z.buf[i] = c + 'a' - 'A'
case c > ' ' && c < 0x7f:
// No-op.
default:
break loop
}
}
return z.buf[i0:i], z.trim(i)
} }
// Text returns the unescaped text of a text, comment or doctype token. The // Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next. // contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte { func (z *Tokenizer) Text() []byte {
var i0, i1 int
switch z.tt { switch z.tt {
case TextToken: case TextToken, CommentToken, DoctypeToken:
i0 = z.p0 s := z.buf[z.data.start:z.data.end]
i1 = z.p1 z.data.start = z.raw.end
case CommentToken: z.data.end = z.raw.end
// Trim the "<!--" from the left and the "-->" from the right. return unescape(s)
// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
i0 = z.p0 + 4
i1 = z.p1 - 3
case DoctypeToken:
// Trim the "<!DOCTYPE " from the left and the ">" from the right.
i0 = z.p0 + 10
i1 = z.p1 - 1
default:
return nil
}
z.p0 = z.p1
if i0 < i1 {
return unescape(z.buf[i0:i1])
} }
return nil return nil
} }
@ -475,73 +536,31 @@ func (z *Tokenizer) Text() []byte {
// `<IMG SRC="foo">`) and whether the tag has attributes. // `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next. // The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
i := z.p0 + 1 switch z.tt {
if i >= z.p1 { case StartTagToken, EndTagToken, SelfClosingTagToken:
z.p0 = z.p1 s := z.buf[z.data.start:z.data.end]
return nil, false z.data.start = z.raw.end
z.data.end = z.raw.end
return lower(s), z.nAttrReturned < len(z.attr)
} }
if z.buf[i] == '/' { return nil, false
i++
}
name, z.p0 = z.tagName(i)
hasAttr = z.p0 != z.p1
return
} }
// TagAttr returns the lower-cased key and unescaped value of the next unparsed // TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes. // attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next. // The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
key, i := z.attrName(z.p0) if z.nAttrReturned < len(z.attr) {
// Check for an empty attribute value. switch z.tt {
if i == z.p1 { case StartTagToken, EndTagToken, SelfClosingTagToken:
z.p0 = i x := z.attr[z.nAttrReturned]
return z.nAttrReturned++
} key = z.buf[x[0].start:x[0].end]
// Get past the equals and quote characters. val = z.buf[x[1].start:x[1].end]
if z.buf[i] != '=' { return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
z.p0, moreAttr = i, true
return
}
i = z.trim(i + 1)
if i == z.p1 {
z.p0 = i
return
}
closeQuote := z.buf[i]
if closeQuote != '\'' && closeQuote != '"' {
val, z.p0 = z.unquotedAttrVal(i)
moreAttr = z.p0 != z.p1
return
}
i = z.trim(i + 1)
// Copy and unescape everything up to the closing quote.
dst, src := i, i
loop:
for src < z.p1 {
c := z.buf[src]
switch c {
case closeQuote:
src++
break loop
case '&':
dst, src = unescapeEntity(z.buf, dst, src, true)
case '\\':
if src == z.p1 {
z.buf[dst] = '\\'
dst++
} else {
z.buf[dst] = z.buf[src+1]
dst, src = dst+1, src+2
}
default:
z.buf[dst] = c
dst, src = dst+1, src+1
} }
} }
val, z.p0 = z.buf[i:dst], z.trim(src) return nil, nil, false
moreAttr = z.p0 != z.p1
return
} }
// Token returns the next Token. The result's Data and Attr values remain valid // Token returns the next Token. The result's Data and Attr values remain valid

View File

@ -52,16 +52,19 @@ var tokenTests = []tokenTest{
`<p </p>`, `<p </p>`,
`<p <="" p="">`, `<p <="" p="">`,
}, },
{ /*
"malformed tag #2", // TODO: re-enable these tests when they work. This input/output matches html5lib's behavior.
`<p id=0</p>`, {
`<p id="0&lt;/p">`, "malformed tag #2",
}, `<p id=0</p>`,
{ `<p id="0&lt;/p">`,
"malformed tag #3", },
`<p id="0</p>`, {
`<p id="0&lt;/p&gt;">`, "malformed tag #3",
}, `<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
*/
{ {
"malformed tag #4", "malformed tag #4",
`<p id="0"</p>`, `<p id="0"</p>`,
@ -117,7 +120,7 @@ var tokenTests = []tokenTest{
{ {
"backslash", "backslash",
`<p id="a\"b">`, `<p id="a\"b">`,
`<p id="a&quot;b">`, `<p id="a\" b"="">`,
}, },
// Entities, tag name and attribute key lower-casing, and whitespace // Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag. // normalization within a tag.
@ -133,11 +136,14 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`, `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
}, },
{ /*
"entity without semicolon", // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`, {
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`, "entity without semicolon",
}, `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
*/
{ {
"entity with digits", "entity with digits",
"&frac12;", "&frac12;",
@ -190,6 +196,16 @@ var tokenTests = []tokenTest{
`<meta http-equiv="content-type">`, `<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`, `<meta http-equiv="content-type">`,
}, },
{
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
},
{
"Attributes with a solitary single quote",
"<p id=can't><p id=won't>",
"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
},
} }
func TestTokenizer(t *testing.T) { func TestTokenizer(t *testing.T) {