1
0
mirror of https://github.com/golang/go synced 2024-09-24 21:10:12 -06:00

html: rewrite the tokenizer to be more consistent.

Previously, the tokenizer made two passes per token. The first pass
established the token boundary. The second pass picked out the tag name
and attributes inside that boundary. This was problematic when the two
passes disagreed. For example, "<p id=can't><p id=won't>" caused an
infinite loop because the first pass skipped everything inside the
single quotes, and recognized only one token, but the second pass never
got past the first '>'.

This change rewrites the tokenizer to use one pass, accumulating the
boundary points of token text, tag names, attribute keys and attribute
values as it looks for the token endpoint.

It should still be reasonably efficient: text, names, keys and values
are not lower-cased or unescaped (and converted from []byte to string)
until asked for.

One of the token_test test cases was fixed to be consistent with
html5lib. Three more test cases were temporarily disabled, and will be
re-enabled in a follow-up CL. All the parse_test test cases pass.

R=andybalholm, gri
CC=golang-dev
https://golang.org/cl/5244061
This commit is contained in:
Nigel Tao 2011-10-14 09:58:39 +11:00
parent 6198336bb5
commit a49b8b9875
3 changed files with 278 additions and 233 deletions

View File

@ -183,6 +183,16 @@ func unescape(b []byte) []byte {
return b
}
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
func lower(b []byte) []byte {
for i, c := range b {
if 'A' <= c && c <= 'Z' {
b[i] = c + 'a' - 'A'
}
}
return b
}
const escapedChars = `&'<>"`
func escape(w writer, s string) os.Error {

View File

@ -107,6 +107,12 @@ func (t Token) String() string {
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
start, end int
}
// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
// If ReturnComments is set, Next returns comment tokens;
@ -115,7 +121,7 @@ type Tokenizer struct {
// r is the source of the HTML text.
r io.Reader
// tt is the TokenType of the most recently read token.
// tt is the TokenType of the current token.
tt TokenType
// err is the first error encountered during tokenization. It is possible
// for tt != Error && err != nil to hold: this means that Next returned a
@ -125,10 +131,19 @@ type Tokenizer struct {
// subsequent Next calls would return an ErrorToken.
// err is never reset. Once it becomes non-nil, it stays non-nil.
err os.Error
// buf[p0:p1] holds the raw data of the most recent token.
// buf[p1:] is buffered input that will yield future tokens.
p0, p1 int
buf []byte
// buf[raw.start:raw.end] holds the raw bytes of the current token.
// buf[raw.end:] is buffered input that will yield future tokens.
raw span
buf []byte
// buf[data.start:data.end] holds the raw bytes of the current token's data:
// a text token's text, a tag token's tag name, etc.
data span
// pendingAttr is the attribute key and value currently being tokenized.
// When complete, pendingAttr is pushed onto attr. nAttrReturned is
// incremented on each call to TagAttr.
pendingAttr [2]span
attr [][2]span
nAttrReturned int
}
// Error returns the error associated with the most recent ErrorToken token.
@ -140,33 +155,42 @@ func (z *Tokenizer) Error() os.Error {
return z.err
}
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
return z.buf[z.p0:z.p1]
}
// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.p0:z.p1] remains a contiguous byte
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
if z.p1 >= len(z.buf) {
if z.raw.end >= len(z.buf) {
// Our buffer is exhausted and we have to read from z.r.
// We copy z.buf[z.p0:z.p1] to the beginning of z.buf. If the length
// z.p1 - z.p0 is more than half the capacity of z.buf, then we
// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
// allocate a new buffer before the copy.
c := cap(z.buf)
d := z.p1 - z.p0
d := z.raw.end - z.raw.start
var buf1 []byte
if 2*d > c {
buf1 = make([]byte, d, 2*c)
} else {
buf1 = z.buf[:d]
}
copy(buf1, z.buf[z.p0:z.p1])
z.p0, z.p1, z.buf = 0, d, buf1[:d]
copy(buf1, z.buf[z.raw.start:z.raw.end])
if x := z.raw.start; x != 0 {
// Adjust the data/attr spans to refer to the same contents after the copy.
z.data.start -= x
z.data.end -= x
z.pendingAttr[0].start -= x
z.pendingAttr[0].end -= x
z.pendingAttr[1].start -= x
z.pendingAttr[1].end -= x
for i := range z.attr {
z.attr[i][0].start -= x
z.attr[i][0].end -= x
z.attr[i][1].start -= x
z.attr[i][1].end -= x
}
}
z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
// Now that we have copied the live bytes to the start of the buffer,
// we read from z.r into the remainder.
n, err := z.r.Read(buf1[d:cap(buf1)])
@ -176,40 +200,44 @@ func (z *Tokenizer) readByte() byte {
}
z.buf = buf1[:d+n]
}
x := z.buf[z.p1]
z.p1++
x := z.buf[z.raw.end]
z.raw.end++
return x
}
// readTo keeps reading bytes until x is found or a read error occurs. If an
// error does occur, z.err is set to that error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readTo(x uint8) {
func (z *Tokenizer) savePendingAttr() {
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
}
// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
for {
c := z.readByte()
if z.err != nil {
return
}
switch c {
case x:
case ' ', '\n', '\r', '\t', '\f':
// No-op.
default:
z.raw.end--
return
case '\\':
z.readByte()
if z.err != nil {
return
}
}
}
}
// nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 4 <= z.p1.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() {
// <!--> is a valid comment.
for dashCount := 2; ; {
c := z.readByte()
if z.err != nil {
z.data = z.raw
return
}
switch c {
@ -218,6 +246,9 @@ func (z *Tokenizer) nextComment() {
case '>':
if dashCount >= 2 {
z.tt = CommentToken
// TODO: adjust z.data to be only the "x" in "<!--x-->".
// Note that "<!>" is also a valid HTML5 comment.
z.data = z.raw
return
}
dashCount = 0
@ -230,7 +261,8 @@ func (z *Tokenizer) nextComment() {
// nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 2 <= z.p1.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() {
var c [2]byte
for i := 0; i < 2; i++ {
@ -243,11 +275,12 @@ func (z *Tokenizer) nextMarkupDeclaration() {
z.nextComment()
return
}
z.p1 -= 2
z.raw.end -= 2
const s = "DOCTYPE "
for i := 0; ; i++ {
c := z.readByte()
if z.err != nil {
z.data = z.raw
return
}
// Capitalize c.
@ -261,6 +294,8 @@ func (z *Tokenizer) nextMarkupDeclaration() {
if c == '>' {
if i >= len(s) {
z.tt = DoctypeToken
z.data.start = z.raw.start + len("<!DOCTYPE ")
z.data.end = z.raw.end - len(">")
}
return
}
@ -270,18 +305,23 @@ func (z *Tokenizer) nextMarkupDeclaration() {
// nextTag reads the next token starting with "<". It might be a "<startTag>",
// an "</endTag>", a "<!markup declaration>", or "<malformed text".
// The opening "<" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextTag() {
c := z.readByte()
if z.err != nil {
z.data = z.raw
return
}
switch {
// TODO: check that the "</" is followed by something in A-Za-z.
case c == '/':
z.tt = EndTagToken
z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
z.tt = StartTagToken
z.data.start += len("<")
case c == '!':
z.nextMarkupDeclaration()
return
@ -292,36 +332,149 @@ func (z *Tokenizer) nextTag() {
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
return
}
for {
c := z.readByte()
if z.err != nil {
return
}
switch c {
case '"', '\'':
z.readTo(c)
if z.err != nil {
return
}
case '>':
if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
z.tt = SelfClosingTagToken
}
return
// Read the tag name, and attribute key/value pairs.
if z.readTagName() {
for z.readTagAttrKey() && z.readTagAttrVal() {
z.savePendingAttr()
}
}
// If we didn't get a final ">", assume that it's a text token.
// TODO: this isn't right: html5lib treats "<p x=1" as a tag with one attribute.
if z.err != nil {
z.tt = TextToken
z.data = z.raw
z.attr = z.attr[:0]
return
}
// Check for a self-closing token.
if z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
z.tt = SelfClosingTagToken
}
}
// readTagName sets z.data to the "p" in "<p a=1>" and returns whether the tag
// may have attributes.
func (z *Tokenizer) readTagName() (more bool) {
for {
c := z.readByte()
if z.err != nil {
return false
}
switch c {
case ' ', '\n', '\t', '\f', '/':
z.data.end = z.raw.end - 1
return true
case '>':
// We cannot have a self-closing token, since the case above catches
// the "/" in "<p/>".
z.data.end = z.raw.end - len(">")
return false
}
}
panic("unreachable")
}
// readTagAttrKey sets z.pendingAttr[0] to the "a" in "<p a=1>" and returns
// whether the tag may have an attribute value.
func (z *Tokenizer) readTagAttrKey() (more bool) {
if z.skipWhiteSpace(); z.err != nil {
return false
}
z.pendingAttr[0].start = z.raw.end
z.pendingAttr[0].end = z.raw.end
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
for {
c := z.readByte()
if z.err != nil {
return false
}
switch c {
case ' ', '\n', '\r', '\t', '\f', '/':
z.pendingAttr[0].end = z.raw.end - 1
return true
case '=':
z.raw.end--
z.pendingAttr[0].end = z.raw.end
return true
case '>':
z.pendingAttr[0].end = z.raw.end - 1
z.savePendingAttr()
return false
}
}
panic("unreachable")
}
// readTagAttrVal sets z.pendingAttr[1] to the "1" in "<p a=1>" and returns
// whether the tag may have more attributes.
func (z *Tokenizer) readTagAttrVal() (more bool) {
if z.skipWhiteSpace(); z.err != nil {
return false
}
for {
c := z.readByte()
if z.err != nil {
return false
}
if c == '=' {
break
}
z.raw.end--
return true
}
if z.skipWhiteSpace(); z.err != nil {
return false
}
const delimAnyWhiteSpace = 1
loop:
for delim := byte(0); ; {
c := z.readByte()
if z.err != nil {
return false
}
if delim == 0 {
switch c {
case '\'', '"':
delim = c
default:
delim = delimAnyWhiteSpace
z.raw.end--
}
z.pendingAttr[1].start = z.raw.end
continue
}
switch c {
case '/', '>':
z.raw.end--
z.pendingAttr[1].end = z.raw.end
break loop
case ' ', '\n', '\r', '\t', '\f':
if delim != delimAnyWhiteSpace {
continue
}
fallthrough
case delim:
z.pendingAttr[1].end = z.raw.end - 1
break loop
}
}
return true
}
// nextText reads all text up until an '<'.
// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1.
// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() {
for {
c := z.readByte()
if z.err != nil {
z.data = z.raw
return
}
if c == '<' {
z.p1--
z.raw.end--
z.data = z.raw
return
}
}
@ -334,7 +487,12 @@ func (z *Tokenizer) Next() TokenType {
z.tt = ErrorToken
return z.tt
}
z.p0 = z.p1
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
z.attr = z.attr[:0]
z.nAttrReturned = 0
c := z.readByte()
if z.err != nil {
z.tt = ErrorToken
@ -355,118 +513,21 @@ func (z *Tokenizer) Next() TokenType {
panic("unreachable")
}
// trim returns the largest j such that z.buf[i:j] contains only white space,
// or only white space plus the final ">" or "/>" of the raw data.
func (z *Tokenizer) trim(i int) int {
k := z.p1
for ; i < k; i++ {
switch z.buf[i] {
case ' ', '\n', '\t', '\f':
continue
case '>':
if i == k-1 {
return k
}
case '/':
if i == k-2 {
return k
}
}
return i
}
return k
}
// tagName finds the tag name at the start of z.buf[i:] and returns that name
// lower-cased, as well as the trimmed cursor location afterwards.
func (z *Tokenizer) tagName(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
switch c {
case ' ', '\n', '\t', '\f', '/', '>':
break loop
}
if 'A' <= c && c <= 'Z' {
z.buf[i] = c + 'a' - 'A'
}
}
return z.buf[i0:i], z.trim(i)
}
// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
// and returns that value, as well as the trimmed cursor location afterwards.
func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
switch z.buf[i] {
case ' ', '\n', '\t', '\f', '>':
break loop
case '&':
// TODO: unescape the entity.
}
}
return z.buf[i0:i], z.trim(i)
}
// attrName finds the largest attribute name at the start
// of z.buf[i:] and returns it lower-cased, as well
// as the trimmed cursor location after that name.
//
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
// TODO: unicode characters
func (z *Tokenizer) attrName(i int) ([]byte, int) {
for z.buf[i] == '/' {
i++
if z.buf[i] == '>' {
return nil, z.trim(i)
}
}
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
switch c {
case '>', '/', '=':
break loop
}
switch {
case 'A' <= c && c <= 'Z':
z.buf[i] = c + 'a' - 'A'
case c > ' ' && c < 0x7f:
// No-op.
default:
break loop
}
}
return z.buf[i0:i], z.trim(i)
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
return z.buf[z.raw.start:z.raw.end]
}
// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
var i0, i1 int
switch z.tt {
case TextToken:
i0 = z.p0
i1 = z.p1
case CommentToken:
// Trim the "<!--" from the left and the "-->" from the right.
// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
i0 = z.p0 + 4
i1 = z.p1 - 3
case DoctypeToken:
// Trim the "<!DOCTYPE " from the left and the ">" from the right.
i0 = z.p0 + 10
i1 = z.p1 - 1
default:
return nil
}
z.p0 = z.p1
if i0 < i1 {
return unescape(z.buf[i0:i1])
case TextToken, CommentToken, DoctypeToken:
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
return unescape(s)
}
return nil
}
@ -475,73 +536,31 @@ func (z *Tokenizer) Text() []byte {
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
i := z.p0 + 1
if i >= z.p1 {
z.p0 = z.p1
return nil, false
switch z.tt {
case StartTagToken, EndTagToken, SelfClosingTagToken:
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
return lower(s), z.nAttrReturned < len(z.attr)
}
if z.buf[i] == '/' {
i++
}
name, z.p0 = z.tagName(i)
hasAttr = z.p0 != z.p1
return
return nil, false
}
// TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
key, i := z.attrName(z.p0)
// Check for an empty attribute value.
if i == z.p1 {
z.p0 = i
return
}
// Get past the equals and quote characters.
if z.buf[i] != '=' {
z.p0, moreAttr = i, true
return
}
i = z.trim(i + 1)
if i == z.p1 {
z.p0 = i
return
}
closeQuote := z.buf[i]
if closeQuote != '\'' && closeQuote != '"' {
val, z.p0 = z.unquotedAttrVal(i)
moreAttr = z.p0 != z.p1
return
}
i = z.trim(i + 1)
// Copy and unescape everything up to the closing quote.
dst, src := i, i
loop:
for src < z.p1 {
c := z.buf[src]
switch c {
case closeQuote:
src++
break loop
case '&':
dst, src = unescapeEntity(z.buf, dst, src, true)
case '\\':
if src == z.p1 {
z.buf[dst] = '\\'
dst++
} else {
z.buf[dst] = z.buf[src+1]
dst, src = dst+1, src+2
}
default:
z.buf[dst] = c
dst, src = dst+1, src+1
if z.nAttrReturned < len(z.attr) {
switch z.tt {
case StartTagToken, EndTagToken, SelfClosingTagToken:
x := z.attr[z.nAttrReturned]
z.nAttrReturned++
key = z.buf[x[0].start:x[0].end]
val = z.buf[x[1].start:x[1].end]
return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
}
}
val, z.p0 = z.buf[i:dst], z.trim(src)
moreAttr = z.p0 != z.p1
return
return nil, nil, false
}
// Token returns the next Token. The result's Data and Attr values remain valid

View File

@ -52,16 +52,19 @@ var tokenTests = []tokenTest{
`<p </p>`,
`<p <="" p="">`,
},
{
"malformed tag #2",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #3",
`<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
/*
// TODO: re-enable these tests when they work. This input/output matches html5lib's behavior.
{
"malformed tag #2",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #3",
`<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
*/
{
"malformed tag #4",
`<p id="0"</p>`,
@ -117,7 +120,7 @@ var tokenTests = []tokenTest{
{
"backslash",
`<p id="a\"b">`,
`<p id="a&quot;b">`,
`<p id="a\" b"="">`,
},
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
@ -133,11 +136,14 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
},
{
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
/*
// TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
{
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
*/
{
"entity with digits",
"&frac12;",
@ -190,6 +196,16 @@ var tokenTests = []tokenTest{
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
},
{
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
},
{
"Attributes with a solitary single quote",
"<p id=can't><p id=won't>",
"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
},
}
func TestTokenizer(t *testing.T) {