From 37afff2978f3bb12170076192676e9348a7936fc Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Wed, 10 Aug 2011 13:39:07 +1000 Subject: [PATCH] html: parse malformed tags missing a '>', such as `

`. The additional token_test.go cases matches html5lib behavior. Fixes #2124. R=gri CC=golang-dev https://golang.org/cl/4844055 --- src/pkg/html/token.go | 58 +++++++++++++++++++++++--------------- src/pkg/html/token_test.go | 17 ++++++++++- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index d280ff22560..fddc922d601 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() { if z.err != nil { return } - var tt TokenType switch { case c == '/': - tt = EndTagToken + z.tt = EndTagToken // Lower-cased characters are more common in tag names, so we check for them first. case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': - tt = StartTagToken + z.tt = StartTagToken case c == '!': z.nextMarkupDeclaration() return @@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() { return } case '>': - z.tt = tt - if z.buf[z.p1-2] == '/' && tt == StartTagToken { + if z.buf[z.p1-2] == '/' && z.tt == StartTagToken { z.tt = SelfClosingTagToken } return @@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int { return k } -// word finds the largest alphabetic [0-9A-Za-z]* word at the start -// of z.buf[i:] and returns that word (optionally lower-cased), as -// well as the trimmed cursor location after that word. -func (z *Tokenizer) word(i int, lower bool) ([]byte, int) { +// tagName finds the tag name at the start of z.buf[i:] and returns that name +// lower-cased, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) tagName(i int) ([]byte, int) { i0 := i loop: for ; i < z.p1; i++ { c := z.buf[i] - switch { - case '0' <= c && c <= '9': - // No-op. - case 'A' <= c && c <= 'Z': - if lower { - z.buf[i] = c + 'a' - 'A' - } - case 'a' <= c && c <= 'z': - // No-op. - default: + switch c { + case ' ', '\n', '\t', '\f', '/', '>': break loop } + if 'A' <= c && c <= 'Z' { + z.buf[i] = c + 'a' - 'A' + } + } + return z.buf[i0:i], z.trim(i) +} + +// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:] +// and returns that value, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + switch z.buf[i] { + case ' ', '\n', '\t', '\f', '>': + break loop + case '&': + // TODO: unescape the entity. + } } return z.buf[i0:i], z.trim(i) } // attrName finds the largest attribute name at the start // of z.buf[i:] and returns it lower-cased, as well -// as the trimmed cursor location after that word. +// as the trimmed cursor location after that name. // // http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name // TODO: unicode characters func (z *Tokenizer) attrName(i int) ([]byte, int) { + for z.buf[i] == '/' { + i++ + if z.buf[i] == '>' { + return nil, z.trim(i) + } + } i0 := i loop: for ; i < z.p1; i++ { @@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { if z.buf[i] == '/' { i++ } - name, z.p0 = z.word(i, true) + name, z.p0 = z.tagName(i) hasAttr = z.p0 != z.p1 return } @@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { } closeQuote := z.buf[i] if closeQuote != '\'' && closeQuote != '"' { - val, z.p0 = z.word(i, false) + val, z.p0 = z.unquotedAttrVal(i) moreAttr = z.p0 != z.p1 return } diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index c8dcc88648d..1330f3247a1 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -41,6 +41,22 @@ var tokenTests = []tokenTest{ "bd", "$b$$d$", }, + // Some malformed tags that are missing a '>'. + { + "malformed tag #0", + ``, + ``, + }, + { + "malformed tag #1", + `

`, + `

`, + }, + { + "malformed tag #2", + `

`, + }, // Comments. { "comment0", @@ -117,7 +133,6 @@ var tokenTests = []tokenTest{ "½", "½", }, - // Attribute tests: // http://dev.w3.org/html5/spec/Overview.html#attributes-0 {