From e5f3dc8bc54942db96f55b1b6207edfe69ca4021 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Tue, 18 Oct 2011 09:42:16 +1100 Subject: [PATCH] html: refactor the tokenizer; parse "" correctly. Previously, Next would call either nextText or nextTag, but nextTag could also call nextText. Both nextText and nextTag were responsible for detecting "" and treats it as a comment. -// Pre-condition: z.err == nil && z.raw.end is before the first comment byte. -func (z *Tokenizer) nextBogusComment() { - z.tt = CommentToken +// readUntilCloseAngle reads until the next ">". +func (z *Tokenizer) readUntilCloseAngle() { z.data.start = z.raw.end for { c := z.readByte() @@ -287,24 +283,22 @@ func (z *Tokenizer) nextBogusComment() { } } -// nextMarkupDeclaration reads the next token starting with "", a "", or "", a "", or "' { - z.data.end = z.raw.end - len(">") - return - } + return DoctypeToken } + z.readUntilCloseAngle() + return DoctypeToken } -// nextTag reads the next token starting with "<". It might be a "", -// an "", a "", or "' { + return + } } } -// readTagName sets z.data to the "p" in "

". +// readTagName sets z.data to the "div" in "

". The reader (z.raw.end) +// is positioned such that the first byte of the tag name (the "d" in "". +// readTagAttrKey sets z.pendingAttr[0] to the "k" in "
". // Precondition: z.err == nil. func (z *Tokenizer) readTagAttrKey() { z.pendingAttr[0].start = z.raw.end @@ -452,7 +415,7 @@ func (z *Tokenizer) readTagAttrKey() { } } -// readTagAttrVal sets z.pendingAttr[1] to the "v" in "

". +// readTagAttrVal sets z.pendingAttr[1] to the "v" in "

". func (z *Tokenizer) readTagAttrVal() { z.pendingAttr[1].start = z.raw.end z.pendingAttr[1].end = z.raw.end @@ -514,69 +477,100 @@ func (z *Tokenizer) readTagAttrVal() { } } -// nextText reads all text up until a start tag "", + // "" and "". + tokenType = CommentToken + default: continue } - c = z.readByte() - if z.err != nil { - break + + // We have a non-text token, but we might have accumulated some text + // before that. If so, we return the text first, and return the non- + // text token on the subsequent call to Next. + if x := z.raw.end - len("' { + // "" does not generate a token at all. + // Reset the tokenizer state and start again. + z.raw.start = z.raw.end + z.data.start = z.raw.end + z.data.end = z.raw.end + continue loop + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + z.readEndTag() + return EndTagToken + } + z.raw.end-- + z.readUntilCloseAngle() + return CommentToken + case CommentToken: + if c == '!' { + return z.readMarkupDeclaration() + } + z.raw.end-- + z.readUntilCloseAngle() + return CommentToken } } - z.data = z.raw + if z.raw.start < z.raw.end { + z.data.end = z.raw.end + return TextToken + } + return ErrorToken } // Next scans the next token and returns its type. func (z *Tokenizer) Next() TokenType { for { - if z.err != nil { - z.tt = ErrorToken - return z.tt - } - z.raw.start = z.raw.end - z.data.start = z.raw.end - z.data.end = z.raw.end - z.attr = z.attr[:0] - z.nAttrReturned = 0 - - c := z.readByte() - if z.err != nil { - z.tt = ErrorToken - return z.tt - } - // We assume that the next token is text unless proven otherwise. - z.tt = TextToken - if c != '<' { - z.nextText() - } else { - z.nextTag() - if z.tt == CommentToken && !z.ReturnComments { - continue - } + z.tt = z.next() + // TODO: remove the ReturnComments option. A tokenizer should + // always return comment tags. + if z.tt == CommentToken && !z.ReturnComments { + continue } return z.tt } @@ -606,12 +600,14 @@ func (z *Tokenizer) Text() []byte { // ``) and whether the tag has attributes. // The contents of the returned slice may change on the next call to Next. func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { - switch z.tt { - case StartTagToken, EndTagToken, SelfClosingTagToken: - s := z.buf[z.data.start:z.data.end] - z.data.start = z.raw.end - z.data.end = z.raw.end - return lower(s), z.nAttrReturned < len(z.attr) + if z.data.start < z.data.end { + switch z.tt { + case StartTagToken, EndTagToken, SelfClosingTagToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + return lower(s), z.nAttrReturned < len(z.attr) + } } return nil, false } @@ -622,7 +618,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { if z.nAttrReturned < len(z.attr) { switch z.tt { - case StartTagToken, EndTagToken, SelfClosingTagToken: + case StartTagToken, SelfClosingTagToken: x := z.attr[z.nAttrReturned] z.nAttrReturned++ key = z.buf[x[0].start:x[0].end] @@ -640,7 +636,7 @@ func (z *Tokenizer) Token() Token { switch z.tt { case TextToken, CommentToken, DoctypeToken: t.Data = string(z.Text()) - case StartTagToken, EndTagToken, SelfClosingTagToken: + case StartTagToken, SelfClosingTagToken: var attr []Attribute name, moreAttr := z.TagName() for moreAttr { @@ -650,6 +646,9 @@ func (z *Tokenizer) Token() Token { } t.Data = string(name) t.Attr = attr + case EndTagToken: + name, _ := z.TagName() + t.Data = string(name) } return t } diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index 09bb75be15..2bd87e9129 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -57,19 +57,16 @@ var tokenTests = []tokenTest{ "", - "", - }, - { - "not a tag #3", - "ab", - "a$b", - }, - */ + { + "not a tag #2", + "", + "", + }, + { + "not a tag #3", + "ab", + "a$b", + }, { "not a tag #4", "", @@ -77,21 +74,31 @@ var tokenTests = []tokenTest{ }, { "not a tag #5", + "", + }, + { + "not a tag #6", + "", + "", + }, + { + "not a tag #7", "a < b", "a < b", }, { - "not a tag #6", + "not a tag #8", "<.>", "<.>", }, { - "not a tag #7", + "not a tag #9", "a<<>>c", "a<<$$>>c", }, { - "not a tag #8", + "not a tag #10", "if x<0 and y < 0 then x*y>0", "if x<0 and y < 0 then x*y>0", }, @@ -345,7 +352,7 @@ var tokenTests = []tokenTest{ func TestTokenizer(t *testing.T) { loop: for _, tt := range tokenTests { - z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) + z := NewTokenizer(strings.NewReader(tt.html)) z.ReturnComments = true if tt.golden != "" { for i, s := range strings.Split(tt.golden, "$") {