From b770c9e9a2d2b19af317924f015467626be7b5a4 Mon Sep 17 00:00:00 2001 From: Andrew Balholm Date: Sat, 15 Oct 2011 12:22:08 +1100 Subject: [PATCH] html: improve parsing of comments and "bogus comments" R=nigeltao CC=golang-dev https://golang.org/cl/5279044 --- src/pkg/html/token.go | 111 +++++++++++++++++++++++++++---------- src/pkg/html/token_test.go | 56 ++++++++++++++++--- 2 files changed, 129 insertions(+), 38 deletions(-) diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index dcece8cacb4..a02b968dc28 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -100,9 +100,9 @@ func (t Token) String() string { case SelfClosingTagToken: return "<" + t.tagString() + "/>" case CommentToken: - return "" + return "" case DoctypeToken: - return "" + return "" } return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" } @@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() { // nextComment reads the next token starting with " is a valid comment. + z.data.start = z.raw.end + defer func() { + if z.data.end < z.data.start { + // It's a comment with no data, like . + z.data.end = z.data.start + } + }() for dashCount := 2; ; { c := z.readByte() if z.err != nil { - z.data = z.raw + z.data.end = z.raw.end return } switch c { case '-': dashCount++ + continue case '>': if dashCount >= 2 { - z.tt = CommentToken - // TODO: adjust z.data to be only the "x" in "". - // Note that "" is also a valid HTML5 comment. - z.data = z.raw + z.data.end = z.raw.end - len("-->") return } - dashCount = 0 - default: - dashCount = 0 + case '!': + if dashCount >= 2 { + c = z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len("--!>") + return + } + } + } + dashCount = 0 + } +} + +// nextBogusComment reads text until the next ">" and treats it as a comment. +// Pre-condition: z.err == nil && z.raw.end is before the first comment byte. +func (z *Tokenizer) nextBogusComment() { + z.tt = CommentToken + z.data.start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len(">") + return } } } @@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() { // nextMarkupDeclaration reads the next token starting with "", a "", or "' { - if i >= len(s) { - z.tt = DoctypeToken - z.data.start = z.raw.start + len("") - } + z.data.end = z.raw.end - len(">") return } } @@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() { return } switch { - // TODO: check that the "`, `

`, }, + // DOCTYPE tests. + { + "Proper DOCTYPE", + "", + "", + }, + { + "DOCTYPE with no space", + "", + "", + }, + { + "DOCTYPE with two spaces", + "", + "", + }, + { + "looks like DOCTYPE but isn't", + "", + "", + }, + { + "DOCTYPE at EOF", + "", + }, + // XML processing instructions. + { + "XML processing instruction", + "", + "", + }, // Comments. { "comment0", "abcdef", - "abc$$$def", + "abc$$$$def", }, { "comment1", "az", - "a$z", + "a$$z", }, { "comment2", "az", - "a$z", + "a$$z", }, { "comment3", "az", - "a$z", + "a$$z", }, { "comment4", "az", - "a$z", + "a$$z", }, { "comment5", "az", - "a$<!>z", + "a$$z", }, { "comment6", "az", - "a$<!->z", + "a$$z", }, { "comment7", "a", }, { "comment8", "a", + }, + { + "comment9", + "az", + "a$$z", }, // An attribute with a backslash. { @@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) { loop: for _, tt := range tokenTests { z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) + z.ReturnComments = true for i, s := range strings.Split(tt.golden, "$") { if z.Next() == ErrorToken { t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())