
html: tokenize HTML comments.

I'm not sure if it's 100% correct wrt the HTML5 specification,
but the test suite has plenty of HTML comment test cases, and
we'll shake out any tokenization bugs as the parser improves its
coverage.

R=gri
CC=golang-dev
https://golang.org/cl/4186055
commit a5ff8ad9db (parent 3a2d64789b)
Nigel Tao, 2011-02-17 10:45:30 +11:00
3 changed files with 152 additions and 54 deletions


@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}

+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
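
A minimal sketch of the pattern that new doc text describes, written against the package API as of this commit (pre-Go 1; the variable r stands for any io.Reader of HTML and is an assumption here, not part of the patch):

	z := html.NewTokenizer(r)
	z.ReturnComments = true
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // at end of input, z.Error() reports os.EOF
		}
		if tt == html.CommentToken {
			// Token().Data holds the comment text without "<!--" and "-->".
			fmt.Printf("comment: %s\n", z.Token().Data)
		}
	}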


@@ -25,6 +25,8 @@ const (
 	EndTagToken
 	// A SelfClosingTagToken tag looks like <br/>.
 	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
 )

 // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
 		return "EndTag"
 	case SelfClosingTagToken:
 		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
@@ -52,8 +56,8 @@ type Attribute struct {
 }

 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
 // than "a&lt;b").
 type Token struct {
 	Type TokenType
@@ -91,12 +95,18 @@ func (t Token) String() string {
 		return "</" + t.tagString() + ">"
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + EscapeString(t.Data) + "-->"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }

 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
+	// If ReturnComments is set, Next returns comment tokens;
+	// otherwise it skips over comments (default).
+	ReturnComments bool
+
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the most recently read token. If tt == Error
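
As a small, hypothetical illustration of the new CommentToken branch in Token.String (these two lines are not part of the patch): the comment data is re-escaped on output, so a comment round-trips as markup.

	t := html.Token{Type: html.CommentToken, Data: "a<b"}
	fmt.Println(t.String()) // prints <!--a&lt;b-->
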
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 	panic("unreachable")
 }

+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
+	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+	for i := 0; i < 2; i++ {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		if c != '-' {
+			return z.nextText(), nil
+		}
+	}
+	// <!--> is a valid comment.
+	for dashCount := 2; ; {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		switch c {
+		case '-':
+			dashCount++
+		case '>':
+			if dashCount >= 2 {
+				return CommentToken, nil
+			}
+			fallthrough
+		default:
+			dashCount = 0
+		}
+	}
+	panic("unreachable")
+}
+
 // nextTag returns the next TokenType starting from the tag open state.
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
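
The dash-counting rule above is compact but subtle: the counter starts at 2 because "<!--" has already been consumed, which is what makes "<!-->" and "<!--->" complete comments. The following self-contained sketch (a hypothetical helper, not part of the patch) isolates the same state machine and runs under a modern Go toolchain:

	package main

	import "fmt"

	// scanComment scans b, positioned just after "<!--", and reports where
	// the comment ends. It mirrors the patch's rule: '-' bumps a dash
	// counter, '>' closes the comment if at least two dashes precede it,
	// and anything else resets the counter. The counter starts at 2, so an
	// immediate '>' (i.e. "<!-->") is a valid, empty comment.
	func scanComment(b []byte) (end int, ok bool) {
		dashCount := 2
		for i, c := range b {
			switch c {
			case '-':
				dashCount++
			case '>':
				if dashCount >= 2 {
					return i + 1, true
				}
				dashCount = 0
			default:
				dashCount = 0
			}
		}
		return 0, false // unterminated comment, like the "comment8" test case
	}

	func main() {
		// Inputs are the bytes that follow "<!--" in the test cases below.
		for _, s := range []string{">", "->", "x>-->", "x->-->", "z"} {
			end, ok := scanComment([]byte(s))
			fmt.Printf("%-8q ok=%-5t end=%d\n", s, ok, end)
		}
	}
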
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 		tt = StartTagToken
 	case c == '!':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+		return z.nextMarkupDeclaration()
 	case c == '?':
 		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	panic("unreachable")
 }

-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	if z.err != nil {
-		z.tt = ErrorToken
-		return z.tt
-	}
-	z.p0 = z.p1
-	c, err := z.readByte()
-	if err != nil {
-		z.tt, z.err = ErrorToken, err
-		return z.tt
-	}
-	if c == '<' {
-		z.tt, z.err = z.nextTag()
-		return z.tt
-	}
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
 	panic("unreachable")
 }

+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	for {
+		if z.err != nil {
+			z.tt = ErrorToken
+			return z.tt
+		}
+		z.p0 = z.p1
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = ErrorToken, err
+			return z.tt
+		}
+		if c == '<' {
+			z.tt, z.err = z.nextTag()
+			if z.tt == CommentToken && !z.ReturnComments {
+				continue
+			}
+			return z.tt
+		}
+		return z.nextText()
+	}
+	panic("unreachable")
+}
+
 // trim returns the largest j such that z.buf[i:j] contains only white space,
 // or only white space plus the final ">" or "/>" of the raw data.
 func (z *Tokenizer) trim(i int) int {
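
Two things are going on in this pair of hunks: the old body of Next is split out into nextText so that nextMarkupDeclaration can fall back to it when "<!" is not followed by two dashes, and Next itself becomes a loop so that a skipped comment simply continues scanning. With ReturnComments left false (the default, and what the tests use), an input such as abc<b><!-- skipme --></b>def therefore produces only four tokens, abc, <b>, </b> and def, exactly matching the "comment0" golden string in the test file below.
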
@@ -299,12 +353,27 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }

-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
+// The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
-	s := unescape(z.Raw())
-	z.p0 = z.p1
-	return s
+	switch z.tt {
+	case TextToken:
+		s := unescape(z.Raw())
+		z.p0 = z.p1
+		return s
+	case CommentToken:
+		// We trim the "<!--" from the left and the "-->" from the right.
+		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+		i0 := z.p0 + 4
+		i1 := z.p1 - 3
+		z.p0 = z.p1
+		var s []byte
+		if i0 < i1 {
+			s = unescape(z.buf[i0:i1])
+		}
+		return s
+	}
+	return nil
 }

 // TagName returns the lower-cased name of a tag token (the `img` out of
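
The endpoint arithmetic for comments deserves a worked example. For a raw token "<!--x-->" spanning z.buf[p0:p1], i0 = p0+4 skips "<!--" and i1 = p1-3 drops "-->", leaving "x". For the five-byte "<!-->", i0 lands past i1, so the text is nil rather than a nonsense slice. A toy, self-contained version of the same arithmetic (hypothetical, not the package API):

	package main

	import "fmt"

	// commentText trims the comment delimiters from a raw comment token.
	func commentText(raw []byte) []byte {
		i0, i1 := 4, len(raw)-3 // trim "<!--" and "-->"
		if i0 < i1 {
			return raw[i0:i1]
		}
		return nil // "<!-->" and "<!--->" have overlapping endpoints
	}

	func main() {
		fmt.Printf("%q\n", commentText([]byte("<!--x-->"))) // "x"
		fmt.Printf("%q\n", commentText([]byte("<!-->")))    // ""
	}
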
@@ -372,7 +441,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken:
+	case TextToken, CommentToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute


@@ -7,6 +7,7 @@ package html
 import (
 	"bytes"
 	"os"
+	"strings"
 	"testing"
 )
@@ -15,8 +16,8 @@ type tokenTest struct {
 	desc string
 	// The HTML to parse.
 	html string
-	// The string representations of the expected tokens.
-	tokens []string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
 }

var tokenTests = []tokenTest{
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
 	{
 		"text",
 		"foo bar",
-		[]string{
-			"foo bar",
-		},
+		"foo bar",
 	},
 	// An entity.
 	{
 		"entity",
 		"one &lt; two",
-		[]string{
-			"one &lt; two",
-		},
+		"one &lt; two",
 	},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{
 		"tags",
 		"<a>b<c/>d</e>",
-		[]string{
-			"<a>",
-			"b",
-			"<c/>",
-			"d",
-			"</e>",
-		},
+		"<a>$b$<c/>$d$</e>",
 	},
+	// Comments.
+	{
+		"comment0",
+		"abc<b><!-- skipme --></b>def",
+		"abc$<b>$</b>$def",
+	},
+	{
+		"comment1",
+		"a<!-->z",
+		"a$z",
+	},
+	{
+		"comment2",
+		"a<!--->z",
+		"a$z",
+	},
+	{
+		"comment3",
+		"a<!--x>-->z",
+		"a$z",
+	},
+	{
+		"comment4",
+		"a<!--x->-->z",
+		"a$z",
+	},
+	{
+		"comment5",
+		"a<!>z",
+		"a$&lt;!&gt;z",
+	},
+	{
+		"comment6",
+		"a<!->z",
+		"a$&lt;!-&gt;z",
+	},
+	{
+		"comment7",
+		"a<!---<>z",
+		"a$&lt;!---&lt;&gt;z",
+	},
+	{
+		"comment8",
+		"a<!--z",
+		"a$&lt;!--z",
+	},
 	// An attribute with a backslash.
 	{
 		"backslash",
 		`<p id="a\"b">`,
-		[]string{
-			`<p id="a&quot;b">`,
-		},
+		`<p id="a&quot;b">`,
 	},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		[]string{
-			`<p id="a&quot;B" foo="bar">`,
-			"<em>",
-			"te&lt;&amp;;xt",
-			"</em>",
-			"</p>",
-		},
+		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A non-existent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		"noSuchEntity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
-		[]string{
-			`<a b="c&amp;noSuchEntity;d">`,
-			"&lt;&amp;alsoDoesntExist;&amp;",
-		},
+		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
 }
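
The comment cases split into two groups. comment0 through comment4 are terminated comments, and since the tests never set ReturnComments, the comment token vanishes from the golden string entirely. comment5 through comment8 exercise the fallback path: a "<!" that is not followed by two dashes, or a comment that hits end-of-input, is re-scanned as plain text, so its raw bytes reappear escaped on output (for example, a<!>z becomes the two text tokens a and &lt;!&gt;z).
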
@@ -87,7 +113,7 @@ func TestTokenizer(t *testing.T) {
loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-		for i, s := range tt.tokens {
+		for i, s := range strings.Split(tt.golden, "$", -1) {
 			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop
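
A historical footnote on the harness: the three-argument strings.Split is the pre-Go 1 signature, whose trailing -1 means no limit on the number of pieces. Under Go 1 and later, the equivalent call would be spelled:

	for i, s := range strings.Split(tt.golden, "$") { // Go 1 spelling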