html: remove the Tokenizer.ReturnComments option.
The original intention was to simplify the parser by making it skip all
comment tokens. However, checking that the Go html package is 100%
compatible with the WebKit HTML test suite requires parsing the comments.
There is no longer any real benefit to the option.

R=gri, andybalholm
CC=golang-dev
https://golang.org/cl/5321043
parent 5791233461
commit 18b025d530
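With ReturnComments gone, Next always surfaces comment tokens, and a caller that wants the old skip-comments behaviour simply ignores them in its token loop. Below is a minimal sketch of such a loop; it assumes today's golang.org/x/net/html import path (the package lived at plain "html" in the main tree when this change landed) and a made-up input string.

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html" // assumed modern import path; the 2011 tree used plain "html"
)

func main() {
	const page = `<p>hello<!-- a comment -->world</p>` // made-up input for illustration
	z := html.NewTokenizer(strings.NewReader(page))
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Err returns io.EOF once the input is exhausted.
			fmt.Println("done:", z.Err())
			return
		case html.CommentToken:
			// Comment tokens are now always returned; skip them
			// explicitly if they are not wanted.
			continue
		case html.TextToken:
			fmt.Printf("text: %q\n", z.Text())
		}
	}
}
```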
@@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}

-A Tokenizer typically skips over HTML comments. To return comment tokens, set
-Tokenizer.ReturnComments to true before looping over calls to Next.
-
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
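The unchanged doc lines above also describe the Parse entry point. For context, here is a small sketch of that API as it exists today under golang.org/x/net/html; the Node layout has changed since 2011, so this illustrates the description rather than the exact code in this commit, and the input string is made up.

```go
package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html" // assumed modern import path
)

func main() {
	// Parse reads UTF-8 encoded HTML from an io.Reader and returns the
	// root of the parse tree as a *html.Node.
	doc, err := html.Parse(strings.NewReader(`<a href="/">home</a>`)) // made-up input
	if err != nil {
		log.Fatal(err)
	}
	// Walk the tree and print the name of every element node.
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			fmt.Println(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}
```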
@@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) {
 		scripting:  true,
 		framesetOK: true,
 	}
-	p.tokenizer.ReturnComments = true
 	// Iterate until EOF. Any other error will cause an early return.
 	im, consumed := initialIM, true
 	for {
@@ -116,10 +116,6 @@ type span struct {

 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
-	// If ReturnComments is set, Next returns comment tokens;
-	// otherwise it skips over comments (default).
-	ReturnComments bool
-
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the current token.
@@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }

-// next scans the next token and returns its type.
-func (z *Tokenizer) next() TokenType {
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
 	if z.err != nil {
-		return ErrorToken
+		z.tt = ErrorToken
+		return z.tt
 	}
 	z.raw.start = z.raw.end
 	z.data.start = z.raw.end
 	z.data.end = z.raw.end
 	if z.rawTag != "" {
 		z.readRawOrRCDATA()
-		return TextToken
+		z.tt = TextToken
+		return z.tt
 	}
 	z.textIsRaw = false

@@ -596,11 +594,13 @@ loop:
 		if x := z.raw.end - len("<a"); z.raw.start < x {
 			z.raw.end = x
 			z.data.end = x
-			return TextToken
+			z.tt = TextToken
+			return z.tt
 		}
 		switch tokenType {
 		case StartTagToken:
-			return z.readStartTag()
+			z.tt = z.readStartTag()
+			return z.tt
 		case EndTagToken:
 			c = z.readByte()
 			if z.err != nil {
@@ -616,39 +616,31 @@ loop:
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 				z.readEndTag()
-				return EndTagToken
+				z.tt = EndTagToken
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		case CommentToken:
 			if c == '!' {
-				return z.readMarkupDeclaration()
+				z.tt = z.readMarkupDeclaration()
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		}
 	}
 	if z.raw.start < z.raw.end {
 		z.data.end = z.raw.end
-		return TextToken
-	}
-	return ErrorToken
-}
-
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	for {
-		z.tt = z.next()
-		// TODO: remove the ReturnComments option. A tokenizer should
-		// always return comment tags.
-		if z.tt == CommentToken && !z.ReturnComments {
-			continue
-		}
+		z.tt = TextToken
 		return z.tt
 	}
-	panic("unreachable")
+	z.tt = ErrorToken
+	return z.tt
 }

 // Raw returns the unmodified text of the current token. Calling Next, Token,
@@ -424,7 +424,6 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(strings.NewReader(tt.html))
-		z.ReturnComments = true
 		if tt.golden != "" {
 			for i, s := range strings.Split(tt.golden, "$") {
 				if z.Next() == ErrorToken {