From 18b025d530b2410c74c094c0e78671570c60b7bd Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Tue, 25 Oct 2011 11:28:07 +1100 Subject: [PATCH] html: remove the Tokenizer.ReturnComments option. The original intention was to simplify the parser, in making it skip all comment tokens. However, checking that the Go html package is 100% compatible with the WebKit HTML test suite requires parsing the comments. There is no longer any real benefit for the option. R=gri, andybalholm CC=golang-dev https://golang.org/cl/5321043 --- src/pkg/html/doc.go | 3 --- src/pkg/html/parse.go | 1 - src/pkg/html/token.go | 50 ++++++++++++++++---------------------- src/pkg/html/token_test.go | 1 - 4 files changed, 21 insertions(+), 34 deletions(-) diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go index 5bc0630861a..ba9d188486f 100644 --- a/src/pkg/html/doc.go +++ b/src/pkg/html/doc.go @@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text: } } -A Tokenizer typically skips over HTML comments. To return comment tokens, set -Tokenizer.ReturnComments to true before looping over calls to Next. - Parsing is done by calling Parse with an io.Reader, which returns the root of the parse tree (the document element) as a *Node. It is the caller's responsibility to ensure that the Reader provides UTF-8 encoded HTML. For diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index 2c7294b4f3e..d1d4e483c53 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) { scripting: true, framesetOK: true, } - p.tokenizer.ReturnComments = true // Iterate until EOF. Any other error will cause an early return. im, consumed := initialIM, true for { diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index 2826f95f17f..952d17468bd 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -116,10 +116,6 @@ type span struct { // A Tokenizer returns a stream of HTML Tokens. type Tokenizer struct { - // If ReturnComments is set, Next returns comment tokens; - // otherwise it skips over comments (default). - ReturnComments bool - // r is the source of the HTML text. r io.Reader // tt is the TokenType of the current token. @@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() { } } -// next scans the next token and returns its type. -func (z *Tokenizer) next() TokenType { +// Next scans the next token and returns its type. +func (z *Tokenizer) Next() TokenType { if z.err != nil { - return ErrorToken + z.tt = ErrorToken + return z.tt } z.raw.start = z.raw.end z.data.start = z.raw.end z.data.end = z.raw.end if z.rawTag != "" { z.readRawOrRCDATA() - return TextToken + z.tt = TextToken + return z.tt } z.textIsRaw = false @@ -596,11 +594,13 @@ loop: if x := z.raw.end - len("