Mirror of https://github.com/golang/go
html: remove the Tokenizer.ReturnComments option.

The original intention was to simplify the parser by making it skip all
comment tokens. However, checking that the Go html package is 100%
compatible with the WebKit HTML test suite requires parsing the comments.
There is no longer any real benefit to the option.

R=gri, andybalholm
CC=golang-dev
https://golang.org/cl/5321043
This commit is contained in:
parent 5791233461
commit 18b025d530
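Below is a minimal sketch of what the change means for callers: Next now always
returns comment tokens, so code that cares about comments handles CommentToken
explicitly, and code that does not simply ignores it. The sketch assumes the
package's current import path, golang.org/x/net/html (at the time of this
commit the package lived in the main Go tree), but the Tokenizer API shown is
the same.

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		const page = `<p>hello <!-- a comment --> world</p>`
		z := html.NewTokenizer(strings.NewReader(page))
		for {
			switch z.Next() {
			case html.ErrorToken:
				// Err reports io.EOF at the normal end of input.
				return
			case html.CommentToken:
				// Comment tokens are now always delivered.
				fmt.Printf("comment: %q\n", z.Token().Data)
			case html.TextToken:
				fmt.Printf("text:    %q\n", z.Token().Data)
			}
		}
	}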
@@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 
-A Tokenizer typically skips over HTML comments. To return comment tokens, set
-Tokenizer.ReturnComments to true before looping over calls to Next.
-
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
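The doc.go context above describes the Parse entry point. As a usage sketch
(again assuming the current golang.org/x/net/html import path), the caller
hands Parse a UTF-8 io.Reader and walks the resulting *Node tree, here to
collect anchor hrefs:

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		// Parse expects UTF-8 encoded HTML and returns the document root.
		doc, err := html.Parse(strings.NewReader(`<html><body><a href="/">home</a></body></html>`))
		if err != nil {
			panic(err)
		}
		// Walk the tree and print every anchor's href attribute.
		var visit func(n *html.Node)
		visit = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "a" {
				for _, a := range n.Attr {
					if a.Key == "href" {
						fmt.Println("href:", a.Val)
					}
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				visit(c)
			}
		}
		visit(doc)
	}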
@@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) {
 		scripting:  true,
 		framesetOK: true,
 	}
-	p.tokenizer.ReturnComments = true
 	// Iterate until EOF. Any other error will cause an early return.
 	im, consumed := initialIM, true
 	for {
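The line removed above is why the parser needed the option in the first place:
Parse consumes comment tokens and stores them in the tree as CommentNode
entries. A small sketch (again with golang.org/x/net/html) showing that
comments survive parsing:

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		doc, err := html.Parse(strings.NewReader("<p>hi<!-- keep me --></p>"))
		if err != nil {
			panic(err)
		}
		// Comments are preserved in the parse tree as CommentNode entries,
		// which is why the parser needs the tokenizer to hand them over.
		var walk func(n *html.Node)
		walk = func(n *html.Node) {
			if n.Type == html.CommentNode {
				fmt.Printf("comment node: %q\n", n.Data)
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				walk(c)
			}
		}
		walk(doc)
	}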
@@ -116,10 +116,6 @@ type span struct {
 
 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
-	// If ReturnComments is set, Next returns comment tokens;
-	// otherwise it skips over comments (default).
-	ReturnComments bool
-
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the current token.
@@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }
 
-// next scans the next token and returns its type.
-func (z *Tokenizer) next() TokenType {
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
 	if z.err != nil {
-		return ErrorToken
+		z.tt = ErrorToken
+		return z.tt
 	}
 	z.raw.start = z.raw.end
 	z.data.start = z.raw.end
 	z.data.end = z.raw.end
 	if z.rawTag != "" {
 		z.readRawOrRCDATA()
-		return TextToken
+		z.tt = TextToken
+		return z.tt
 	}
 	z.textIsRaw = false
 
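With next merged into the exported Next, every exit path records its result in
z.tt before returning, so later accessors such as Raw and Token refer to the
token Next just scanned. A hedged usage sketch (golang.org/x/net/html; Err is
the current name of the error accessor):

	package main

	import (
		"fmt"
		"io"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		z := html.NewTokenizer(strings.NewReader("<b>bold</b>"))
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				// At end of input Next reports ErrorToken and Err returns io.EOF.
				if z.Err() == io.EOF {
					fmt.Println("done")
					return
				}
				panic(z.Err())
			}
			// Raw returns the unmodified text of the token Next just scanned.
			fmt.Printf("%s: %q\n", tt, z.Raw())
		}
	}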
@@ -596,11 +594,13 @@ loop:
 		if x := z.raw.end - len("<a"); z.raw.start < x {
 			z.raw.end = x
 			z.data.end = x
-			return TextToken
+			z.tt = TextToken
+			return z.tt
 		}
 		switch tokenType {
 		case StartTagToken:
-			return z.readStartTag()
+			z.tt = z.readStartTag()
+			return z.tt
 		case EndTagToken:
 			c = z.readByte()
 			if z.err != nil {
@@ -616,39 +616,31 @@ loop:
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 				z.readEndTag()
-				return EndTagToken
+				z.tt = EndTagToken
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		case CommentToken:
 			if c == '!' {
-				return z.readMarkupDeclaration()
+				z.tt = z.readMarkupDeclaration()
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		}
 	}
 	if z.raw.start < z.raw.end {
 		z.data.end = z.raw.end
-		return TextToken
-	}
-	return ErrorToken
-}
-
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	for {
-		z.tt = z.next()
-		// TODO: remove the ReturnComments option. A tokenizer should
-		// always return comment tags.
-		if z.tt == CommentToken && !z.ReturnComments {
-			continue
-		}
+		z.tt = TextToken
 		return z.tt
 	}
-	panic("unreachable")
+	z.tt = ErrorToken
+	return z.tt
 }
 
 // Raw returns the unmodified text of the current token. Calling Next, Token,
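The wrapper deleted above is easy to reproduce in caller code. A sketch of an
equivalent helper for callers that still want comments skipped (the
nextNonComment name is illustrative, not part of the package):

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	// nextNonComment mirrors the loop deleted above: now that Next always
	// returns comment tokens, a caller that wants the old skipping
	// behaviour filters them out itself.
	func nextNonComment(z *html.Tokenizer) html.TokenType {
		for {
			if tt := z.Next(); tt != html.CommentToken {
				return tt
			}
		}
	}

	func main() {
		z := html.NewTokenizer(strings.NewReader("<!-- hi --><p>text</p>"))
		for {
			tt := nextNonComment(z)
			if tt == html.ErrorToken {
				return
			}
			fmt.Printf("%s: %q\n", tt, z.Raw())
		}
	}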
@@ -424,7 +424,6 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(strings.NewReader(tt.html))
-		z.ReturnComments = true
 		if tt.golden != "" {
 			for i, s := range strings.Split(tt.golden, "$") {
 				if z.Next() == ErrorToken {
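As the test change shows, the tokenizer tests no longer need to opt in to
comments. A hedged sketch of the same expectation as a small standalone test
(external test package and the golang.org/x/net/html import are assumptions):

	package html_test

	import (
		"strings"
		"testing"

		"golang.org/x/net/html"
	)

	// TestCommentAlwaysReturned checks the new default: a comment token is
	// delivered without any option being set.
	func TestCommentAlwaysReturned(t *testing.T) {
		z := html.NewTokenizer(strings.NewReader("a<!--b-->c"))
		want := []html.TokenType{html.TextToken, html.CommentToken, html.TextToken}
		for i, w := range want {
			if got := z.Next(); got != w {
				t.Fatalf("token %d: got %v, want %v", i, got, w)
			}
		}
	}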