From 08a47d6f6087ea2baabca741267a82643d289e92 Mon Sep 17 00:00:00 2001
From: Nigel Tao
Date: Tue, 7 Dec 2010 12:02:36 +1100
Subject: [PATCH] html: first cut at a parser.
R=gri
CC=golang-dev
https://golang.org/cl/3355041
---
src/pkg/html/Makefile | 1 +
src/pkg/html/doc.go | 31 ++-
src/pkg/html/parse.go | 414 +++++++++++++++++++++++++++++++++++++
src/pkg/html/parse_test.go | 153 ++++++++++++++
src/pkg/html/token.go | 82 ++++----
src/pkg/html/token_test.go | 10 +-
6 files changed, 639 insertions(+), 52 deletions(-)
create mode 100644 src/pkg/html/parse.go
create mode 100644 src/pkg/html/parse_test.go
diff --git a/src/pkg/html/Makefile b/src/pkg/html/Makefile
index 4bbd98a9368..00e1c05508d 100644
--- a/src/pkg/html/Makefile
+++ b/src/pkg/html/Makefile
@@ -9,6 +9,7 @@ GOFILES=\
doc.go\
entity.go\
escape.go\
+ parse.go\
token.go\

include ../../Make.pkg
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index 9f5d478b42c..c5338d0781d 100644
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -15,7 +15,7 @@ which parses the next token and returns its type, or an error:
for {
tt := z.Next()
- if tt == html.Error {
+ if tt == html.ErrorToken {
// ...
return ...
}
@@ -34,7 +34,7 @@ Entities (such as "&lt;") are unescaped, tag names and attribute keys are
lower-cased, and attributes are collected into a []Attribute. For example:
for {
- if z.Next() == html.Error {
+ if z.Next() == html.ErrorToken {
// Returning os.EOF indicates success.
return z.Error()
}
@@ -49,15 +49,15 @@ call to Next. For example, to extract an HTML page's anchor text:
for {
tt := z.Next()
switch tt {
- case Error:
+ case ErrorToken:
return z.Error()
- case Text:
+ case TextToken:
if depth > 0 {
// emitBytes should copy the []byte it receives,
// if it doesn't process it immediately.
emitBytes(z.Text())
}
- case StartTag, EndTag:
+ case StartTagToken, EndTagToken:
tn, _ := z.TagName()
if len(tn) == 1 && tn[0] == 'a' {
- if tt == StartTag {
+ if tt == StartTagToken {
@@ -69,6 +69,26 @@ call to Next. For example, to extract an HTML page's anchor text:
}
}
+Parsing is done by calling Parse with an io.Reader, which returns the root of
+the parse tree (the document element) as a *Node. It is the caller's
+responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
+example, to process each anchor node in depth-first order:
+
+ doc, err := html.Parse(r)
+ if err != nil {
+ // ...
+ }
+ var f func(*html.Node)
+ f = func(n *html.Node) {
+ if n.Type == html.ElementNode && n.Data == "a" {
+ // Do something with n...
+ }
+ for _, c := range n.Child {
+ f(c)
+ }
+ }
+ f(doc)
+
The relevant specifications include:
http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
@@ -82,6 +102,5 @@ package html
// node. Specification compliance is verified by checking expected and actual
// outputs over a test suite rather than aiming for algorithmic fidelity.
-// TODO(nigeltao): Implement a parser, not just a tokenizer.
// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
// TODO(nigeltao): How does parsing interact with a JavaScript engine?
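A sketch, not part of the patch, of the full program implied by the doc comment above. It assumes the 2010-era API introduced in this CL (import "html", Node.Child, os.Error); note that this first-cut parser synthesizes the implied skeleton elements even when the input names none of them:

    package main

    import (
        "fmt"
        "html"
        "strings"
    )

    func main() {
        doc, err := html.Parse(strings.NewReader(`<div class="x">hello</div>`))
        if err != nil {
            panic(err)
        }
        // Walk the tree depth-first, printing element names.
        var f func(n *html.Node, depth int)
        f = func(n *html.Node, depth int) {
            if n.Type == html.ElementNode {
                fmt.Println(strings.Repeat("  ", depth), "<"+n.Data+">")
            }
            for _, c := range n.Child {
                f(c, depth+1)
            }
        }
        f(doc, 0)
        // Prints <html>, <head>, <body>, <div>: the parser synthesizes
        // the implied skeleton around the single explicit element.
    }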
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
new file mode 100644
index 00000000000..d3c1f12135f
--- /dev/null
+++ b/src/pkg/html/parse.go
@@ -0,0 +1,414 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "io"
+ "os"
+)
+
+// A NodeType is the type of a Node.
+type NodeType int
+
+const (
+ ErrorNode NodeType = iota
+ TextNode
+ DocumentNode
+ ElementNode
+ CommentNode
+)
+
+// A Node consists of a NodeType and some Data (tag name for element nodes,
+// content for text) and is part of a tree of Nodes. Element nodes may also
+// contain a slice of Attributes. Data is unescaped, so that it looks like
+// "a<b" rather than "a&lt;b".
+type Node struct {
+ Child []*Node
+ Type NodeType
+ Data string
+ Attr []Attribute
+}
+
+// An insertionMode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. It updates the
+// parser's fields, but does not read the next token from the tokenizer.
+// It returns whether the token was consumed.
+type insertionMode func(p *parser) (insertionMode, bool)
+
+// A parser implements the HTML5 parsing algorithm:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
+type parser struct {
+ // tokenizer provides the tokens for the parser.
+ tokenizer *Tokenizer
+ // tok is the most recently read token.
+ tok Token
+ // Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
+ // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
+ // the synthetic start tag and the next one due is the matching end tag.
+ hasSelfClosingToken bool
+ // doc is the document root element.
+ doc *Node
+ // The stack of open elements (section 10.2.3.2).
+ stack []*Node
+ // Element pointers (section 10.2.3.4).
+ head, form *Node
+ // Other parsing state flags (section 10.2.3.5).
+ scripting, framesetOK bool
+}
+
+// pop pops the top of the stack of open elements.
+// It will panic if the stack is empty.
+func (p *parser) pop() *Node {
+ n := len(p.stack)
+ ret := p.stack[n-1]
+ p.stack = p.stack[:n-1]
+ return ret
+}
+
+// push pushes onto the stack of open elements.
+func (p *parser) push(n *Node) {
+ p.stack = append(p.stack, n)
+}
+
+// top returns the top of the stack of open elements.
+// This is also known as the current node.
+func (p *parser) top() *Node {
+ if n := len(p.stack); n > 0 {
+ return p.stack[n-1]
+ }
+ return p.doc
+}
+
+// addChild adds a child node n to the top element, and pushes n
+// if it is an element node (text nodes do not have children).
+func (p *parser) addChild(n *Node) {
+ m := p.top()
+ m.Child = append(m.Child, n)
+ if n.Type == ElementNode {
+ p.push(n)
+ }
+}
+
+// addText adds text to the current node.
+func (p *parser) addText(s string) {
+ // TODO(nigeltao): merge s with previous text, if the preceding node is a text node.
+ // TODO(nigeltao): distinguish whitespace text from others.
+ p.addChild(&Node{
+ Type: TextNode,
+ Data: s,
+ })
+}
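Note that addChild pushes only element nodes, so text nodes are always leaves and never become the current node. An illustrative sequence (hypothetical, assuming p.top() is currently the <body> element):

    p.addChild(&Node{Type: ElementNode, Data: "p"}) // appended to <body> and pushed; p.top() is now the <p>
    p.addText("hi")                                 // TextNode appended under the <p>; the stack is unchanged
    p.pop()                                         // the <p> is closed; p.top() is <body> again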
+
+// Section 10.2.3.3.
+func (p *parser) addFormattingElement(n *Node) {
+ p.addChild(n)
+ // TODO.
+}
+
+// Section 10.2.3.3.
+func (p *parser) reconstructActiveFormattingElements() {
+ // TODO.
+}
+
+// read reads the next token. This is usually from the tokenizer, but it may
+// be the synthesized end tag implied by a self-closing tag.
+func (p *parser) read() os.Error {
+ if p.hasSelfClosingToken {
+ p.hasSelfClosingToken = false
+ p.tok.Type = EndTagToken
+ p.tok.Attr = nil
+ return nil
+ }
+ if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
+ return p.tokenizer.Error()
+ }
+ p.tok = p.tokenizer.Token()
+ if p.tok.Type == SelfClosingTagToken {
+ p.hasSelfClosingToken = true
+ p.tok.Type = StartTagToken
+ }
+ return nil
+}
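A single SelfClosingTagToken from the tokenizer is thus surfaced as two reads. A hypothetical trace for the input <hr/>, using this patch's names:

    p.read() // tokenizer yields SelfClosingTagToken "hr"; the token is
             // re-typed to StartTagToken and hasSelfClosingToken is set
    p.read() // the tokenizer is not consulted: p.tok becomes the synthetic
             // EndTagToken "hr" (with Attr cleared) and the flag is reset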
+
+// Section 10.2.4.
+func (p *parser) acknowledgeSelfClosingTag() {
+ p.hasSelfClosingToken = false
+}
+
+// Section 10.2.5.4.
+func initialInsertionMode(p *parser) (insertionMode, bool) {
+ // TODO(nigeltao): check p.tok for DOCTYPE.
+ return beforeHTMLInsertionMode, false
+}
+
+// Section 10.2.5.5.
+func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
+ var (
+ add bool
+ attr []Attribute
+ implied bool
+ )
+ switch p.tok.Type {
+ case TextToken:
+ // TODO(nigeltao): distinguish whitespace text from others.
+ implied = true
+ case StartTagToken:
+ if p.tok.Data == "html" {
+ add = true
+ attr = p.tok.Attr
+ } else {
+ implied = true
+ }
+ case EndTagToken:
+ // TODO.
+ }
+ if add || implied {
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: "html",
+ Attr: attr,
+ })
+ }
+ return beforeHeadInsertionMode, !implied
+}
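The second return value of an insertionMode is the consumed flag: when a mode merely implies an element, it returns false so the same token is re-processed by its successor. A hypothetical trace for input whose first token is a <head> start tag:

    // p.tok = StartTagToken "head"
    // initialInsertionMode:    (beforeHTMLInsertionMode, false)  token kept
    // beforeHTMLInsertionMode: implied; adds <html> -> (beforeHeadInsertionMode, false)
    // beforeHeadInsertionMode: add; adds <head>     -> (inHeadInsertionMode, true)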
+
+// Section 10.2.5.6.
+func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
+ var (
+ add bool
+ attr []Attribute
+ implied bool
+ )
+ switch p.tok.Type {
+ case TextToken:
+ // TODO(nigeltao): distinguish whitespace text from others.
+ implied = true
+ case StartTagToken:
+ switch p.tok.Data {
+ case "head":
+ add = true
+ attr = p.tok.Attr
+ case "html":
+ // TODO.
+ default:
+ implied = true
+ }
+ case EndTagToken:
+ // TODO.
+ }
+ if add || implied {
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: "head",
+ Attr: attr,
+ })
+ }
+ return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.7.
+func inHeadInsertionMode(p *parser) (insertionMode, bool) {
+ var (
+ pop bool
+ implied bool
+ )
+ switch p.tok.Type {
+ case TextToken:
+ implied = true
+ case StartTagToken:
+ switch p.tok.Data {
+ case "meta":
+ // TODO.
+ case "script":
+ // TODO.
+ default:
+ implied = true
+ }
+ case EndTagToken:
+ if p.tok.Data == "head" {
+ pop = true
+ }
+ // TODO.
+ }
+ if pop || implied {
+ n := p.pop()
+ if n.Data != "head" {
+ panic("html: bad parser state")
+ }
+ return afterHeadInsertionMode, !implied
+ }
+ return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.9.
+func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
+ var (
+ add bool
+ attr []Attribute
+ framesetOK bool
+ implied bool
+ )
+ switch p.tok.Type {
+ case TextToken:
+ implied = true
+ framesetOK = true
+ case StartTagToken:
+ switch p.tok.Data {
+ case "html":
+ // TODO.
+ case "body":
+ add = true
+ attr = p.tok.Attr
+ framesetOK = false
+ case "frameset":
+ // TODO.
+ case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
+ // TODO.
+ case "head":
+ // TODO.
+ default:
+ implied = true
+ framesetOK = true
+ }
+ case EndTagToken:
+ // TODO.
+ }
+ if add || implied {
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: "body",
+ Attr: attr,
+ })
+ p.framesetOK = framesetOK
+ }
+ return inBodyInsertionMode, !implied
+}
+
+// Section 10.2.5.10.
+func inBodyInsertionMode(p *parser) (insertionMode, bool) {
+ var endP bool
+ switch p.tok.Type {
+ case TextToken:
+ p.addText(p.tok.Data)
+ p.framesetOK = false
+ case StartTagToken:
+ switch p.tok.Data {
+ case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
+ // TODO(nigeltao): Do the proper "does the stack of open elements have a p element in button scope" algorithm in section 10.2.3.2.
+ n := p.top()
+ if n.Type == ElementNode && n.Data == "p" {
+ endP = true
+ } else {
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: p.tok.Data,
+ Attr: p.tok.Attr,
+ })
+ }
+ case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
+ p.reconstructActiveFormattingElements()
+ p.addFormattingElement(&Node{
+ Type: ElementNode,
+ Data: p.tok.Data,
+ Attr: p.tok.Attr,
+ })
+ case "area", "br", "embed", "img", "input", "keygen", "wbr":
+ p.reconstructActiveFormattingElements()
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: p.tok.Data,
+ Attr: p.tok.Attr,
+ })
+ p.pop()
+ p.acknowledgeSelfClosingTag()
+ p.framesetOK = false
+ case "hr":
+ // TODO(nigeltao): auto-insert </p> if necessary.
+ p.addChild(&Node{
+ Type: ElementNode,
+ Data: p.tok.Data,
+ Attr: p.tok.Attr,
+ })
+ p.pop()
+ p.acknowledgeSelfClosingTag()
+ p.framesetOK = false
+ default:
+ // TODO.
+ }
+ case EndTagToken:
+ switch p.tok.Data {
+ case "body":
+ // TODO(nigeltao): autoclose the stack of open elements.
+ return afterBodyInsertionMode, true
+ case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
+ // TODO(nigeltao): implement the "adoption agency" algorithm:
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
+ p.pop()
+ default:
+ // TODO.
+ }
+ }
+ if endP {
+ // TODO(nigeltao): do the proper algorithm.
+ n := p.pop()
+ if n.Type != ElementNode || n.Data != "p" {
+ panic("unreachable")
+ }
+ }
+ return inBodyInsertionMode, !endP
+}
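The endP flow stands in for the spec's "p element in button scope" check: a block-level start tag first closes an open <p>, then is re-processed. A hypothetical trace for the input <p>One<div>:

    // StartTagToken "p":   p.top() is <body>, so a <p> is added and pushed
    // TextToken "One":     added as a TextNode under the <p>
    // StartTagToken "div": p.top() is the <p>, so endP: the <p> is popped and
    //                      the token returned unconsumed; on re-processing,
    //                      p.top() is <body> and the <div> is added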
+
+// Section 10.2.5.22.
+func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
+ switch p.tok.Type {
+ case TextToken:
+ // TODO.
+ case StartTagToken:
+ // TODO.
+ case EndTagToken:
+ switch p.tok.Data {
+ case "html":
+ // TODO(nigeltao): autoclose the stack of open elements.
+ return afterAfterBodyInsertionMode, true
+ default:
+ // TODO.
+ }
+ }
+ return afterBodyInsertionMode, true
+}
+
+// Section 10.2.5.25.
+func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) {
+ return inBodyInsertionMode, false
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, os.Error) {
+ p := &parser{
+ tokenizer: NewTokenizer(r),
+ doc: &Node{
+ Type: DocumentNode,
+ },
+ scripting: true,
+ framesetOK: true,
+ }
+ im, consumed := initialInsertionMode, true
+ for {
+ if consumed {
+ if err := p.read(); err != nil {
+ if err == os.EOF {
+ break
+ }
+ return nil, err
+ }
+ }
+ im, consumed = im(p)
+ }
+ // TODO(nigeltao): clean up, depending on the value of im.
+ // The specification's algorithm does clean up on reading an EOF 'token',
+ // but in Go we represent EOF by an os.Error instead.
+ return p.doc, nil
+}
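Taken together, the modes grow the implied skeleton even for structureless input. A sketch of what Parse yields for plain text, rendered in the dump format used by the tests below (illustrative, not a quoted test case):

    doc, _ := Parse(strings.NewReader("Hello"))
    // doc is a DocumentNode whose tree dumps as:
    // | <html>
    // |   <head>
    // |   <body>
    // |     "Hello"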
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
new file mode 100644
index 00000000000..7fa4f427671
--- /dev/null
+++ b/src/pkg/html/parse_test.go
@@ -0,0 +1,153 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "testing"
+)
+
+type devNull struct{}
+
+func (devNull) Write(p []byte) (int, os.Error) {
+ return len(p), nil
+}
+
+func pipeErr(err os.Error) io.Reader {
+ pr, pw := io.Pipe()
+ pw.CloseWithError(err)
+ return pr
+}
+
+func readDat(filename string, c chan io.Reader) {
+ f, err := os.Open("testdata/webkit/"+filename, os.O_RDONLY, 0600)
+ if err != nil {
+ c <- pipeErr(err)
+ return
+ }
+ defer f.Close()
+
+ // Loop through the lines of the file. Each line beginning with "#" denotes
+ // a new section, which is returned as a separate io.Reader.
+ r := bufio.NewReader(f)
+ var pw *io.PipeWriter
+ for {
+ line, err := r.ReadSlice('\n')
+ if err != nil {
+ if pw != nil {
+ pw.CloseWithError(err)
+ pw = nil
+ } else {
+ c <- pipeErr(err)
+ }
+ return
+ }
+ if len(line) == 0 {
+ continue
+ }
+ if line[0] == '#' {
+ if pw != nil {
+ pw.Close()
+ }
+ var pr *io.PipeReader
+ pr, pw = io.Pipe()
+ c <- pr
+ continue
+ }
+ if line[0] != '|' {
+ // Strip the trailing '\n'.
+ line = line[:len(line)-1]
+ }
+ if pw != nil {
+ if _, err := pw.Write(line); err != nil {
+ pw.CloseWithError(err)
+ pw = nil
+ }
+ }
+ }
+}
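The .dat files follow the WebKit/html5lib layout: a "#data" section with the input, an "#errors" section (skipped by the test below), and a "#document" section whose '|'-prefixed lines give the expected tree. An illustrative case in that format (hypothetical, not quoted from tests1.dat):

    #data
    <p>One<p>Two
    #errors
    #document
    | <html>
    |   <head>
    |   <body>
    |     <p>
    |       "One"
    |     <p>
    |       "Two"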
+
+func dumpLevel(w io.Writer, n *Node, level int) os.Error {
+ io.WriteString(w, "| ")
+ for i := 0; i < level; i++ {
+ io.WriteString(w, " ")
+ }
+ switch n.Type {
+ case ErrorNode:
+ return os.NewError("unexpected ErrorNode")
+ case DocumentNode:
+ return os.NewError("unexpected DocumentNode")
+ case ElementNode:
+ fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
+ case TextNode:
+ fmt.Fprintf(w, "%q", EscapeString(n.Data))
+ case CommentNode:
+ return os.NewError("COMMENT")
+ default:
+ return os.NewError("unknown node type")
+ }
+ io.WriteString(w, "\n")
+ for _, c := range n.Child {
+ if err := dumpLevel(w, c, level+1); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func dump(n *Node) (string, os.Error) {
+ if n == nil || len(n.Child) == 0 {
+ return "", nil
+ }
+ if len(n.Child) > 1 {
+ return "too many children", nil
+ }
+ b := bytes.NewBuffer(nil)
+ if err := dumpLevel(b, n.Child[0], 0); err != nil {
+ return "", err
+ }
+ return b.String(), nil
+}
+
+func TestParser(t *testing.T) {
+ // TODO(nigeltao): Process all the .dat files, not just the first one.
+ filenames := []string{
+ "tests1.dat",
+ }
+ for _, filename := range filenames {
+ rc := make(chan io.Reader)
+ go readDat(filename, rc)
+ // TODO(nigeltao): Process all test cases, not just the first three.
+ for i := 0; i < 3; i++ {
+ // Parse the #data section.
+ doc, err := Parse(<-rc)
+ if err != nil {
+ t.Fatal(err)
+ }
+ actual, err := dump(doc)
+ if err != nil {
+ t.Fatal(err)
+ }
+ // Skip the #error section.
+ if _, err := io.Copy(devNull{}, <-rc); err != nil {
+ t.Fatal(err)
+ }
+ // Compare the parsed tree to the #document section.
+ b, err := ioutil.ReadAll(<-rc)
+ if err != nil {
+ t.Fatal(err)
+ }
+ expected := string(b)
+ if actual != expected {
+ t.Errorf("%s test #%d, actual vs expected:\n----\n%s----\n%s----", filename, i, actual, expected)
+ }
+ }
+ }
+}
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 0d4de254308..dc2a6ec5c31 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -15,30 +15,30 @@ import (
type TokenType int
const (
- // Error means that an error occurred during tokenization.
- Error TokenType = iota
- // Text means a text node.
- Text
- // A StartTag looks like <a>.
- StartTag
- // An EndTag looks like </a>.
- EndTag
- // A SelfClosingTag tag looks like <br/>.
- SelfClosingTag
+ // ErrorToken means that an error occurred during tokenization.
+ ErrorToken TokenType = iota
+ // TextToken means a text node.
+ TextToken
+ // A StartTagToken looks like <a>.
+ StartTagToken
+ // An EndTagToken looks like </a>.
+ EndTagToken
+ // A SelfClosingTagToken tag looks like <br/>.
+ SelfClosingTagToken
)
// String returns a string representation of the TokenType.
func (t TokenType) String() string {
switch t {
- case Error:
+ case ErrorToken:
return "Error"
- case Text:
+ case TextToken:
return "Text"
- case StartTag:
+ case StartTagToken:
return "StartTag"
- case EndTag:
+ case EndTagToken:
return "EndTag"
- case SelfClosingTag:
+ case SelfClosingTagToken:
return "SelfClosingTag"
}
return "Invalid(" + strconv.Itoa(int(t)) + ")"
@@ -81,15 +81,15 @@ func (t Token) tagString() string {
// String returns a string representation of the Token.
func (t Token) String() string {
switch t.Type {
- case Error:
+ case ErrorToken:
return ""
- case Text:
+ case TextToken:
return EscapeString(t.Data)
- case StartTag:
+ case StartTagToken:
return "<" + t.tagString() + ">"
- case EndTag:
+ case EndTagToken:
return "" + t.tagString() + ">"
- case SelfClosingTag:
+ case SelfClosingTagToken:
return "<" + t.tagString() + "/>"
}
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
@@ -109,10 +109,10 @@ type Tokenizer struct {
buf []byte
}
-// Error returns the error associated with the most recent Error token. This is
-// typically os.EOF, meaning the end of tokenization.
+// Error returns the error associated with the most recent ErrorToken token.
+// This is typically os.EOF, meaning the end of tokenization.
func (z *Tokenizer) Error() os.Error {
- if z.tt != Error {
+ if z.tt != ErrorToken {
return nil
}
return z.err
@@ -180,40 +180,40 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
c, err := z.readByte()
if err != nil {
- return Error, err
+ return ErrorToken, err
}
switch {
case c == '/':
- tt = EndTag
+ tt = EndTagToken
// Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
- tt = StartTag
+ tt = StartTagToken
case c == '!':
- return Error, os.NewError("html: TODO(nigeltao): implement comments")
+ return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
case c == '?':
- return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+ return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
default:
- return Error, os.NewError("html: TODO(nigeltao): handle malformed tags")
+ return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags")
}
for {
c, err := z.readByte()
if err != nil {
- return Text, err
+ return TextToken, err
}
switch c {
case '"':
err = z.readTo('"')
if err != nil {
- return Text, err
+ return TextToken, err
}
case '\'':
err = z.readTo('\'')
if err != nil {
- return Text, err
+ return TextToken, err
}
case '>':
- if z.buf[z.p1-2] == '/' && tt == StartTag {
- return SelfClosingTag, nil
+ if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+ return SelfClosingTagToken, nil
}
return tt, nil
}
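After the quote-skipping above, nextTag classifies a start tag as self-closing purely by the byte preceding '>'. Illustrative tokenizer calls (hypothetical):

    z := NewTokenizer(strings.NewReader("<br/><br>"))
    z.Next() // SelfClosingTagToken: the byte before '>' is '/'
    z.Next() // StartTagToken: a plain <br>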
@@ -224,13 +224,13 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
if z.err != nil {
- z.tt = Error
+ z.tt = ErrorToken
return z.tt
}
z.p0 = z.p1
c, err := z.readByte()
if err != nil {
- z.tt, z.err = Error, err
+ z.tt, z.err = ErrorToken, err
return z.tt
}
if c == '<' {
@@ -240,15 +240,15 @@ func (z *Tokenizer) Next() TokenType {
for {
c, err := z.readByte()
if err != nil {
- z.tt, z.err = Error, err
+ z.tt, z.err = ErrorToken, err
if err == os.EOF {
- z.tt = Text
+ z.tt = TextToken
}
return z.tt
}
if c == '<' {
z.p1--
- z.tt = Text
+ z.tt = TextToken
return z.tt
}
}
@@ -371,9 +371,9 @@ loop:
func (z *Tokenizer) Token() Token {
t := Token{Type: z.tt}
switch z.tt {
- case Text:
+ case TextToken:
t.Data = string(z.Text())
- case StartTag, EndTag, SelfClosingTag:
+ case StartTagToken, EndTagToken, SelfClosingTagToken:
var attr []Attribute
name, remaining := z.TagName()
for remaining {
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 5759476eab4..7dbe13ddfe6 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -88,7 +88,7 @@ loop:
for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
for i, s := range tt.tokens {
- if z.Next() == Error {
+ if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
continue loop
}
@@ -134,19 +134,19 @@ loop:
for {
tt := z.Next()
switch tt {
- case Error:
+ case ErrorToken:
if z.Error() != os.EOF {
t.Error(z.Error())
}
break loop
- case Text:
+ case TextToken:
if depth > 0 {
result.Write(z.Text())
}
- case StartTag, EndTag:
+ case StartTagToken, EndTagToken:
tn, _ := z.TagName()
if len(tn) == 1 && tn[0] == 'a' {
- if tt == StartTag {
+ if tt == StartTagToken {
depth++
} else {
depth--