html: first cut at a parser.

R=gri CC=golang-dev https://golang.org/cl/3355041
2024-10-03 19:31:21 -06:00 · 2010-12-07 12:02:36 +11:00 · 2010-12-07 12:02:36 +11:00 · 08a47d6f60
commit 08a47d6f60
parent 2fd2991eac
6 changed files with 639 additions and 52 deletions
--- a/src/pkg/html/Makefile
+++ b/src/pkg/html/Makefile
@ -9,6 +9,7 @@ GOFILES=\
 	doc.go\
 	entity.go\
 	escape.go\
 	parse.go\
 	token.go\
 include ../../Make.pkg
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@ -15,7 +15,7 @@ which parses the next token and returns its type, or an error:
 	for {
 		tt := z.Next()
-		if tt == html.Error {
+		if tt == html.ErrorToken {
 			// ...
 			return ...
 		}
@ -34,7 +34,7 @@ Entities (such as "&lt;") are unescaped, tag names and attribute keys are
 lower-cased, and attributes are collected into a []Attribute. For example:
 	for {
-		if z.Next() == html.Error {
+		if z.Next() == html.ErrorToken {
 			// Returning os.EOF indicates success.
 			return z.Error()
 		}
@ -49,15 +49,15 @@ call to Next. For example, to extract an HTML page's anchor text:
 	for {
 		tt := z.Next()
 		switch tt {
-		case Error:
+		case ErrorToken:
 			return z.Error()
-		case Text:
+		case TextToken:
 			if depth > 0 {
 				// emitBytes should copy the []byte it receives,
 				// if it doesn't process it immediately.
 				emitBytes(z.Text())
 			}
-		case StartTag, EndTag:
+		case StartTagToken, EndTagToken:
 			tn, _ := z.TagName()
 			if len(tn) == 1 && tn[0] == 'a' {
 				if tt == StartTag {
@ -69,6 +69,26 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
 example, to process each anchor node in depth-first order:
 	doc, err := html.Parse(r)
 	if err != nil {
 		// ...
 	}
 	var f func(*html.Node)
 	f = func(n *html.Node) {
 		if n.Type == html.ElementNode && n.Data == "a" {
 			// Do something with n...
 		}
 		for _, c := range n.Child {
 			f(c)
 		}
 	}
 	f(doc)
 The relevant specifications include:
 http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
 http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
@ -82,6 +102,5 @@ package html
 // node. Specification compliance is verified by checking expected and actual
 // outputs over a test suite rather than aiming for algorithmic fidelity.
 // TODO(nigeltao): Implement a parser, not just a tokenizer.
 // TODO(nigeltao): Does a DOM API belong in this package or a separate one?
 // TODO(nigeltao): How does parsing interact with a JavaScript engine?
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@ -0,0 +1,414 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package html
 import (
 	"io"
 	"os"
 )
 // A NodeType is the type of a Node.
 type NodeType int
 const (
 	ErrorNode NodeType = iota
 	TextNode
 	DocumentNode
 	ElementNode
 	CommentNode
 )
 // A Node consists of a NodeType and some Data (tag name for element nodes,
 // content for text) and are part of a tree of Nodes. Element nodes may also
 // contain a slice of Attributes. Data is unescaped, so that it looks like
 // "a<b" rather than "a&lt;b".
 type Node struct {
 	Parent *Node
 	Child  []*Node
 	Type   NodeType
 	Data   string
 	Attr   []Attribute
 }
 // An insertion mode (section 10.2.3.1) is the state transition function from
 // a particular state in the HTML5 parser's state machine. In addition to
 // returning the next state, it also returns whether the token was consumed.
 type insertionMode func(*parser) (insertionMode, bool)
 // A parser implements the HTML5 parsing algorithm:
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
 type parser struct {
 	// tokenizer provides the tokens for the parser.
 	tokenizer *Tokenizer
 	// tok is the most recently read token.
 	tok Token
 	// Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
 	// <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
 	// the synthetic start tag and the next one due is the matching end tag.
 	hasSelfClosingToken bool
 	// doc is the document root element.
 	doc *Node
 	// The stack of open elements (section 10.2.3.2).
 	stack []*Node
 	// Element pointers (section 10.2.3.4).
 	head, form *Node
 	// Other parsing state flags (section 10.2.3.5).
 	scripting, framesetOK bool
 }
 // pop pops the top of the stack of open elements.
 // It will panic if the stack is empty.
 func (p *parser) pop() *Node {
 	n := len(p.stack)
 	ret := p.stack[n-1]
 	p.stack = p.stack[:n-1]
 	return ret
 }
 // push pushes onto the stack of open elements.
 func (p *parser) push(n *Node) {
 	p.stack = append(p.stack, n)
 }
 // top returns the top of the stack of open elements.
 // This is also known as the current node.
 func (p *parser) top() *Node {
 	if n := len(p.stack); n > 0 {
 		return p.stack[n-1]
 	}
 	return p.doc
 }
 // addChild adds a child node n to the top element, and pushes n
 // if it is an element node (text nodes do not have children).
 func (p *parser) addChild(n *Node) {
 	m := p.top()
 	m.Child = append(m.Child, n)
 	if n.Type == ElementNode {
 		p.push(n)
 	}
 }
 // addText adds text to the current node.
 func (p *parser) addText(s string) {
 	// TODO(nigeltao): merge s with previous text, if the preceding node is a text node.
 	// TODO(nigeltao): distinguish whitespace text from others.
 	p.addChild(&Node{
 		Type: TextNode,
 		Data: s,
 	})
 }
 // Section 10.2.3.3.
 func (p *parser) addFormattingElement(n *Node) {
 	p.addChild(n)
 	// TODO.
 }
 // Section 10.2.3.3.
 func (p *parser) reconstructActiveFormattingElements() {
 	// TODO.
 }
 // read reads the next token. This is usually from the tokenizer, but it may
 // be the synthesized end tag implied by a self-closing tag.
 func (p *parser) read() os.Error {
 	if p.hasSelfClosingToken {
 		p.hasSelfClosingToken = false
 		p.tok.Type = EndTagToken
 		p.tok.Attr = nil
 		return nil
 	}
 	if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
 		return p.tokenizer.Error()
 	}
 	p.tok = p.tokenizer.Token()
 	if p.tok.Type == SelfClosingTagToken {
 		p.hasSelfClosingToken = true
 		p.tok.Type = StartTagToken
 	}
 	return nil
 }
 // Section 10.2.4.
 func (p *parser) acknowledgeSelfClosingTag() {
 	p.hasSelfClosingToken = false
 }
 // Section 10.2.5.4.
 func initialInsertionMode(p *parser) (insertionMode, bool) {
 	// TODO(nigeltao): check p.tok for DOCTYPE.
 	return beforeHTMLInsertionMode, false
 }
 // Section 10.2.5.5.
 func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
 	var (
 		add     bool
 		attr    []Attribute
 		implied bool
 	)
 	switch p.tok.Type {
 	case TextToken:
 		// TODO(nigeltao): distinguish whitespace text from others.
 		implied = true
 	case StartTagToken:
 		if p.tok.Data == "html" {
 			add = true
 			attr = p.tok.Attr
 		} else {
 			implied = true
 		}
 	case EndTagToken:
 		// TODO.
 	}
 	if add || implied {
 		p.addChild(&Node{
 			Type: ElementNode,
 			Data: "html",
 			Attr: attr,
 		})
 	}
 	return beforeHeadInsertionMode, !implied
 }
 // Section 10.2.5.6.
 func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
 	var (
 		add     bool
 		attr    []Attribute
 		implied bool
 	)
 	switch p.tok.Type {
 	case TextToken:
 		// TODO(nigeltao): distinguish whitespace text from others.
 		implied = true
 	case StartTagToken:
 		switch p.tok.Data {
 		case "head":
 			add = true
 			attr = p.tok.Attr
 		case "html":
 			// TODO.
 		default:
 			implied = true
 		}
 	case EndTagToken:
 		// TODO.
 	}
 	if add || implied {
 		p.addChild(&Node{
 			Type: ElementNode,
 			Data: "head",
 			Attr: attr,
 		})
 	}
 	return inHeadInsertionMode, !implied
 }
 // Section 10.2.5.7.
 func inHeadInsertionMode(p *parser) (insertionMode, bool) {
 	var (
 		pop     bool
 		implied bool
 	)
 	switch p.tok.Type {
 	case TextToken:
 		implied = true
 	case StartTagToken:
 		switch p.tok.Data {
 		case "meta":
 			// TODO.
 		case "script":
 			// TODO.
 		default:
 			implied = true
 		}
 	case EndTagToken:
 		if p.tok.Data == "head" {
 			pop = true
 		}
 		// TODO.
 	}
 	if pop || implied {
 		n := p.pop()
 		if n.Data != "head" {
 			panic("html: bad parser state")
 		}
 		return afterHeadInsertionMode, !implied
 	}
 	return inHeadInsertionMode, !implied
 }
 // Section 10.2.5.9.
 func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
 	var (
 		add        bool
 		attr       []Attribute
 		framesetOK bool
 		implied    bool
 	)
 	switch p.tok.Type {
 	case TextToken:
 		implied = true
 		framesetOK = true
 	case StartTagToken:
 		switch p.tok.Data {
 		case "html":
 			// TODO.
 		case "body":
 			add = true
 			attr = p.tok.Attr
 			framesetOK = false
 		case "frameset":
 			// TODO.
 		case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
 			// TODO.
 		case "head":
 			// TODO.
 		default:
 			implied = true
 			framesetOK = true
 		}
 	case EndTagToken:
 		// TODO.
 	}
 	if add || implied {
 		p.addChild(&Node{
 			Type: ElementNode,
 			Data: "body",
 			Attr: attr,
 		})
 		p.framesetOK = framesetOK
 	}
 	return inBodyInsertionMode, !implied
 }
 // Section 10.2.5.10.
 func inBodyInsertionMode(p *parser) (insertionMode, bool) {
 	var endP bool
 	switch p.tok.Type {
 	case TextToken:
 		p.addText(p.tok.Data)
 		p.framesetOK = false
 	case StartTagToken:
 		switch p.tok.Data {
 		case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
 			// TODO(nigeltao): Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2.
 			n := p.top()
 			if n.Type == ElementNode && n.Data == "p" {
 				endP = true
 			} else {
 				p.addChild(&Node{
 					Type: ElementNode,
 					Data: p.tok.Data,
 					Attr: p.tok.Attr,
 				})
 			}
 		case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
 			p.reconstructActiveFormattingElements()
 			p.addFormattingElement(&Node{
 				Type: ElementNode,
 				Data: p.tok.Data,
 				Attr: p.tok.Attr,
 			})
 		case "area", "br", "embed", "img", "input", "keygen", "wbr":
 			p.reconstructActiveFormattingElements()
 			p.addChild(&Node{
 				Type: ElementNode,
 				Data: p.tok.Data,
 				Attr: p.tok.Attr,
 			})
 			p.pop()
 			p.acknowledgeSelfClosingTag()
 			p.framesetOK = false
 		case "hr":
 			// TODO(nigeltao): auto-insert </p> if necessary.
 			p.addChild(&Node{
 				Type: ElementNode,
 				Data: p.tok.Data,
 				Attr: p.tok.Attr,
 			})
 			p.pop()
 			p.acknowledgeSelfClosingTag()
 			p.framesetOK = false
 		default:
 			// TODO.
 		}
 	case EndTagToken:
 		switch p.tok.Data {
 		case "body":
 			// TODO(nigeltao): autoclose the stack of open elements.
 			return afterBodyInsertionMode, true
 		case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
 			// TODO(nigeltao): implement the "adoption agency" algorithm:
 			// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
 			p.pop()
 		default:
 			// TODO.
 		}
 	}
 	if endP {
 		// TODO(nigeltao): do the proper algorithm.
 		n := p.pop()
 		if n.Type != ElementNode || n.Data != "p" {
 			panic("unreachable")
 		}
 	}
 	return inBodyInsertionMode, !endP
 }
 // Section 10.2.5.22.
 func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
 	switch p.tok.Type {
 	case TextToken:
 		// TODO.
 	case StartTagToken:
 		// TODO.
 	case EndTagToken:
 		switch p.tok.Data {
 		case "html":
 			// TODO(nigeltao): autoclose the stack of open elements.
 			return afterAfterBodyInsertionMode, true
 		default:
 			// TODO.
 		}
 	}
 	return afterBodyInsertionMode, true
 }
 // Section 10.2.5.25.
 func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) {
 	return inBodyInsertionMode, false
 }
 // Parse returns the parse tree for the HTML from the given Reader.
 // The input is assumed to be UTF-8 encoded.
 func Parse(r io.Reader) (*Node, os.Error) {
 	p := &parser{
 		tokenizer: NewTokenizer(r),
 		doc: &Node{
 			Type: DocumentNode,
 		},
 		scripting:  true,
 		framesetOK: true,
 	}
 	im, consumed := initialInsertionMode, true
 	for {
 		if consumed {
 			if err := p.read(); err != nil {
 				if err == os.EOF {
 					break
 				}
 				return nil, err
 			}
 		}
 		im, consumed = im(p)
 	}
 	// TODO(nigeltao): clean up, depending on the value of im.
 	// The specification's algorithm does clean up on reading an EOF 'token',
 	// but in go we represent EOF by an os.Error instead.
 	return p.doc, nil
 }
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@ -0,0 +1,153 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package html
 import (
 	"bufio"
 	"bytes"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"os"
 	"testing"
 )
 type devNull struct{}
 func (devNull) Write(p []byte) (int, os.Error) {
 	return len(p), nil
 }
 func pipeErr(err os.Error) io.Reader {
 	pr, pw := io.Pipe()
 	pw.CloseWithError(err)
 	return pr
 }
 func readDat(filename string, c chan io.Reader) {
 	f, err := os.Open("testdata/webkit/"+filename, os.O_RDONLY, 0600)
 	if err != nil {
 		c <- pipeErr(err)
 		return
 	}
 	defer f.Close()
 	// Loop through the lines of the file. Each line beginning with "#" denotes
 	// a new section, which is returned as a separate io.Reader.
 	r := bufio.NewReader(f)
 	var pw *io.PipeWriter
 	for {
 		line, err := r.ReadSlice('\n')
 		if err != nil {
 			if pw != nil {
 				pw.CloseWithError(err)
 				pw = nil
 			} else {
 				c <- pipeErr(err)
 			}
 			return
 		}
 		if len(line) == 0 {
 			continue
 		}
 		if line[0] == '#' {
 			if pw != nil {
 				pw.Close()
 			}
 			var pr *io.PipeReader
 			pr, pw = io.Pipe()
 			c <- pr
 			continue
 		}
 		if line[0] != '|' {
 			// Strip the trailing '\n'.
 			line = line[:len(line)-1]
 		}
 		if pw != nil {
 			if _, err := pw.Write(line); err != nil {
 				pw.CloseWithError(err)
 				pw = nil
 			}
 		}
 	}
 }
 func dumpLevel(w io.Writer, n *Node, level int) os.Error {
 	io.WriteString(w, "| ")
 	for i := 0; i < level; i++ {
 		io.WriteString(w, "  ")
 	}
 	switch n.Type {
 	case ErrorNode:
 		return os.NewError("unexpected ErrorNode")
 	case DocumentNode:
 		return os.NewError("unexpected DocumentNode")
 	case ElementNode:
 		fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
 	case TextNode:
 		fmt.Fprintf(w, "%q", EscapeString(n.Data))
 	case CommentNode:
 		return os.NewError("COMMENT")
 	default:
 		return os.NewError("unknown node type")
 	}
 	io.WriteString(w, "\n")
 	for _, c := range n.Child {
 		if err := dumpLevel(w, c, level+1); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 func dump(n *Node) (string, os.Error) {
 	if n == nil || len(n.Child) == 0 {
 		return "", nil
 	}
 	if len(n.Child) > 1 {
 		return "too many children", nil
 	}
 	b := bytes.NewBuffer(nil)
 	if err := dumpLevel(b, n.Child[0], 0); err != nil {
 		return "", err
 	}
 	return b.String(), nil
 }
 func TestParser(t *testing.T) {
 	// TODO(nigeltao): Process all the .dat files, not just the first one.
 	filenames := []string{
 		"tests1.dat",
 	}
 	for _, filename := range filenames {
 		rc := make(chan io.Reader)
 		go readDat(filename, rc)
 		// TODO(nigeltao): Process all test cases, not just the first three.
 		for i := 0; i < 3; i++ {
 			// Parse the #data section.
 			doc, err := Parse(<-rc)
 			if err != nil {
 				t.Fatal(err)
 			}
 			actual, err := dump(doc)
 			if err != nil {
 				t.Fatal(err)
 			}
 			// Skip the #error section.
 			if _, err := io.Copy(devNull{}, <-rc); err != nil {
 				t.Fatal(err)
 			}
 			// Compare the parsed tree to the #document section.
 			b, err := ioutil.ReadAll(<-rc)
 			if err != nil {
 				t.Fatal(err)
 			}
 			expected := string(b)
 			if actual != expected {
 				t.Errorf("%s test #%d, actual vs expected:\n----\n%s----\n%s----", filename, i, actual, expected)
 			}
 		}
 	}
 }
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@ -15,30 +15,30 @@ import (
 type TokenType int
 const (
-	// Error means that an error occurred during tokenization.
+	// ErrorToken means that an error occurred during tokenization.
-	Error TokenType = iota
+	ErrorToken TokenType = iota
-	// Text means a text node.
+	// TextToken means a text node.
-	Text
+	TextToken
-	// A StartTag looks like <a>.
+	// A StartTagToken looks like <a>.
-	StartTag
+	StartTagToken
-	// An EndTag looks like </a>.
+	// An EndTagToken looks like </a>.
-	EndTag
+	EndTagToken
-	// A SelfClosingTag tag looks like <br/>.
+	// A SelfClosingTagToken tag looks like <br/>.
-	SelfClosingTag
+	SelfClosingTagToken
 )
 // String returns a string representation of the TokenType.
 func (t TokenType) String() string {
 	switch t {
-	case Error:
+	case ErrorToken:
 		return "Error"
-	case Text:
+	case TextToken:
 		return "Text"
-	case StartTag:
+	case StartTagToken:
 		return "StartTag"
-	case EndTag:
+	case EndTagToken:
 		return "EndTag"
-	case SelfClosingTag:
+	case SelfClosingTagToken:
 		return "SelfClosingTag"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
@ -81,15 +81,15 @@ func (t Token) tagString() string {
 // String returns a string representation of the Token.
 func (t Token) String() string {
 	switch t.Type {
-	case Error:
+	case ErrorToken:
 		return ""
-	case Text:
+	case TextToken:
 		return EscapeString(t.Data)
-	case StartTag:
+	case StartTagToken:
 		return "<" + t.tagString() + ">"
-	case EndTag:
+	case EndTagToken:
 		return "</" + t.tagString() + ">"
-	case SelfClosingTag:
+	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
@ -109,10 +109,10 @@ type Tokenizer struct {
 	buf    []byte
 }
-// Error returns the error associated with the most recent Error token. This is
+// Error returns the error associated with the most recent ErrorToken token.
-// typically os.EOF, meaning the end of tokenization.
+// This is typically os.EOF, meaning the end of tokenization.
 func (z *Tokenizer) Error() os.Error {
-	if z.tt != Error {
+	if z.tt != ErrorToken {
 		return nil
 	}
 	return z.err
@ -180,40 +180,40 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
 	if err != nil {
-		return Error, err
+		return ErrorToken, err
 	}
 	switch {
 	case c == '/':
-		tt = EndTag
+		tt = EndTagToken
 	// Lower-cased characters are more common in tag names, so we check for them first.
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		tt = StartTag
+		tt = StartTagToken
 	case c == '!':
-		return Error, os.NewError("html: TODO(nigeltao): implement comments")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
 	case c == '?':
-		return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
-		return Error, os.NewError("html: TODO(nigeltao): handle malformed tags")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags")
 	}
 	for {
 		c, err := z.readByte()
 		if err != nil {
-			return Text, err
+			return TextToken, err
 		}
 		switch c {
 		case '"':
 			err = z.readTo('"')
 			if err != nil {
-				return Text, err
+				return TextToken, err
 			}
 		case '\'':
 			err = z.readTo('\'')
 			if err != nil {
-				return Text, err
+				return TextToken, err
 			}
 		case '>':
-			if z.buf[z.p1-2] == '/' && tt == StartTag {
+			if z.buf[z.p1-2] == '/' && tt == StartTagToken {
-				return SelfClosingTag, nil
+				return SelfClosingTagToken, nil
 			}
 			return tt, nil
 		}
@ -224,13 +224,13 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
 	if z.err != nil {
-		z.tt = Error
+		z.tt = ErrorToken
 		return z.tt
 	}
 	z.p0 = z.p1
 	c, err := z.readByte()
 	if err != nil {
-		z.tt, z.err = Error, err
+		z.tt, z.err = ErrorToken, err
 		return z.tt
 	}
 	if c == '<' {
@ -240,15 +240,15 @@ func (z *Tokenizer) Next() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
-			z.tt, z.err = Error, err
+			z.tt, z.err = ErrorToken, err
 			if err == os.EOF {
-				z.tt = Text
+				z.tt = TextToken
 			}
 			return z.tt
 		}
 		if c == '<' {
 			z.p1--
-			z.tt = Text
+			z.tt = TextToken
 			return z.tt
 		}
 	}
@ -371,9 +371,9 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case Text:
+	case TextToken:
 		t.Data = string(z.Text())
-	case StartTag, EndTag, SelfClosingTag:
+	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
 		name, remaining := z.TagName()
 		for remaining {
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@ -88,7 +88,7 @@ loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
 		for i, s := range tt.tokens {
-			if z.Next() == Error {
+			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop
 			}
@ -134,19 +134,19 @@ loop:
 	for {
 		tt := z.Next()
 		switch tt {
-		case Error:
+		case ErrorToken:
 			if z.Error() != os.EOF {
 				t.Error(z.Error())
 			}
 			break loop
-		case Text:
+		case TextToken:
 			if depth > 0 {
 				result.Write(z.Text())
 			}
-		case StartTag, EndTag:
+		case StartTagToken, EndTagToken:
 			tn, _ := z.TagName()
 			if len(tn) == 1 && tn[0] == 'a' {
-				if tt == StartTag {
+				if tt == StartTagToken {
 					depth++
 				} else {
 					depth--