exp/html: make the tokenizer return atoms for tag tokens.

This is part 1 of a 2 part changelist. Part 2 contains the mechanical change to parse.go to compare atoms (ints) instead of strings. The overall effect of the two changes are: benchmark old ns/op new ns/op delta BenchmarkParser 4462274 4058254 -9.05% BenchmarkRawLevelTokenizer 913202 912917 -0.03% BenchmarkLowLevelTokenizer 1268626 1267836 -0.06% BenchmarkHighLevelTokenizer 1947305 1968944 +1.11% R=rsc CC=andybalholm, golang-dev, r https://golang.org/cl/6305053
2024-11-24 23:27:57 -07:00 · 2012-06-07 13:05:35 +10:00 · 2012-06-07 13:05:35 +10:00 · cd21eff705
commit cd21eff705
parent 6423682019
4 changed files with 57 additions and 36 deletions
--- a/src/pkg/exp/html/node.go
+++ b/src/pkg/exp/html/node.go
@ -4,8 +4,12 @@

 package html

+import (
+	"exp/html/atom"
+)
+
 // A NodeType is the type of a Node.
-type NodeType int
+type NodeType uint32

 const (
 	ErrorNode NodeType = iota
@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
 // A Node consists of a NodeType and some Data (tag name for element nodes,
 // content for text) and are part of a tree of Nodes. Element nodes may also
 // have a Namespace and contain a slice of Attributes. Data is unescaped, so
-// that it looks like "a<b" rather than "a&lt;b".
+// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
+// is the atom for Data, or zero if Data is not a known tag name.
 //
 // An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
 // Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
@ -34,6 +39,7 @@ type Node struct {
 	Parent    *Node
 	Child     []*Node
 	Type      NodeType
+	DataAtom  atom.Atom
 	Data      string
 	Namespace string
 	Attr      []Attribute
@ -84,6 +90,7 @@ func reparentChildren(dst, src *Node) {
 func (n *Node) clone() *Node {
 	m := &Node{
 		Type:     n.Type,
+		DataAtom: n.DataAtom,
 		Data:     n.Data,
 		Attr:     make([]Attribute, len(n.Attr)),
 	}
--- a/src/pkg/exp/html/parse.go
+++ b/src/pkg/exp/html/parse.go
@ -5,6 +5,7 @@
 package html

 import (
+	a "exp/html/atom"
 	"io"
 	"strings"
 )
@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
 func (p *parser) addElement(tag string, attr []Attribute) {
 	p.addChild(&Node{
 		Type: ElementNode,
-		Data: tag,
+		Data: tag, // TODO: also set DataAtom.
 		Attr: attr,
 	})
 }
@ -310,9 +311,9 @@ findIdenticalElements:
 			continue
 		}
 	compareAttributes:
-		for _, a := range n.Attr {
-			for _, b := range attr {
-				if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val {
+		for _, t0 := range n.Attr {
+			for _, t1 := range attr {
+				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
 					// Found a match for this attribute, continue with the next attribute.
 					continue compareAttributes
 				}
@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
 		return
 	}
 	attr := map[string]string{}
-	for _, a := range dst.Attr {
-		attr[a.Key] = a.Val
+	for _, t := range dst.Attr {
+		attr[t.Key] = t.Val
 	}
-	for _, a := range src.Attr {
-		if _, ok := attr[a.Key]; !ok {
-			dst.Attr = append(dst.Attr, a)
-			attr[a.Key] = a.Val
+	for _, t := range src.Attr {
+		if _, ok := attr[t.Key]; !ok {
+			dst.Attr = append(dst.Attr, t)
+			attr[t.Key] = t.Val
 		}
 	}
 }
@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
 			p.oe.pop()
 			p.acknowledgeSelfClosingTag()
 			if p.tok.Data == "input" {
-				for _, a := range p.tok.Attr {
-					if a.Key == "type" {
-						if strings.ToLower(a.Val) == "hidden" {
+				for _, t := range p.tok.Attr {
+					if t.Key == "type" {
+						if strings.ToLower(t.Val) == "hidden" {
 							// Skip setting framesetOK = false
 							return true
 						}
@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
 			action := ""
 			prompt := "This is a searchable index. Enter search keywords: "
 			attr := []Attribute{{Key: "name", Val: "isindex"}}
-			for _, a := range p.tok.Attr {
-				switch a.Key {
+			for _, t := range p.tok.Attr {
+				switch t.Key {
 				case "action":
-					action = a.Val
+					action = t.Val
 				case "name":
 					// Ignore the attribute.
 				case "prompt":
-					prompt = a.Val
+					prompt = t.Val
 				default:
-					attr = append(attr, a)
+					attr = append(attr, t)
 				}
 			}
 			p.acknowledgeSelfClosingTag()
@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
 		case "style", "script":
 			return inHeadIM(p)
 		case "input":
-			for _, a := range p.tok.Attr {
-				if a.Key == "type" && strings.ToLower(a.Val) == "hidden" {
+			for _, t := range p.tok.Attr {
+				if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
 					p.addElement(p.tok.Data, p.tok.Attr)
 					p.oe.pop()
 					return true
@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
 			// Adjust SVG tag names. The tokenizer lower-cases tag names, but
 			// SVG wants e.g. "foreignObject" with a capital second "O".
 			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
+				p.tok.DataAtom = a.Lookup([]byte(x))
 				p.tok.Data = x
 			}
 			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
 	realToken, selfClosing := p.tok, p.hasSelfClosingToken
 	p.tok = Token{
 		Type: t,
-		Data: data,
+		Data: data, // TODO: also set DataAtom.
 		Attr: attr,
 	}
 	p.hasSelfClosingToken = false
@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {

 	root := &Node{
 		Type: ElementNode,
-		Data: "html",
+		Data: "html", // TODO: also set DataAtom.
 	}
 	p.doc.Add(root)
 	p.oe = nodeStack{root}
--- a/src/pkg/exp/html/parse_test.go
+++ b/src/pkg/exp/html/parse_test.go
@ -8,6 +8,7 @@ import (
 	"bufio"
 	"bytes"
 	"errors"
+	"exp/html/atom"
 	"flag"
 	"fmt"
 	"io"
@ -321,6 +322,7 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
 	} else {
 		contextNode := &Node{
 			Type:     ElementNode,
+			DataAtom: atom.Lookup([]byte(context)),
 			Data:     context,
 		}
 		nodes, err := ParseFragment(strings.NewReader(text), contextNode)
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@ -13,7 +13,7 @@ import (
 )

 // A TokenType is the type of a Token.
-type TokenType int
+type TokenType uint32

 const (
 	// ErrorToken means that an error occurred during tokenization.
@ -66,9 +66,11 @@ type Attribute struct {
 // A Token consists of a TokenType and some Data (tag name for start and end
 // tags, content for text, comments and doctypes). A tag Token may also contain
 // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
-// rather than "a&lt;b").
+// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
+// zero if Data is not a known tag name.
 type Token struct {
 	Type     TokenType
+	DataAtom atom.Atom
 	Data     string
 	Attr     []Attribute
 }
@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
 			key, val, moreAttr = z.TagAttr()
 			attr = append(attr, Attribute{"", atom.String(key), string(val)})
 		}
-		t.Data = atom.String(name)
+		if a := atom.Lookup(name); a != 0 {
+			t.DataAtom, t.Data = a, a.String()
+		} else {
+			t.DataAtom, t.Data = 0, string(name)
+		}
 		t.Attr = attr
 	case EndTagToken:
 		name, _ := z.TagName()
-		t.Data = atom.String(name)
+		if a := atom.Lookup(name); a != 0 {
+			t.DataAtom, t.Data = a, a.String()
+		} else {
+			t.DataAtom, t.Data = 0, string(name)
+		}
 	}
 	return t
 }