mirror of
https://github.com/golang/go
synced 2024-11-24 23:27:57 -07:00
exp/html: make the tokenizer return atoms for tag tokens.
This is part 1 of a 2 part changelist. Part 2 contains the mechanical change to parse.go to compare atoms (ints) instead of strings. The overall effect of the two changes are: benchmark old ns/op new ns/op delta BenchmarkParser 4462274 4058254 -9.05% BenchmarkRawLevelTokenizer 913202 912917 -0.03% BenchmarkLowLevelTokenizer 1268626 1267836 -0.06% BenchmarkHighLevelTokenizer 1947305 1968944 +1.11% R=rsc CC=andybalholm, golang-dev, r https://golang.org/cl/6305053
This commit is contained in:
parent
6423682019
commit
cd21eff705
@ -4,8 +4,12 @@
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"exp/html/atom"
|
||||
)
|
||||
|
||||
// A NodeType is the type of a Node.
|
||||
type NodeType int
|
||||
type NodeType uint32
|
||||
|
||||
const (
|
||||
ErrorNode NodeType = iota
|
||||
@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
|
||||
// A Node consists of a NodeType and some Data (tag name for element nodes,
|
||||
// content for text) and are part of a tree of Nodes. Element nodes may also
|
||||
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
|
||||
// that it looks like "a<b" rather than "a<b".
|
||||
// that it looks like "a<b" rather than "a<b". For element nodes, DataAtom
|
||||
// is the atom for Data, or zero if Data is not a known tag name.
|
||||
//
|
||||
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
|
||||
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
|
||||
@ -34,6 +39,7 @@ type Node struct {
|
||||
Parent *Node
|
||||
Child []*Node
|
||||
Type NodeType
|
||||
DataAtom atom.Atom
|
||||
Data string
|
||||
Namespace string
|
||||
Attr []Attribute
|
||||
@ -84,6 +90,7 @@ func reparentChildren(dst, src *Node) {
|
||||
func (n *Node) clone() *Node {
|
||||
m := &Node{
|
||||
Type: n.Type,
|
||||
DataAtom: n.DataAtom,
|
||||
Data: n.Data,
|
||||
Attr: make([]Attribute, len(n.Attr)),
|
||||
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
a "exp/html/atom"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
|
||||
func (p *parser) addElement(tag string, attr []Attribute) {
|
||||
p.addChild(&Node{
|
||||
Type: ElementNode,
|
||||
Data: tag,
|
||||
Data: tag, // TODO: also set DataAtom.
|
||||
Attr: attr,
|
||||
})
|
||||
}
|
||||
@ -310,9 +311,9 @@ findIdenticalElements:
|
||||
continue
|
||||
}
|
||||
compareAttributes:
|
||||
for _, a := range n.Attr {
|
||||
for _, b := range attr {
|
||||
if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val {
|
||||
for _, t0 := range n.Attr {
|
||||
for _, t1 := range attr {
|
||||
if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
|
||||
// Found a match for this attribute, continue with the next attribute.
|
||||
continue compareAttributes
|
||||
}
|
||||
@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
|
||||
return
|
||||
}
|
||||
attr := map[string]string{}
|
||||
for _, a := range dst.Attr {
|
||||
attr[a.Key] = a.Val
|
||||
for _, t := range dst.Attr {
|
||||
attr[t.Key] = t.Val
|
||||
}
|
||||
for _, a := range src.Attr {
|
||||
if _, ok := attr[a.Key]; !ok {
|
||||
dst.Attr = append(dst.Attr, a)
|
||||
attr[a.Key] = a.Val
|
||||
for _, t := range src.Attr {
|
||||
if _, ok := attr[t.Key]; !ok {
|
||||
dst.Attr = append(dst.Attr, t)
|
||||
attr[t.Key] = t.Val
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
|
||||
p.oe.pop()
|
||||
p.acknowledgeSelfClosingTag()
|
||||
if p.tok.Data == "input" {
|
||||
for _, a := range p.tok.Attr {
|
||||
if a.Key == "type" {
|
||||
if strings.ToLower(a.Val) == "hidden" {
|
||||
for _, t := range p.tok.Attr {
|
||||
if t.Key == "type" {
|
||||
if strings.ToLower(t.Val) == "hidden" {
|
||||
// Skip setting framesetOK = false
|
||||
return true
|
||||
}
|
||||
@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
|
||||
action := ""
|
||||
prompt := "This is a searchable index. Enter search keywords: "
|
||||
attr := []Attribute{{Key: "name", Val: "isindex"}}
|
||||
for _, a := range p.tok.Attr {
|
||||
switch a.Key {
|
||||
for _, t := range p.tok.Attr {
|
||||
switch t.Key {
|
||||
case "action":
|
||||
action = a.Val
|
||||
action = t.Val
|
||||
case "name":
|
||||
// Ignore the attribute.
|
||||
case "prompt":
|
||||
prompt = a.Val
|
||||
prompt = t.Val
|
||||
default:
|
||||
attr = append(attr, a)
|
||||
attr = append(attr, t)
|
||||
}
|
||||
}
|
||||
p.acknowledgeSelfClosingTag()
|
||||
@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
|
||||
case "style", "script":
|
||||
return inHeadIM(p)
|
||||
case "input":
|
||||
for _, a := range p.tok.Attr {
|
||||
if a.Key == "type" && strings.ToLower(a.Val) == "hidden" {
|
||||
for _, t := range p.tok.Attr {
|
||||
if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
|
||||
p.addElement(p.tok.Data, p.tok.Attr)
|
||||
p.oe.pop()
|
||||
return true
|
||||
@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
|
||||
// Adjust SVG tag names. The tokenizer lower-cases tag names, but
|
||||
// SVG wants e.g. "foreignObject" with a capital second "O".
|
||||
if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
|
||||
p.tok.DataAtom = a.Lookup([]byte(x))
|
||||
p.tok.Data = x
|
||||
}
|
||||
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
|
||||
@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
|
||||
realToken, selfClosing := p.tok, p.hasSelfClosingToken
|
||||
p.tok = Token{
|
||||
Type: t,
|
||||
Data: data,
|
||||
Data: data, // TODO: also set DataAtom.
|
||||
Attr: attr,
|
||||
}
|
||||
p.hasSelfClosingToken = false
|
||||
@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
|
||||
|
||||
root := &Node{
|
||||
Type: ElementNode,
|
||||
Data: "html",
|
||||
Data: "html", // TODO: also set DataAtom.
|
||||
}
|
||||
p.doc.Add(root)
|
||||
p.oe = nodeStack{root}
|
||||
|
@ -8,6 +8,7 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"exp/html/atom"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@ -321,6 +322,7 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
|
||||
} else {
|
||||
contextNode := &Node{
|
||||
Type: ElementNode,
|
||||
DataAtom: atom.Lookup([]byte(context)),
|
||||
Data: context,
|
||||
}
|
||||
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
|
||||
|
@ -13,7 +13,7 @@ import (
|
||||
)
|
||||
|
||||
// A TokenType is the type of a Token.
|
||||
type TokenType int
|
||||
type TokenType uint32
|
||||
|
||||
const (
|
||||
// ErrorToken means that an error occurred during tokenization.
|
||||
@ -66,9 +66,11 @@ type Attribute struct {
|
||||
// A Token consists of a TokenType and some Data (tag name for start and end
|
||||
// tags, content for text, comments and doctypes). A tag Token may also contain
|
||||
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
|
||||
// rather than "a<b").
|
||||
// rather than "a<b"). For tag Tokens, DataAtom is the atom for Data, or
|
||||
// zero if Data is not a known tag name.
|
||||
type Token struct {
|
||||
Type TokenType
|
||||
DataAtom atom.Atom
|
||||
Data string
|
||||
Attr []Attribute
|
||||
}
|
||||
@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
|
||||
key, val, moreAttr = z.TagAttr()
|
||||
attr = append(attr, Attribute{"", atom.String(key), string(val)})
|
||||
}
|
||||
t.Data = atom.String(name)
|
||||
if a := atom.Lookup(name); a != 0 {
|
||||
t.DataAtom, t.Data = a, a.String()
|
||||
} else {
|
||||
t.DataAtom, t.Data = 0, string(name)
|
||||
}
|
||||
t.Attr = attr
|
||||
case EndTagToken:
|
||||
name, _ := z.TagName()
|
||||
t.Data = atom.String(name)
|
||||
if a := atom.Lookup(name); a != 0 {
|
||||
t.DataAtom, t.Data = a, a.String()
|
||||
} else {
|
||||
t.DataAtom, t.Data = 0, string(name)
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user