mirror of
https://github.com/golang/go
synced 2024-10-03 21:21:22 -06:00
exp/html: make the tokenizer return atoms for tag tokens.
This is part 1 of a 2 part changelist. Part 2 contains the mechanical change to parse.go to compare atoms (ints) instead of strings.

The overall effect of the two changes is:

benchmark                        old ns/op    new ns/op    delta
BenchmarkParser                    4462274      4058254   -9.05%
BenchmarkRawLevelTokenizer          913202       912917   -0.03%
BenchmarkLowLevelTokenizer         1268626      1267836   -0.06%
BenchmarkHighLevelTokenizer        1947305      1968944   +1.11%

R=rsc
CC=andybalholm, golang-dev, r
https://golang.org/cl/6305053
This commit is contained in:
parent
6423682019
commit
cd21eff705
@ -4,8 +4,12 @@
|
|||||||
|
|
||||||
package html
|
package html
|
||||||
|
|
||||||
|
import (
|
||||||
|
"exp/html/atom"
|
||||||
|
)
|
||||||
|
|
||||||
// A NodeType is the type of a Node.
|
// A NodeType is the type of a Node.
|
||||||
type NodeType int
|
type NodeType uint32
|
||||||
|
|
||||||
const (
|
const (
|
||||||
ErrorNode NodeType = iota
|
ErrorNode NodeType = iota
|
||||||
@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
|
|||||||
// A Node consists of a NodeType and some Data (tag name for element nodes,
|
// A Node consists of a NodeType and some Data (tag name for element nodes,
|
||||||
// content for text) and are part of a tree of Nodes. Element nodes may also
|
// content for text) and are part of a tree of Nodes. Element nodes may also
|
||||||
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
|
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
|
||||||
// that it looks like "a<b" rather than "a&lt;b".
|
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
|
||||||
|
// is the atom for Data, or zero if Data is not a known tag name.
|
||||||
//
|
//
|
||||||
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
|
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
|
||||||
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
|
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
|
||||||
@ -34,6 +39,7 @@ type Node struct {
|
|||||||
Parent *Node
|
Parent *Node
|
||||||
Child []*Node
|
Child []*Node
|
||||||
Type NodeType
|
Type NodeType
|
||||||
|
DataAtom atom.Atom
|
||||||
Data string
|
Data string
|
||||||
Namespace string
|
Namespace string
|
||||||
Attr []Attribute
|
Attr []Attribute
|
||||||
@ -83,9 +89,10 @@ func reparentChildren(dst, src *Node) {
|
|||||||
// The clone has no parent and no children.
|
// The clone has no parent and no children.
|
||||||
func (n *Node) clone() *Node {
|
func (n *Node) clone() *Node {
|
||||||
m := &Node{
|
m := &Node{
|
||||||
Type: n.Type,
|
Type: n.Type,
|
||||||
Data: n.Data,
|
DataAtom: n.DataAtom,
|
||||||
Attr: make([]Attribute, len(n.Attr)),
|
Data: n.Data,
|
||||||
|
Attr: make([]Attribute, len(n.Attr)),
|
||||||
}
|
}
|
||||||
copy(m.Attr, n.Attr)
|
copy(m.Attr, n.Attr)
|
||||||
return m
|
return m
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
package html
|
package html
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
a "exp/html/atom"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
|
|||||||
func (p *parser) addElement(tag string, attr []Attribute) {
|
func (p *parser) addElement(tag string, attr []Attribute) {
|
||||||
p.addChild(&Node{
|
p.addChild(&Node{
|
||||||
Type: ElementNode,
|
Type: ElementNode,
|
||||||
Data: tag,
|
Data: tag, // TODO: also set DataAtom.
|
||||||
Attr: attr,
|
Attr: attr,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -310,9 +311,9 @@ findIdenticalElements:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
compareAttributes:
|
compareAttributes:
|
||||||
for _, a := range n.Attr {
|
for _, t0 := range n.Attr {
|
||||||
for _, b := range attr {
|
for _, t1 := range attr {
|
||||||
if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val {
|
if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
|
||||||
// Found a match for this attribute, continue with the next attribute.
|
// Found a match for this attribute, continue with the next attribute.
|
||||||
continue compareAttributes
|
continue compareAttributes
|
||||||
}
|
}
|
||||||
@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
attr := map[string]string{}
|
attr := map[string]string{}
|
||||||
for _, a := range dst.Attr {
|
for _, t := range dst.Attr {
|
||||||
attr[a.Key] = a.Val
|
attr[t.Key] = t.Val
|
||||||
}
|
}
|
||||||
for _, a := range src.Attr {
|
for _, t := range src.Attr {
|
||||||
if _, ok := attr[a.Key]; !ok {
|
if _, ok := attr[t.Key]; !ok {
|
||||||
dst.Attr = append(dst.Attr, a)
|
dst.Attr = append(dst.Attr, t)
|
||||||
attr[a.Key] = a.Val
|
attr[t.Key] = t.Val
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
|
|||||||
p.oe.pop()
|
p.oe.pop()
|
||||||
p.acknowledgeSelfClosingTag()
|
p.acknowledgeSelfClosingTag()
|
||||||
if p.tok.Data == "input" {
|
if p.tok.Data == "input" {
|
||||||
for _, a := range p.tok.Attr {
|
for _, t := range p.tok.Attr {
|
||||||
if a.Key == "type" {
|
if t.Key == "type" {
|
||||||
if strings.ToLower(a.Val) == "hidden" {
|
if strings.ToLower(t.Val) == "hidden" {
|
||||||
// Skip setting framesetOK = false
|
// Skip setting framesetOK = false
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
|
|||||||
action := ""
|
action := ""
|
||||||
prompt := "This is a searchable index. Enter search keywords: "
|
prompt := "This is a searchable index. Enter search keywords: "
|
||||||
attr := []Attribute{{Key: "name", Val: "isindex"}}
|
attr := []Attribute{{Key: "name", Val: "isindex"}}
|
||||||
for _, a := range p.tok.Attr {
|
for _, t := range p.tok.Attr {
|
||||||
switch a.Key {
|
switch t.Key {
|
||||||
case "action":
|
case "action":
|
||||||
action = a.Val
|
action = t.Val
|
||||||
case "name":
|
case "name":
|
||||||
// Ignore the attribute.
|
// Ignore the attribute.
|
||||||
case "prompt":
|
case "prompt":
|
||||||
prompt = a.Val
|
prompt = t.Val
|
||||||
default:
|
default:
|
||||||
attr = append(attr, a)
|
attr = append(attr, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p.acknowledgeSelfClosingTag()
|
p.acknowledgeSelfClosingTag()
|
||||||
@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
|
|||||||
case "style", "script":
|
case "style", "script":
|
||||||
return inHeadIM(p)
|
return inHeadIM(p)
|
||||||
case "input":
|
case "input":
|
||||||
for _, a := range p.tok.Attr {
|
for _, t := range p.tok.Attr {
|
||||||
if a.Key == "type" && strings.ToLower(a.Val) == "hidden" {
|
if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
|
||||||
p.addElement(p.tok.Data, p.tok.Attr)
|
p.addElement(p.tok.Data, p.tok.Attr)
|
||||||
p.oe.pop()
|
p.oe.pop()
|
||||||
return true
|
return true
|
||||||
@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
|
|||||||
// Adjust SVG tag names. The tokenizer lower-cases tag names, but
|
// Adjust SVG tag names. The tokenizer lower-cases tag names, but
|
||||||
// SVG wants e.g. "foreignObject" with a capital second "O".
|
// SVG wants e.g. "foreignObject" with a capital second "O".
|
||||||
if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
|
if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
|
||||||
|
p.tok.DataAtom = a.Lookup([]byte(x))
|
||||||
p.tok.Data = x
|
p.tok.Data = x
|
||||||
}
|
}
|
||||||
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
|
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
|
||||||
@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
|
|||||||
realToken, selfClosing := p.tok, p.hasSelfClosingToken
|
realToken, selfClosing := p.tok, p.hasSelfClosingToken
|
||||||
p.tok = Token{
|
p.tok = Token{
|
||||||
Type: t,
|
Type: t,
|
||||||
Data: data,
|
Data: data, // TODO: also set DataAtom.
|
||||||
Attr: attr,
|
Attr: attr,
|
||||||
}
|
}
|
||||||
p.hasSelfClosingToken = false
|
p.hasSelfClosingToken = false
|
||||||
@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
|
|||||||
|
|
||||||
root := &Node{
|
root := &Node{
|
||||||
Type: ElementNode,
|
Type: ElementNode,
|
||||||
Data: "html",
|
Data: "html", // TODO: also set DataAtom.
|
||||||
}
|
}
|
||||||
p.doc.Add(root)
|
p.doc.Add(root)
|
||||||
p.oe = nodeStack{root}
|
p.oe = nodeStack{root}
|
||||||
|
@ -8,6 +8,7 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"errors"
|
"errors"
|
||||||
|
"exp/html/atom"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@ -320,8 +321,9 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
contextNode := &Node{
|
contextNode := &Node{
|
||||||
Type: ElementNode,
|
Type: ElementNode,
|
||||||
Data: context,
|
DataAtom: atom.Lookup([]byte(context)),
|
||||||
|
Data: context,
|
||||||
}
|
}
|
||||||
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
|
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -13,7 +13,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// A TokenType is the type of a Token.
|
// A TokenType is the type of a Token.
|
||||||
type TokenType int
|
type TokenType uint32
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// ErrorToken means that an error occurred during tokenization.
|
// ErrorToken means that an error occurred during tokenization.
|
||||||
@ -66,11 +66,13 @@ type Attribute struct {
|
|||||||
// A Token consists of a TokenType and some Data (tag name for start and end
|
// A Token consists of a TokenType and some Data (tag name for start and end
|
||||||
// tags, content for text, comments and doctypes). A tag Token may also contain
|
// tags, content for text, comments and doctypes). A tag Token may also contain
|
||||||
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
|
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
|
||||||
// rather than "a&lt;b").
|
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
|
||||||
|
// zero if Data is not a known tag name.
|
||||||
type Token struct {
|
type Token struct {
|
||||||
Type TokenType
|
Type TokenType
|
||||||
Data string
|
DataAtom atom.Atom
|
||||||
Attr []Attribute
|
Data string
|
||||||
|
Attr []Attribute
|
||||||
}
|
}
|
||||||
|
|
||||||
// tagString returns a string representation of a tag Token's Data and Attr.
|
// tagString returns a string representation of a tag Token's Data and Attr.
|
||||||
@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
|
|||||||
key, val, moreAttr = z.TagAttr()
|
key, val, moreAttr = z.TagAttr()
|
||||||
attr = append(attr, Attribute{"", atom.String(key), string(val)})
|
attr = append(attr, Attribute{"", atom.String(key), string(val)})
|
||||||
}
|
}
|
||||||
t.Data = atom.String(name)
|
if a := atom.Lookup(name); a != 0 {
|
||||||
|
t.DataAtom, t.Data = a, a.String()
|
||||||
|
} else {
|
||||||
|
t.DataAtom, t.Data = 0, string(name)
|
||||||
|
}
|
||||||
t.Attr = attr
|
t.Attr = attr
|
||||||
case EndTagToken:
|
case EndTagToken:
|
||||||
name, _ := z.TagName()
|
name, _ := z.TagName()
|
||||||
t.Data = atom.String(name)
|
if a := atom.Lookup(name); a != 0 {
|
||||||
|
t.DataAtom, t.Data = a, a.String()
|
||||||
|
} else {
|
||||||
|
t.DataAtom, t.Data = 0, string(name)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return t
|
return t
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user