mirror of
https://github.com/golang/go
synced 2024-11-20 11:04:56 -07:00
exp/html/atom: faster, hash-based lookup.
exp/html/atom benchmark:
benchmark                      old ns/op    new ns/op    delta
BenchmarkLookup                   199226        80770  -59.46%

exp/html benchmark:
benchmark                      old ns/op    new ns/op    delta
BenchmarkParser                  4864890      4510834   -7.28%
BenchmarkHighLevelTokenizer      2209192      1969684  -10.84%

benchmark                       old MB/s     new MB/s  speedup
BenchmarkParser                    16.07        17.33    1.08x
BenchmarkHighLevelTokenizer        35.38        39.68    1.12x

R=r
CC=golang-dev
https://golang.org/cl/6261054
This commit is contained in:
parent
baf91c313f
commit
d2a6098e9c
@ -6,33 +6,40 @@
|
||||
// frequently occurring HTML strings: lower-case tag names and attribute keys
|
||||
// such as "p" and "id".
|
||||
//
|
||||
// Sharing an atom's string representation between all elements with the same
|
||||
// tag can result in fewer string allocations when tokenizing and parsing HTML.
|
||||
// Integer comparisons are also generally faster than string comparisons.
|
||||
// Sharing an atom's name between all elements with the same tag can result in
|
||||
// fewer string allocations when tokenizing and parsing HTML. Integer
|
||||
// comparisons are also generally faster than string comparisons.
|
||||
//
|
||||
// An atom's particular code (such as atom.Div == 63) is not guaranteed to
|
||||
// stay the same between versions of this package. Neither is any ordering
|
||||
// guaranteed: whether atom.H1 < atom.H2 may also change. The codes are not
|
||||
// guaranteed to be dense. The only guarantees are that e.g. looking up "div"
|
||||
// will yield atom.Div, calling atom.Div.String will return "div", and
|
||||
// atom.Div != 0.
|
||||
// The value of an atom's particular code is not guaranteed to stay the same
|
||||
// between versions of this package. Neither is any ordering guaranteed:
|
||||
// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
|
||||
// be dense. The only guarantees are that e.g. looking up "div" will yield
|
||||
// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
|
||||
package atom
|
||||
|
||||
// hash returns a 32-bit hash of s. Starting from zero, the running value is
// rotated left by five bits and XORed with each byte of s in turn, so the
// empty slice hashes to zero.
//
// The hash function must be the same as the one used in gen.go
func hash(s []byte) (h uint32) {
	for _, c := range s {
		// h<<5 and h>>27 share no set bits, so OR-ing them is a rotate.
		h = (h<<5 | h>>27) ^ uint32(c)
	}
	return h
}
|
||||
|
||||
// Atom is an integer code for a string. The zero value maps to "".
|
||||
type Atom int
|
||||
|
||||
// String returns the atom's string representation.
|
||||
// String returns the atom's name.
|
||||
func (a Atom) String() string {
|
||||
if a <= 0 || a > max {
|
||||
return ""
|
||||
if 0 <= a && a < Atom(len(table)) {
|
||||
return table[a]
|
||||
}
|
||||
return table[a]
|
||||
return ""
|
||||
}
|
||||
|
||||
// Lookup returns the atom whose name is s. It returns zero if there is no
|
||||
// such atom.
|
||||
func Lookup(s []byte) Atom {
|
||||
if len(s) == 0 {
|
||||
if len(s) == 0 || len(s) > maxLen {
|
||||
return 0
|
||||
}
|
||||
if len(s) == 1 {
|
||||
@ -42,15 +49,25 @@ func Lookup(s []byte) Atom {
|
||||
}
|
||||
return oneByteAtoms[x-'a']
|
||||
}
|
||||
// Binary search for the atom. Unlike sort.Search, this returns early on an exact match.
|
||||
// TODO: this could be optimized further. For example, lo and hi could be initialized
|
||||
// from s[0]. Separately, all the "onxxx" atoms could be moved into their own table.
|
||||
lo, hi := Atom(1), 1+max
|
||||
hs := hash(s)
|
||||
// Binary search for hs. Unlike sort.Search, this returns early on an exact match.
|
||||
// A loop invariant is that len(table[i]) == len(s) for all i in [lo, hi).
|
||||
lo := Atom(loHi[len(s)])
|
||||
hi := Atom(loHi[len(s)+1])
|
||||
for lo < hi {
|
||||
mid := (lo + hi) / 2
|
||||
if cmp := compare(s, table[mid]); cmp == 0 {
|
||||
if ht := hashes[mid]; hs == ht {
|
||||
// The gen.go program ensures that each atom's name has a distinct hash.
|
||||
// However, arbitrary strings may collide with the atom's name. We have
|
||||
// to check that string(s) == table[mid].
|
||||
t := table[mid]
|
||||
for i, si := range s {
|
||||
if si != t[i] {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
return mid
|
||||
} else if cmp > 0 {
|
||||
} else if hs > ht {
|
||||
lo = mid + 1
|
||||
} else {
|
||||
hi = mid
|
||||
@ -67,22 +84,3 @@ func String(s []byte) string {
|
||||
}
|
||||
return string(s)
|
||||
}
|
||||
|
||||
// compare orders the byte slice s against the string t in the manner of
// bytes.Compare, but with one []byte argument and one string argument. The
// result is negative, zero or positive rather than strictly -1, 0 or +1:
// when one argument is a prefix of the other, the difference in lengths is
// returned.
func compare(s []byte, t string) int {
	// Only the common prefix can be compared byte by byte.
	limit := len(t)
	if len(s) < limit {
		limit = len(s)
	}
	for i := 0; i < limit; i++ {
		if s[i] == t[i] {
			continue
		}
		if s[i] > t[i] {
			return +1
		}
		return -1
	}
	return len(s) - len(t)
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
package atom
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@ -42,6 +43,8 @@ func TestMisses(t *testing.T) {
|
||||
"h7",
|
||||
"onClick",
|
||||
"λ",
|
||||
// The following string has the same hash (0xa1d7fab7) as "onmouseover".
|
||||
"\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7",
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
got := Lookup([]byte(tc))
|
||||
@ -50,3 +53,21 @@ func TestMisses(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkLookup(b *testing.B) {
|
||||
sortedTable := make([]string, len(table))
|
||||
copy(sortedTable, table[:])
|
||||
sort.Strings(sortedTable)
|
||||
|
||||
x := make([][]byte, 1000)
|
||||
for i := range x {
|
||||
x[i] = []byte(sortedTable[i%len(sortedTable)])
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, s := range x {
|
||||
Lookup(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -13,9 +13,30 @@ package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// hash returns a 32-bit hash of the bytes of s. Starting from zero, the
// running value is rotated left by five bits and XORed with each byte in
// turn, so the empty string hashes to zero.
//
// The hash function must be the same as the one used in atom.go
func hash(s string) (h uint32) {
	// Range over the bytes, not the runes, of s.
	for _, c := range []byte(s) {
		// h<<5 and h>>27 share no set bits, so OR-ing them is a rotate.
		h = (h<<5 | h>>27) ^ uint32(c)
	}
	return h
}
|
||||
|
||||
// lhash returns a uint64 whose high 32 bits are len(s) and whose low 32 bits
|
||||
// are hash(s).
|
||||
func lhash(s string) uint64 {
|
||||
return uint64(len(s))<<32 | uint64(hash(s))
|
||||
}
|
||||
|
||||
type byLhash []string
|
||||
|
||||
func (b byLhash) Len() int { return len(b) }
|
||||
func (b byLhash) Less(i, j int) bool { return lhash(b[i]) < lhash(b[j]) }
|
||||
func (b byLhash) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
||||
// identifier converts s to a Go exported identifier.
|
||||
// It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
|
||||
func identifier(s string) string {
|
||||
@ -36,43 +57,84 @@ func identifier(s string) string {
|
||||
}
|
||||
|
||||
func main() {
|
||||
m := map[string]bool{
|
||||
// Construct a list of atoms, sorted by their lhash.
|
||||
m0 := map[string]bool{
|
||||
"": true,
|
||||
}
|
||||
for _, list := range [][]string{elements, attributes, eventHandlers, extra} {
|
||||
for _, s := range list {
|
||||
m[s] = true
|
||||
m0[s] = true
|
||||
}
|
||||
}
|
||||
atoms := make([]string, 0, len(m))
|
||||
for s := range m {
|
||||
atoms := make([]string, 0, len(m0))
|
||||
for s := range m0 {
|
||||
atoms = append(atoms, s)
|
||||
}
|
||||
sort.Strings(atoms)
|
||||
sort.Sort(byLhash(atoms))
|
||||
|
||||
// Calculate the magic constants to output as table.go.
|
||||
byInt := []string{}
|
||||
byStr := map[string]int{}
|
||||
ident := []string{}
|
||||
lhashes := []uint64{}
|
||||
maxLen := 0
|
||||
for i, s := range atoms {
|
||||
byInt = append(byInt, s)
|
||||
byStr[s] = i
|
||||
ident = append(ident, identifier(s))
|
||||
lhashes = append(lhashes, lhash(s))
|
||||
if maxLen < len(s) {
|
||||
maxLen = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("package atom\n\nconst (\n")
|
||||
for i, _ := range byInt {
|
||||
if i == 0 {
|
||||
continue
|
||||
// Check for hash collisions.
|
||||
m1 := map[uint64]int{}
|
||||
for i, h := range lhashes {
|
||||
h &= 1<<32 - 1
|
||||
if j, ok := m1[h]; ok {
|
||||
fmt.Fprintf(os.Stderr, "hash collision at 0x%08x: %q, %q\n", h, byInt[i], byInt[j])
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Printf("\t%s Atom = %d\n", ident[i], i)
|
||||
m1[h] = i
|
||||
}
|
||||
fmt.Printf(")\n\n")
|
||||
fmt.Printf("const max Atom = %d\n\n", len(byInt)-1)
|
||||
fmt.Printf("var table = []string{\n")
|
||||
|
||||
// Generate the Go code.
|
||||
fmt.Printf("package atom\n\nconst (\n")
|
||||
{
|
||||
// Print the Atoms in alphabetical order.
|
||||
lines := []string{}
|
||||
for i, _ := range byInt {
|
||||
if i == 0 {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, fmt.Sprintf("\t%s Atom = %d", ident[i], i))
|
||||
}
|
||||
sort.Strings(lines)
|
||||
for _, line := range lines {
|
||||
fmt.Println(line)
|
||||
}
|
||||
fmt.Printf(")\n\n")
|
||||
}
|
||||
fmt.Printf("const maxLen = %d\n\n", maxLen)
|
||||
fmt.Printf("var table = [...]string{\n")
|
||||
for _, s := range byInt {
|
||||
fmt.Printf("\t%q,\n", s)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
fmt.Printf("var hashes = [...]uint32{\n")
|
||||
for _, s := range byInt {
|
||||
fmt.Printf("\t0x%08x,\n", hash(s))
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
fmt.Printf("var loHi = [maxLen + 2]uint16{\n")
|
||||
for n := 0; n <= maxLen; n++ {
|
||||
fmt.Printf("\t%d,\n", sort.Search(len(byInt), func(i int) bool {
|
||||
return int(lhashes[i]>>32) >= n
|
||||
}))
|
||||
}
|
||||
fmt.Printf("\t%d,\n", len(byInt))
|
||||
fmt.Printf("}\n\n")
|
||||
fmt.Printf("var oneByteAtoms = [26]Atom{\n")
|
||||
for i := 'a'; i <= 'z'; i++ {
|
||||
val := "0"
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user