exp/html/atom: faster Lookup with smaller tables

Use perfect cuckoo hash, to avoid binary search.
Define Atom bits as offset+len in long string instead of enumeration,
to avoid string headers.

Before: 1909 string bytes + 6060 tables = 7969 total data
After:  1406 string bytes + 2048 tables = 3454 total data

benchmark          old ns/op    new ns/op    delta
BenchmarkLookup        83878        64681  -22.89%

R=nigeltao, r
CC=golang-dev
https://golang.org/cl/6262051
2024-11-22 04:24:39 -07:00 · 2012-06-02 22:43:11 -04:00 · 2012-06-02 22:43:11 -04:00 · 192550592a
commit 192550592a
parent 911f802b37
5 changed files with 1216 additions and 1081 deletions
--- a/src/pkg/exp/html/atom/atom.go
+++ b/src/pkg/exp/html/atom/atom.go
@@ -15,69 +15,64 @@
 // whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
 // be dense. The only guarantees are that e.g. looking up "div" will yield
 // atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
+//
+// TODO(rsc): When this package moves out of exp we need to freeze atom values
+// across releases.
 package atom

-// The hash function must be the same as the one used in gen.go
-func hash(s []byte) (h uint32) {
-	for i := 0; i < len(s); i++ {
-		h = h<<5 ^ h>>27 ^ uint32(s[i])
+// Atom is an integer code for a string. The zero value maps to "".
+type Atom uint32 // bits 8 and up: byte offset into atomText; low 8 bits: length in bytes
+
+// String returns the atom's name, or "" when a's offset+length runs past the end of atomText.
+func (a Atom) String() string {
+	start := uint32(a >> 8) // byte offset of the name within atomText
+	n := uint32(a & 0xff)   // length of the name in bytes
+	if start+n > uint32(len(atomText)) {
+		return "" // a does not describe a range inside atomText
+	}
+	return atomText[start : start+n]
+}
+
+func (a Atom) string() string { // same slice as String, but without the bounds guard
+	return atomText[a>>8 : a>>8+a&0xff] // >> and & bind tighter than +, so the end index is (a>>8)+(a&0xff)
+}
+
+// fnv computes the FNV hash of s (xor each byte in, then multiply) with an arbitrary starting value h.
+func fnv(h uint32, s []byte) uint32 {
+	for i := range s {
+		h ^= uint32(s[i])
+		h *= 16777619 // 32-bit FNV prime, 0x01000193; uint32 multiplication wraps
 	}
 	return h
 }

-// Atom is an integer code for a string. The zero value maps to "".
-type Atom int
-
-// String returns the atom's name.
-func (a Atom) String() string {
-	if 0 <= a && a < Atom(len(table)) {
-		return table[a]
+func match(s string, t []byte) bool { // reports whether s[i] == t[i] for every index i of t
+	for i, c := range t {
+		if s[i] != c { // s is indexed by t's positions, so callers must ensure len(s) >= len(t)
+			return false
+		}
 	}
-	return ""
+	return true
 }

 // Lookup returns the atom whose name is s. It returns zero if there is no
 // such atom.
 func Lookup(s []byte) Atom {
-	if len(s) == 0 || len(s) > maxLen {
+	if len(s) == 0 || len(s) > maxAtomLen {
 		return 0
 	}
-	if len(s) == 1 {
-		x := s[0]
-		if x < 'a' || x > 'z' {
-			return 0
-		}
-		return oneByteAtoms[x-'a']
+	h := fnv(hash0, s) // hash0 is the seed gen.go emitted alongside the table
+	if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { // first candidate slot: low bits of h (the mask relies on len(table) being a power of two)
+		return a
 	}
-	hs := hash(s)
-	// Binary search for hs. Unlike sort.Search, this returns early on an exact match.
-	// A loop invariant is that len(table[i]) == len(s) for all i in [lo, hi).
-	lo := Atom(loHi[len(s)])
-	hi := Atom(loHi[len(s)+1])
-	for lo < hi {
-		mid := (lo + hi) / 2
-		if ht := hashes[mid]; hs == ht {
-			// The gen.go program ensures that each atom's name has a distinct hash.
-			// However, arbitrary strings may collide with the atom's name. We have
-			// to check that string(s) == table[mid].
-			t := table[mid]
-			for i, si := range s {
-				if si != t[i] {
-					return 0
-				}
-			}
-			return mid
-		} else if hs > ht {
-			lo = mid + 1
-		} else {
-			hi = mid
-		}
+	if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { // second candidate slot: bits 16 and up of h
+		return a
 	}
 	return 0
 }

 // String returns a string whose contents are equal to s. In that sense, it is
-// equivalent to string(s), but may be more efficient.
+// equivalent to string(s) but may be more efficient.
 func String(s []byte) string {
 	if a := Lookup(s); a != 0 {
 		return a.String()
--- a/src/pkg/exp/html/atom/atom_test.go
+++ b/src/pkg/exp/html/atom/atom_test.go
@@ -9,11 +9,22 @@ import (
 	"testing"
 )

+func TestKnown(t *testing.T) { // every name in testAtomList must survive Lookup followed by String
+	for _, s := range testAtomList {
+		if atom := Lookup([]byte(s)); atom.String() != s {
+			t.Errorf("Lookup(%q) = %#x (%q)", s, uint32(atom), atom.String())
+		}
+	}
+}
+
 func TestHits(t *testing.T) {
-	for i, s := range table {
-		got := Lookup([]byte(s))
-		if got != Atom(i) {
-			t.Errorf("Lookup(%q): got %d, want %d", s, got, i)
+	for _, a := range table {
+		if a == 0 { // zero entries are unused hash slots, not atoms
+			continue
+		}
+		got := Lookup([]byte(a.String()))
+		if got != a { // looking up an atom's own name must return that exact atom value
+			t.Errorf("Lookup(%q) = %#x, want %#x", a.String(), uint32(got), uint32(a))
 		}
 	}
 }
@@ -55,8 +66,12 @@ func TestMisses(t *testing.T) {
 }

 func BenchmarkLookup(b *testing.B) {
-	sortedTable := make([]string, len(table))
-	copy(sortedTable, table[:])
+	sortedTable := make([]string, 0, len(table))
+	for _, a := range table {
+		if a != 0 {
+			sortedTable = append(sortedTable, a.String())
+		}
+	}
 	sort.Strings(sortedTable)

 	x := make([][]byte, 1000)
--- a/src/pkg/exp/html/atom/gen.go
+++ b/src/pkg/exp/html/atom/gen.go
@@ -6,37 +6,21 @@

 package main

-// This program generates table.go
+// This program generates table.go and table_test.go.
 // Invoke as
 //
 //	go run gen.go |gofmt >table.go
+//	go run gen.go -test |gofmt >table_test.go

 import (
+	"flag"
 	"fmt"
+	"math/rand"
 	"os"
 	"sort"
+	"strings"
 )

-// The hash function must be the same as the one used in atom.go
-func hash(s string) (h uint32) {
-	for i := 0; i < len(s); i++ {
-		h = h<<5 ^ h>>27 ^ uint32(s[i])
-	}
-	return h
-}
-
-// lhash returns a uint64 whose high 32 bits are len(s) and whose low 32 bits
-// are hash(s).
-func lhash(s string) uint64 {
-	return uint64(len(s))<<32 | uint64(hash(s))
-}
-
-type byLhash []string
-
-func (b byLhash) Len() int           { return len(b) }
-func (b byLhash) Less(i, j int) bool { return lhash(b[i]) < lhash(b[j]) }
-func (b byLhash) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
-
 // identifier converts s to a Go exported identifier.
 // It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
 func identifier(s string) string {
@@ -56,94 +40,247 @@ func identifier(s string) string {
 	return string(b)
 }

+var test = flag.Bool("test", false, "generate table_test.go")
+
 func main() {
-	// Construct a list of atoms, sorted by their lhash.
-	m0 := map[string]bool{
-		"": true,
-	}
-	for _, list := range [][]string{elements, attributes, eventHandlers, extra} {
-		for _, s := range list {
-			m0[s] = true
-		}
-	}
-	atoms := make([]string, 0, len(m0))
-	for s := range m0 {
-		atoms = append(atoms, s)
-	}
-	sort.Sort(byLhash(atoms))
+	flag.Parse()

-	// Calculate the magic constants to output as table.go.
-	byInt := []string{}
-	byStr := map[string]int{}
-	ident := []string{}
-	lhashes := []uint64{}
+	var all []string
+	all = append(all, elements...)
+	all = append(all, attributes...)
+	all = append(all, eventHandlers...)
+	all = append(all, extra...)
+	sort.Strings(all) // sorting makes equal names adjacent for the uniq pass below
+
+	if *test { // -test: print the sorted list (duplicates still included) and stop before building any table
+		fmt.Printf("// generated by go run gen.go -test; DO NOT EDIT\n\n")
+		fmt.Printf("package atom\n\n")
+		fmt.Printf("var testAtomList = []string{\n")
+		for _, s := range all {
+			fmt.Printf("\t%q,\n", s)
+		}
+		fmt.Printf("}\n")
+		return
+	}
+
+	// uniq - lists have dups
+	// compute max len too
 	maxLen := 0
-	for i, s := range atoms {
-		byInt = append(byInt, s)
-		byStr[s] = i
-		ident = append(ident, identifier(s))
-		lhashes = append(lhashes, lhash(s))
-		if maxLen < len(s) {
-			maxLen = len(s)
+	w := 0
+	for _, s := range all {
+		if w == 0 || all[w-1] != s {
+			if maxLen < len(s) {
+				maxLen = len(s)
+			}
+			all[w] = s
+			w++
+		}
+	}
+	all = all[:w]
+
+	// Find hash that minimizes table size.
+	var best *table
+	for i := 0; i < 1000000; i++ {
+		if best != nil && 1<<(best.k-1) < len(all) { // a table half the size of best has fewer slots than strings, so best cannot shrink
+			break
+		}
+		h := rand.Uint32()
+		for k := uint(0); k <= 16; k++ { // table.hash takes its second index from h>>16, which has only 16 bits
+			if best != nil && k >= best.k { // only sizes strictly smaller than the current best are tried
+				break
+			}
+			var t table
+			if t.init(h, k, all) {
+				best = &t
+				break
+			}
+		}
+	}
+	if best == nil {
+		fmt.Fprintf(os.Stderr, "failed to construct string table\n")
+		os.Exit(1)
+	}
+
+	// Lay out strings, using overlaps when possible.
+	layout := append([]string{}, all...) // a copy: entries are blanked and merged below while all stays intact
+
+	// Remove strings that are substrings of other strings
+	for changed := true; changed; {
+		changed = false
+		for i, s := range layout {
+			if s == "" {
+				continue
+			}
+			for j, t := range layout {
+				if i != j && t != "" && strings.Contains(s, t) {
+					changed = true
+					layout[j] = ""
+				}
+			}
 		}
 	}

-	// Check for hash collisions.
-	m1 := map[uint64]int{}
-	for i, h := range lhashes {
-		h &= 1<<32 - 1
-		if j, ok := m1[h]; ok {
-			fmt.Fprintf(os.Stderr, "hash collision at 0x%08x: %q, %q\n", h, byInt[i], byInt[j])
-			os.Exit(1)
+	// Join strings where one suffix matches another prefix.
+	for {
+		// Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
+		// maximizing overlap length k.
+		besti := -1
+		bestj := -1
+		bestk := 0
+		for i, s := range layout {
+			if s == "" {
+				continue
+			}
+			for j, t := range layout {
+				if i == j {
+					continue
+				}
+				for k := bestk + 1; k <= len(s) && k <= len(t); k++ { // starts above bestk: only longer overlaps can replace the current best
+					if s[len(s)-k:] == t[:k] {
+						besti = i
+						bestj = j
+						bestk = k
+					}
+				}
+			}
 		}
-		m1[h] = i
+		if bestk > 0 {
+			layout[besti] += layout[bestj][bestk:] // append only the part of j that does not overlap i
+			layout[bestj] = ""
+			continue
+		}
+		break
+	}
+
+	text := strings.Join(layout, "")
+
+	atom := map[string]uint32{}
+	for _, s := range all {
+		off := strings.Index(text, s) // first occurrence of s in the packed text
+		if off < 0 {
+			panic("lost string " + s)
+		}
+		atom[s] = uint32(off<<8 | len(s)) // offset in the upper bits, length in the low 8 bits
 	}

 	// Generate the Go code.
+	fmt.Printf("// generated by go run gen.go; DO NOT EDIT\n\n")
 	fmt.Printf("package atom\n\nconst (\n")
-	{
-		// Print the Atoms in alphabetical order.
-		lines := []string{}
-		for i, _ := range byInt {
-			if i == 0 {
-				continue
-			}
-			lines = append(lines, fmt.Sprintf("\t%s Atom = %d", ident[i], i))
+	for _, s := range all {
+		fmt.Printf("\t%s Atom = %#x\n", identifier(s), atom[s])
+	}
+	fmt.Printf(")\n\n")
+
+	fmt.Printf("const hash0 = %#x\n\n", best.h0)
+	fmt.Printf("const maxAtomLen = %d\n\n", maxLen)
+
+	fmt.Printf("var table = [1<<%d]Atom{\n", best.k)
+	for i, s := range best.tab {
+		if s == "" {
+			continue
 		}
-		sort.Strings(lines)
-		for _, line := range lines {
-			fmt.Println(line)
+		fmt.Printf("\t%#x: %#x, // %s\n", i, atom[s], s)
+	}
+	fmt.Printf("}\n")
+	datasize := (1 << best.k) * 4 // 4 bytes for each of the 1<<k table entries
+
+	fmt.Printf("const atomText =\n")
+	textsize := len(text) // recorded before the loop below consumes text
+	for len(text) > 60 {
+		fmt.Printf("\t%q +\n", text[:60])
+		text = text[60:]
+	}
+	fmt.Printf("\t%q\n\n", text)
+
+	fmt.Fprintf(os.Stderr, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
+}
+
+type byLen []string // string slice with Len/Less/Swap methods; Less orders longer strings first
+
+func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
+func (x byLen) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
+func (x byLen) Len() int           { return len(x) }
+
+// fnv computes the FNV hash of s (xor each byte in, then multiply) with an arbitrary starting value h.
+func fnv(h uint32, s string) uint32 {
+	for i := 0; i < len(s); i++ {
+		h ^= uint32(s[i])
+		h *= 16777619 // 32-bit FNV prime, 0x01000193; uint32 multiplication wraps
+	}
+	return h
+}
+
+// A table represents an attempt at constructing the lookup table.
+// The lookup table uses cuckoo hashing, meaning that each string
+// can be found in one of two positions.
+type table struct {
+	h0   uint32   // starting value handed to fnv
+	k    uint     // log2 of the number of slots
+	mask uint32   // 1<<k - 1; reduces a hash to a slot index
+	tab  []string // the 1<<k slots; "" marks an empty slot
+}
+
+// hash returns the two slot indexes for s, both taken from one fnv hash seeded with t.h0.
+func (t *table) hash(s string) (h1, h2 uint32) {
+	h := fnv(t.h0, s)
+	h1 = h & t.mask         // low k bits of the hash
+	h2 = (h >> 16) & t.mask // k bits starting at bit 16
+	return
+}
+
+// init initializes the table with the given parameters.
+// h0 is the initial hash value,
+// k is the number of bits of hash value to use, and
+// x is the list of strings to store in the table.
+// init returns false if the table cannot be constructed.
+func (t *table) init(h0 uint32, k uint, x []string) bool {
+	t.h0 = h0
+	t.k = k
+	t.tab = make([]string, 1<<k) // 1<<k slots, all initially ""
+	t.mask = 1<<k - 1
+	for _, s := range x {
+		if !t.insert(s) {
+			return false // the first string that cannot be placed aborts the attempt
 		}
-		fmt.Printf(")\n\n")
 	}
-	fmt.Printf("const maxLen = %d\n\n", maxLen)
-	fmt.Printf("var table = [...]string{\n")
-	for _, s := range byInt {
-		fmt.Printf("\t%q,\n", s)
+	return true
+}
+
+// insert inserts s in the table, trying its two slots empty-first and then by eviction; it reports success.
+func (t *table) insert(s string) bool {
+	h1, h2 := t.hash(s)
+	if t.tab[h1] == "" {
+		t.tab[h1] = s
+		return true
 	}
-	fmt.Printf("}\n\n")
-	fmt.Printf("var hashes = [...]uint32{\n")
-	for _, s := range byInt {
-		fmt.Printf("\t0x%08x,\n", hash(s))
+	if t.tab[h2] == "" {
+		t.tab[h2] = s
+		return true
 	}
-	fmt.Printf("}\n\n")
-	fmt.Printf("var loHi = [maxLen + 2]uint16{\n")
-	for n := 0; n <= maxLen; n++ {
-		fmt.Printf("\t%d,\n", sort.Search(len(byInt), func(i int) bool {
-			return int(lhashes[i]>>32) >= n
-		}))
+	if t.push(h1, 0) { // both slots taken: try to move the occupant of h1 to its other slot
+		t.tab[h1] = s
+		return true
 	}
-	fmt.Printf("\t%d,\n", len(byInt))
-	fmt.Printf("}\n\n")
-	fmt.Printf("var oneByteAtoms = [26]Atom{\n")
-	for i := 'a'; i <= 'z'; i++ {
-		val := "0"
-		if x := byStr[string(i)]; x != 0 {
-			val = ident[x]
-		}
-		fmt.Printf("\t%s,\n", val)
+	if t.push(h2, 0) { // otherwise try the same with the occupant of h2
+		t.tab[h2] = s
+		return true
 	}
-	fmt.Printf("}\n\n")
+	return false
+}
+
+// push attempts to push aside the entry in slot i by copying it to its other slot; slot i itself is left for the caller to overwrite.
+func (t *table) push(i uint32, depth int) bool {
+	if depth > len(t.tab) { // recursion limit: give up on eviction chains longer than the table
+		return false
+	}
+	s := t.tab[i]
+	h1, h2 := t.hash(s)
+	j := h1 + h2 - i // i is one of h1 and h2, so j is the other one
+	if t.tab[j] != "" && !t.push(j, depth+1) {
+		return false
+	}
+	t.tab[j] = s
+	return true
 }

 // The lists of element names and attribute keys were taken from
--- a/src/pkg/exp/html/atom/table.go
+++ b/src/pkg/exp/html/atom/table.go
--- a/src/pkg/exp/html/atom/table_test.go
+++ b/src/pkg/exp/html/atom/table_test.go
@@ -0,0 +1,309 @@
+// generated by go run gen.go -test; DO NOT EDIT
+
+package atom
+
+var testAtomList = []string{
+	"a",
+	"abbr",
+	"accept",
+	"accept-charset",
+	"accesskey",
+	"action",
+	"address",
+	"align",
+	"alt",
+	"annotation",
+	"applet",
+	"area",
+	"article",
+	"aside",
+	"async",
+	"audio",
+	"autocomplete",
+	"autofocus",
+	"autoplay",
+	"b",
+	"base",
+	"bdi",
+	"bdo",
+	"blockquote",
+	"body",
+	"border",
+	"br",
+	"button",
+	"canvas",
+	"caption",
+	"center",
+	"challenge",
+	"charset",
+	"checked",
+	"cite",
+	"cite",
+	"class",
+	"code",
+	"col",
+	"colgroup",
+	"color",
+	"cols",
+	"colspan",
+	"command",
+	"command",
+	"content",
+	"contenteditable",
+	"contextmenu",
+	"controls",
+	"coords",
+	"crossorigin",
+	"data",
+	"data",
+	"datalist",
+	"datetime",
+	"dd",
+	"default",
+	"defer",
+	"del",
+	"details",
+	"dfn",
+	"dialog",
+	"dir",
+	"dirname",
+	"disabled",
+	"div",
+	"dl",
+	"download",
+	"draggable",
+	"dropzone",
+	"dt",
+	"em",
+	"embed",
+	"enctype",
+	"fieldset",
+	"figcaption",
+	"figure",
+	"font",
+	"footer",
+	"for",
+	"form",
+	"form",
+	"formaction",
+	"formenctype",
+	"formmethod",
+	"formnovalidate",
+	"formtarget",
+	"frame",
+	"frameset",
+	"h1",
+	"h2",
+	"h3",
+	"h4",
+	"h5",
+	"h6",
+	"head",
+	"header",
+	"headers",
+	"height",
+	"hgroup",
+	"hidden",
+	"high",
+	"hr",
+	"href",
+	"hreflang",
+	"html",
+	"http-equiv",
+	"i",
+	"icon",
+	"id",
+	"iframe",
+	"img",
+	"inert",
+	"input",
+	"ins",
+	"ismap",
+	"itemid",
+	"itemprop",
+	"itemref",
+	"itemscope",
+	"itemtype",
+	"kbd",
+	"keygen",
+	"keytype",
+	"kind",
+	"label",
+	"label",
+	"lang",
+	"legend",
+	"li",
+	"link",
+	"list",
+	"loop",
+	"low",
+	"manifest",
+	"map",
+	"mark",
+	"max",
+	"maxlength",
+	"media",
+	"mediagroup",
+	"menu",
+	"meta",
+	"meter",
+	"method",
+	"min",
+	"multiple",
+	"muted",
+	"name",
+	"nav",
+	"nobr",
+	"noscript",
+	"novalidate",
+	"object",
+	"ol",
+	"onabort",
+	"onafterprint",
+	"onbeforeprint",
+	"onbeforeunload",
+	"onblur",
+	"oncancel",
+	"oncanplay",
+	"oncanplaythrough",
+	"onchange",
+	"onclick",
+	"onclose",
+	"oncontextmenu",
+	"oncuechange",
+	"ondblclick",
+	"ondrag",
+	"ondragend",
+	"ondragenter",
+	"ondragleave",
+	"ondragover",
+	"ondragstart",
+	"ondrop",
+	"ondurationchange",
+	"onemptied",
+	"onended",
+	"onerror",
+	"onfocus",
+	"onhashchange",
+	"oninput",
+	"oninvalid",
+	"onkeydown",
+	"onkeypress",
+	"onkeyup",
+	"onload",
+	"onloadeddata",
+	"onloadedmetadata",
+	"onloadstart",
+	"onmessage",
+	"onmousedown",
+	"onmousemove",
+	"onmouseout",
+	"onmouseover",
+	"onmouseup",
+	"onmousewheel",
+	"onoffline",
+	"ononline",
+	"onpagehide",
+	"onpageshow",
+	"onpause",
+	"onplay",
+	"onplaying",
+	"onpopstate",
+	"onprogress",
+	"onratechange",
+	"onreset",
+	"onresize",
+	"onscroll",
+	"onseeked",
+	"onseeking",
+	"onselect",
+	"onshow",
+	"onstalled",
+	"onstorage",
+	"onsubmit",
+	"onsuspend",
+	"ontimeupdate",
+	"onunload",
+	"onvolumechange",
+	"onwaiting",
+	"open",
+	"optgroup",
+	"optimum",
+	"option",
+	"output",
+	"p",
+	"param",
+	"pattern",
+	"ping",
+	"placeholder",
+	"poster",
+	"pre",
+	"preload",
+	"progress",
+	"q",
+	"radiogroup",
+	"readonly",
+	"rel",
+	"required",
+	"reversed",
+	"rows",
+	"rowspan",
+	"rp",
+	"rt",
+	"ruby",
+	"s",
+	"samp",
+	"sandbox",
+	"scope",
+	"scoped",
+	"script",
+	"seamless",
+	"section",
+	"select",
+	"selected",
+	"shape",
+	"size",
+	"sizes",
+	"small",
+	"source",
+	"span",
+	"span",
+	"spellcheck",
+	"src",
+	"srcdoc",
+	"srclang",
+	"start",
+	"step",
+	"strong",
+	"style",
+	"style",
+	"sub",
+	"summary",
+	"sup",
+	"tabindex",
+	"table",
+	"target",
+	"tbody",
+	"td",
+	"textarea",
+	"tfoot",
+	"th",
+	"thead",
+	"time",
+	"title",
+	"title",
+	"tr",
+	"track",
+	"translate",
+	"type",
+	"typemustmatch",
+	"u",
+	"ul",
+	"usemap",
+	"value",
+	"var",
+	"video",
+	"wbr",
+	"width",
+	"wrap",
+}