1
0
mirror of https://github.com/golang/go synced 2024-11-25 08:47:56 -07:00

exp/html/atom: new package.

50% fewer mallocs in HTML tokenization, resulting in 25% fewer mallocs
in parsing go1.html.

Making the parser use integer comparisons instead of string comparisons
will be a follow-up CL, to be co-ordinated with Andy Balholm's work.

exp/html benchmarks before/after:

BenchmarkParser	     500	   4754294 ns/op	  16.44 MB/s
        parse_test.go:409: 500 iterations, 14651 mallocs per iteration
BenchmarkRawLevelTokenizer	    2000	    903481 ns/op	  86.51 MB/s
        token_test.go:678: 2000 iterations, 28 mallocs per iteration
BenchmarkLowLevelTokenizer	    2000	   1260485 ns/op	  62.01 MB/s
        token_test.go:678: 2000 iterations, 41 mallocs per iteration
BenchmarkHighLevelTokenizer	    1000	   2165964 ns/op	  36.09 MB/s
        token_test.go:678: 1000 iterations, 6616 mallocs per iteration

BenchmarkParser	     500	   4664912 ns/op	  16.76 MB/s
        parse_test.go:409: 500 iterations, 11266 mallocs per iteration
BenchmarkRawLevelTokenizer	    2000	    903065 ns/op	  86.55 MB/s
        token_test.go:678: 2000 iterations, 28 mallocs per iteration
BenchmarkLowLevelTokenizer	    2000	   1260032 ns/op	  62.03 MB/s
        token_test.go:678: 2000 iterations, 41 mallocs per iteration
BenchmarkHighLevelTokenizer	    1000	   2143356 ns/op	  36.47 MB/s
        token_test.go:678: 1000 iterations, 3231 mallocs per iteration

R=r, rsc, rogpeppe
CC=andybalholm, golang-dev
https://golang.org/cl/6255062
This commit is contained in:
Nigel Tao 2012-05-31 15:37:18 +10:00
parent 43cf5505fc
commit bb4a817a92
5 changed files with 1178 additions and 3 deletions

View File

@ -0,0 +1,88 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package atom provides integer codes (also known as atoms) for a fixed set of
// frequently occurring HTML strings: lower-case tag names and attribute keys
// such as "p" and "id".
//
// Sharing an atom's string representation between all elements with the same
// tag can result in fewer string allocations when tokenizing and parsing HTML.
// Integer comparisons are also generally faster than string comparisons.
//
// An atom's particular code (such as atom.Div == 63) is not guaranteed to
// stay the same between versions of this package. Neither is any ordering
// guaranteed: whether atom.H1 < atom.H2 may also change. The codes are not
// guaranteed to be dense. The only guarantees are that e.g. looking up "div"
// will yield atom.Div, calling atom.Div.String will return "div", and
// atom.Div != 0.
package atom
// Atom is an integer code for a string. The zero value maps to "".
type Atom int
// String returns the atom's string representation.
func (a Atom) String() string {
if a <= 0 || a > max {
return ""
}
return table[a]
}
// Lookup returns the atom whose name is s, or zero when s does not name
// any atom.
func Lookup(s []byte) Atom {
	switch len(s) {
	case 0:
		return 0
	case 1:
		// Single-letter names are resolved through a direct-indexed table.
		c := s[0]
		if 'a' <= c && c <= 'z' {
			return oneByteAtoms[c-'a']
		}
		return 0
	}

	// Binary search over table[1:]. Unlike sort.Search, this stops as soon
	// as an exact match is seen.
	// TODO: this could be optimized further. For example, the bounds could
	// be initialized from s[0]. Separately, all the "onxxx" atoms could be
	// moved into their own table.
	first, limit := Atom(1), 1+max
	for first < limit {
		probe := (first + limit) / 2
		switch order := compare(s, table[probe]); {
		case order == 0:
			return probe
		case order > 0:
			first = probe + 1
		default:
			limit = probe
		}
	}
	return 0
}
// String returns a string with the same contents as s. The result equals
// string(s); when s names an atom, the atom's string is returned instead of
// converting s, which may be more efficient.
func String(s []byte) string {
	a := Lookup(s)
	if a == 0 {
		return string(s)
	}
	return a.String()
}
// compare orders the byte slice s against the string t, in the manner of
// bytes.Compare but with mixed argument types. The result is negative, zero
// or positive rather than strictly -1, 0 or +1: when one argument is a
// prefix of the other the result is the difference of their lengths.
func compare(s []byte, t string) int {
	shorter := len(t)
	if len(s) < shorter {
		shorter = len(s)
	}
	for i := 0; i < shorter; i++ {
		if s[i] == t[i] {
			continue
		}
		if s[i] > t[i] {
			return +1
		}
		return -1
	}
	return len(s) - len(t)
}

View File

@ -0,0 +1,52 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package atom
import (
"testing"
)
// TestHits checks that looking up every string in table yields the atom
// whose code is that string's index.
func TestHits(t *testing.T) {
	for code := range table {
		name := table[code]
		if got := Lookup([]byte(name)); got != Atom(code) {
			t.Errorf("Lookup(%q): got %d, want %d", name, got, code)
		}
	}
}
// TestMisses checks that Lookup returns zero for strings that are not atoms:
// the empty string, wrong-case spellings, near-misses with extra or missing
// bytes, and non-ASCII input.
func TestMisses(t *testing.T) {
	for _, input := range []string{
		"",
		"\x00",
		"\xff",
		"A",
		"DIV",
		"Div",
		"dIV",
		"aa",
		"a\x00",
		"ab",
		"abb",
		"abbr0",
		"abbr ",
		" abbr",
		" a",
		"acceptcharset",
		"acceptCharset",
		"accept_charset",
		"h0",
		"h1h2",
		"h7",
		"onClick",
		"λ",
	} {
		if got := Lookup([]byte(input)); got != 0 {
			t.Errorf("Lookup(%q): got %d, want 0", input, got)
		}
	}
}

View File

@ -0,0 +1,405 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates table.go
// Invoke as
//
// go run gen.go |gofmt >table.go
import (
"fmt"
"sort"
)
// identifier turns an atom's string into an exported Go identifier: the
// first letter and every letter that follows a '-' are upper-cased, and the
// '-' characters themselves are dropped. For example, "div" becomes "Div"
// and "accept-charset" becomes "AcceptCharset".
func identifier(s string) string {
	out := make([]byte, 0, len(s))
	upperNext := true
	for _, r := range s {
		switch {
		case r == '-':
			// A hyphen is not emitted; it only marks the next letter.
			upperNext = true
			continue
		case upperNext && r >= 'a' && r <= 'z':
			r -= 'a' - 'A'
		}
		upperNext = false
		out = append(out, byte(r))
	}
	return string(out)
}
// main writes the Go source for the atom tables to standard output: the
// exported atom constants, the max constant, the table slice and the
// oneByteAtoms array.
func main() {
	// Collect the union of all name lists. The empty string is included so
	// that, after sorting, it lands at index 0 and no real atom gets code 0.
	seen := map[string]bool{"": true}
	for _, names := range [][]string{elements, attributes, eventHandlers, extra} {
		for _, name := range names {
			seen[name] = true
		}
	}
	atoms := make([]string, 0, len(seen))
	for name := range seen {
		atoms = append(atoms, name)
	}
	sort.Strings(atoms)

	// An atom's code is its position in the sorted list.
	codeOf := make(map[string]int)
	idents := make([]string, len(atoms))
	for code, name := range atoms {
		codeOf[name] = code
		idents[code] = identifier(name)
	}

	fmt.Printf("package atom\n\nconst (\n")
	// Index 0 is the empty string and gets no constant.
	for code := 1; code < len(atoms); code++ {
		fmt.Printf("\t%s Atom = %d\n", idents[code], code)
	}
	fmt.Printf(")\n\n")

	fmt.Printf("const max Atom = %d\n\n", len(atoms)-1)

	fmt.Printf("var table = []string{\n")
	for _, name := range atoms {
		fmt.Printf("\t%q,\n", name)
	}
	fmt.Printf("}\n\n")

	fmt.Printf("var oneByteAtoms = [26]Atom{\n")
	for letter := 'a'; letter <= 'z'; letter++ {
		entry := "0"
		if code := codeOf[string(letter)]; code != 0 {
			entry = idents[code]
		}
		fmt.Printf("\t%s,\n", entry)
	}
	fmt.Printf("}\n\n")
}
// The lists of element names and attribute keys were taken from
// http://www.whatwg.org/specs/web-apps/current-work/multipage/section-index.html
// as of the "HTML Living Standard - Last Updated 30 May 2012" version.

// elements holds the HTML element (tag) names that become atoms. main merges
// it with the other lists, removes duplicates and sorts the result.
var elements = []string{
"a",
"abbr",
"address",
"area",
"article",
"aside",
"audio",
"b",
"base",
"bdi",
"bdo",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"cite",
"code",
"col",
"colgroup",
"command",
"data",
"datalist",
"dd",
"del",
"details",
"dfn",
"dialog",
"div",
"dl",
"dt",
"em",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hgroup",
"hr",
"html",
"i",
"iframe",
"img",
"input",
"ins",
"kbd",
"keygen",
"label",
"legend",
"li",
"link",
"map",
"mark",
"menu",
"meta",
"meter",
"nav",
"noscript",
"object",
"ol",
"optgroup",
"option",
"output",
"p",
"param",
"pre",
"progress",
"q",
"rp",
"rt",
"ruby",
"s",
"samp",
"script",
"section",
"select",
"small",
"source",
"span",
"strong",
"style",
"sub",
"summary",
"sup",
"table",
"tbody",
"td",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"tr",
"track",
"u",
"ul",
"var",
"video",
"wbr",
}
// attributes holds the HTML attribute keys that become atoms. Some entries
// (for example "cite", "form" and "style") also appear in elements; main
// removes duplicates when it merges the lists.
//
// The list is kept in alphabetical order so additions are easy to place and
// review. main sorts the merged set itself, so the order here does not
// affect the generated output.
var attributes = []string{
	"accept",
	"accept-charset",
	"accesskey",
	"action",
	"alt",
	"async",
	"autocomplete",
	"autofocus",
	"autoplay",
	"border",
	"challenge",
	"charset",
	"checked",
	"cite",
	"class",
	"cols",
	"colspan",
	"command",
	"content",
	"contenteditable",
	"contextmenu",
	"controls",
	"coords",
	"crossorigin",
	"data",
	"datetime",
	"default",
	"defer",
	"dir",
	"dirname",
	"disabled",
	"download",
	"draggable",
	"dropzone",
	"enctype",
	"for",
	"form",
	"formaction",
	"formenctype",
	"formmethod",
	"formnovalidate",
	"formtarget",
	"headers",
	"height",
	"hidden",
	"high",
	"href",
	"hreflang",
	"http-equiv",
	"icon",
	"id",
	"inert",
	"ismap",
	"itemid",
	"itemprop",
	"itemref",
	"itemscope",
	"itemtype",
	"keytype",
	"kind",
	"label",
	"lang",
	"list",
	"loop",
	"low",
	"manifest",
	"max",
	"maxlength",
	"media",
	"mediagroup",
	"method",
	"min",
	"multiple",
	"muted",
	"name",
	"novalidate",
	"open",
	"optimum",
	"pattern",
	"ping",
	"placeholder",
	"poster",
	"preload",
	"radiogroup",
	"readonly",
	"rel",
	"required",
	"reversed",
	"rows",
	"rowspan",
	"sandbox",
	"scope",
	"scoped",
	"seamless",
	"selected",
	"shape",
	"size",
	"sizes",
	"span",
	"spellcheck",
	"src",
	"srcdoc",
	"srclang",
	"start",
	"step",
	"style",
	"tabindex",
	"target",
	"title",
	"translate",
	"type",
	"typemustmatch",
	"usemap",
	"value",
	"width",
	"wrap",
}
// eventHandlers holds the "on..." event handler attribute keys that become
// atoms. main merges it with the other lists, removes duplicates and sorts
// the result.
var eventHandlers = []string{
"onabort",
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onblur",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmessage",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpause",
"onplay",
"onplaying",
"onpopstate",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onstalled",
"onstorage",
"onsubmit",
"onsuspend",
"ontimeupdate",
"onunload",
"onvolumechange",
"onwaiting",
}
// extra holds ad-hoc names that become atoms but are not covered by the
// elements, attributes or eventHandlers lists. main merges it with those
// lists, removes duplicates and sorts the result.
var extra = []string{
"align",
"annotation",
"applet",
"center",
"color",
"font",
"frame",
"frameset",
"nobr",
}

View File

@ -0,0 +1,629 @@
package atom
// Atom codes, one exported constant per known string. Each constant's value
// is the index of its string in table (for example A == 1 and table[1] is
// "a"). The specific numbers are not stable between versions of the package.
// NOTE(review): the layout matches what gen.go prints, so this block is
// presumably generated output; regenerate rather than editing by hand.
const (
A Atom = 1
Abbr Atom = 2
Accept Atom = 3
AcceptCharset Atom = 4
Accesskey Atom = 5
Action Atom = 6
Address Atom = 7
Align Atom = 8
Alt Atom = 9
Annotation Atom = 10
Applet Atom = 11
Area Atom = 12
Article Atom = 13
Aside Atom = 14
Async Atom = 15
Audio Atom = 16
Autocomplete Atom = 17
Autofocus Atom = 18
Autoplay Atom = 19
B Atom = 20
Base Atom = 21
Bdi Atom = 22
Bdo Atom = 23
Blockquote Atom = 24
Body Atom = 25
Border Atom = 26
Br Atom = 27
Button Atom = 28
Canvas Atom = 29
Caption Atom = 30
Center Atom = 31
Challenge Atom = 32
Charset Atom = 33
Checked Atom = 34
Cite Atom = 35
Class Atom = 36
Code Atom = 37
Col Atom = 38
Colgroup Atom = 39
Color Atom = 40
Cols Atom = 41
Colspan Atom = 42
Command Atom = 43
Content Atom = 44
Contenteditable Atom = 45
Contextmenu Atom = 46
Controls Atom = 47
Coords Atom = 48
Crossorigin Atom = 49
Data Atom = 50
Datalist Atom = 51
Datetime Atom = 52
Dd Atom = 53
Default Atom = 54
Defer Atom = 55
Del Atom = 56
Details Atom = 57
Dfn Atom = 58
Dialog Atom = 59
Dir Atom = 60
Dirname Atom = 61
Disabled Atom = 62
Div Atom = 63
Dl Atom = 64
Download Atom = 65
Draggable Atom = 66
Dropzone Atom = 67
Dt Atom = 68
Em Atom = 69
Embed Atom = 70
Enctype Atom = 71
Fieldset Atom = 72
Figcaption Atom = 73
Figure Atom = 74
Font Atom = 75
Footer Atom = 76
For Atom = 77
Form Atom = 78
Formaction Atom = 79
Formenctype Atom = 80
Formmethod Atom = 81
Formnovalidate Atom = 82
Formtarget Atom = 83
Frame Atom = 84
Frameset Atom = 85
H1 Atom = 86
H2 Atom = 87
H3 Atom = 88
H4 Atom = 89
H5 Atom = 90
H6 Atom = 91
Head Atom = 92
Header Atom = 93
Headers Atom = 94
Height Atom = 95
Hgroup Atom = 96
Hidden Atom = 97
High Atom = 98
Hr Atom = 99
Href Atom = 100
Hreflang Atom = 101
Html Atom = 102
HttpEquiv Atom = 103
I Atom = 104
Icon Atom = 105
Id Atom = 106
Iframe Atom = 107
Img Atom = 108
Inert Atom = 109
Input Atom = 110
Ins Atom = 111
Ismap Atom = 112
Itemid Atom = 113
Itemprop Atom = 114
Itemref Atom = 115
Itemscope Atom = 116
Itemtype Atom = 117
Kbd Atom = 118
Keygen Atom = 119
Keytype Atom = 120
Kind Atom = 121
Label Atom = 122
Lang Atom = 123
Legend Atom = 124
Li Atom = 125
Link Atom = 126
List Atom = 127
Loop Atom = 128
Low Atom = 129
Manifest Atom = 130
Map Atom = 131
Mark Atom = 132
Max Atom = 133
Maxlength Atom = 134
Media Atom = 135
Mediagroup Atom = 136
Menu Atom = 137
Meta Atom = 138
Meter Atom = 139
Method Atom = 140
Min Atom = 141
Multiple Atom = 142
Muted Atom = 143
Name Atom = 144
Nav Atom = 145
Nobr Atom = 146
Noscript Atom = 147
Novalidate Atom = 148
Object Atom = 149
Ol Atom = 150
Onabort Atom = 151
Onafterprint Atom = 152
Onbeforeprint Atom = 153
Onbeforeunload Atom = 154
Onblur Atom = 155
Oncancel Atom = 156
Oncanplay Atom = 157
Oncanplaythrough Atom = 158
Onchange Atom = 159
Onclick Atom = 160
Onclose Atom = 161
Oncontextmenu Atom = 162
Oncuechange Atom = 163
Ondblclick Atom = 164
Ondrag Atom = 165
Ondragend Atom = 166
Ondragenter Atom = 167
Ondragleave Atom = 168
Ondragover Atom = 169
Ondragstart Atom = 170
Ondrop Atom = 171
Ondurationchange Atom = 172
Onemptied Atom = 173
Onended Atom = 174
Onerror Atom = 175
Onfocus Atom = 176
Onhashchange Atom = 177
Oninput Atom = 178
Oninvalid Atom = 179
Onkeydown Atom = 180
Onkeypress Atom = 181
Onkeyup Atom = 182
Onload Atom = 183
Onloadeddata Atom = 184
Onloadedmetadata Atom = 185
Onloadstart Atom = 186
Onmessage Atom = 187
Onmousedown Atom = 188
Onmousemove Atom = 189
Onmouseout Atom = 190
Onmouseover Atom = 191
Onmouseup Atom = 192
Onmousewheel Atom = 193
Onoffline Atom = 194
Ononline Atom = 195
Onpagehide Atom = 196
Onpageshow Atom = 197
Onpause Atom = 198
Onplay Atom = 199
Onplaying Atom = 200
Onpopstate Atom = 201
Onprogress Atom = 202
Onratechange Atom = 203
Onreset Atom = 204
Onresize Atom = 205
Onscroll Atom = 206
Onseeked Atom = 207
Onseeking Atom = 208
Onselect Atom = 209
Onshow Atom = 210
Onstalled Atom = 211
Onstorage Atom = 212
Onsubmit Atom = 213
Onsuspend Atom = 214
Ontimeupdate Atom = 215
Onunload Atom = 216
Onvolumechange Atom = 217
Onwaiting Atom = 218
Open Atom = 219
Optgroup Atom = 220
Optimum Atom = 221
Option Atom = 222
Output Atom = 223
P Atom = 224
Param Atom = 225
Pattern Atom = 226
Ping Atom = 227
Placeholder Atom = 228
Poster Atom = 229
Pre Atom = 230
Preload Atom = 231
Progress Atom = 232
Q Atom = 233
Radiogroup Atom = 234
Readonly Atom = 235
Rel Atom = 236
Required Atom = 237
Reversed Atom = 238
Rows Atom = 239
Rowspan Atom = 240
Rp Atom = 241
Rt Atom = 242
Ruby Atom = 243
S Atom = 244
Samp Atom = 245
Sandbox Atom = 246
Scope Atom = 247
Scoped Atom = 248
Script Atom = 249
Seamless Atom = 250
Section Atom = 251
Select Atom = 252
Selected Atom = 253
Shape Atom = 254
Size Atom = 255
Sizes Atom = 256
Small Atom = 257
Source Atom = 258
Span Atom = 259
Spellcheck Atom = 260
Src Atom = 261
Srcdoc Atom = 262
Srclang Atom = 263
Start Atom = 264
Step Atom = 265
Strong Atom = 266
Style Atom = 267
Sub Atom = 268
Summary Atom = 269
Sup Atom = 270
Tabindex Atom = 271
Table Atom = 272
Target Atom = 273
Tbody Atom = 274
Td Atom = 275
Textarea Atom = 276
Tfoot Atom = 277
Th Atom = 278
Thead Atom = 279
Time Atom = 280
Title Atom = 281
Tr Atom = 282
Track Atom = 283
Translate Atom = 284
Type Atom = 285
Typemustmatch Atom = 286
U Atom = 287
Ul Atom = 288
Usemap Atom = 289
Value Atom = 290
Var Atom = 291
Video Atom = 292
Wbr Atom = 293
Width Atom = 294
Wrap Atom = 295
)
// max is the largest valid Atom code, which is also the last index of table.
// Atom.String and Lookup use it as their upper bound.
const max Atom = 295
// table maps an Atom code to its string; table[0] is "" for the zero Atom.
// Lookup binary-searches indices 1..max, so the entries must stay in
// ascending byte order and in step with the Atom constants.
var table = []string{
"",
"a",
"abbr",
"accept",
"accept-charset",
"accesskey",
"action",
"address",
"align",
"alt",
"annotation",
"applet",
"area",
"article",
"aside",
"async",
"audio",
"autocomplete",
"autofocus",
"autoplay",
"b",
"base",
"bdi",
"bdo",
"blockquote",
"body",
"border",
"br",
"button",
"canvas",
"caption",
"center",
"challenge",
"charset",
"checked",
"cite",
"class",
"code",
"col",
"colgroup",
"color",
"cols",
"colspan",
"command",
"content",
"contenteditable",
"contextmenu",
"controls",
"coords",
"crossorigin",
"data",
"datalist",
"datetime",
"dd",
"default",
"defer",
"del",
"details",
"dfn",
"dialog",
"dir",
"dirname",
"disabled",
"div",
"dl",
"download",
"draggable",
"dropzone",
"dt",
"em",
"embed",
"enctype",
"fieldset",
"figcaption",
"figure",
"font",
"footer",
"for",
"form",
"formaction",
"formenctype",
"formmethod",
"formnovalidate",
"formtarget",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"headers",
"height",
"hgroup",
"hidden",
"high",
"hr",
"href",
"hreflang",
"html",
"http-equiv",
"i",
"icon",
"id",
"iframe",
"img",
"inert",
"input",
"ins",
"ismap",
"itemid",
"itemprop",
"itemref",
"itemscope",
"itemtype",
"kbd",
"keygen",
"keytype",
"kind",
"label",
"lang",
"legend",
"li",
"link",
"list",
"loop",
"low",
"manifest",
"map",
"mark",
"max",
"maxlength",
"media",
"mediagroup",
"menu",
"meta",
"meter",
"method",
"min",
"multiple",
"muted",
"name",
"nav",
"nobr",
"noscript",
"novalidate",
"object",
"ol",
"onabort",
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onblur",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmessage",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpause",
"onplay",
"onplaying",
"onpopstate",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onstalled",
"onstorage",
"onsubmit",
"onsuspend",
"ontimeupdate",
"onunload",
"onvolumechange",
"onwaiting",
"open",
"optgroup",
"optimum",
"option",
"output",
"p",
"param",
"pattern",
"ping",
"placeholder",
"poster",
"pre",
"preload",
"progress",
"q",
"radiogroup",
"readonly",
"rel",
"required",
"reversed",
"rows",
"rowspan",
"rp",
"rt",
"ruby",
"s",
"samp",
"sandbox",
"scope",
"scoped",
"script",
"seamless",
"section",
"select",
"selected",
"shape",
"size",
"sizes",
"small",
"source",
"span",
"spellcheck",
"src",
"srcdoc",
"srclang",
"start",
"step",
"strong",
"style",
"sub",
"summary",
"sup",
"tabindex",
"table",
"target",
"tbody",
"td",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"tr",
"track",
"translate",
"type",
"typemustmatch",
"u",
"ul",
"usemap",
"value",
"var",
"video",
"wbr",
"width",
"wrap",
}
// oneByteAtoms is Lookup's fast path for one-byte names. It is indexed by
// letter-'a' and holds the atom for that single lower-case letter, or 0 when
// the letter on its own is not an atom.
var oneByteAtoms = [26]Atom{
// 'a', 'b'
A,
B,
// 'c' through 'h'
0,
0,
0,
0,
0,
0,
// 'i'
I,
// 'j' through 'o'
0,
0,
0,
0,
0,
0,
// 'p', 'q', 'r', 's', 't', 'u'
P,
Q,
0,
S,
0,
U,
// 'v' through 'z'
0,
0,
0,
0,
0,
}

View File

@ -6,6 +6,7 @@ package html
import (
"bytes"
"exp/html/atom"
"io"
"strconv"
"strings"
@ -791,13 +792,13 @@ func (z *Tokenizer) Token() Token {
for moreAttr {
var key, val []byte
key, val, moreAttr = z.TagAttr()
attr = append(attr, Attribute{"", string(key), string(val)})
attr = append(attr, Attribute{"", atom.String(key), string(val)})
}
t.Data = string(name)
t.Data = atom.String(name)
t.Attr = attr
case EndTagToken:
name, _ := z.TagName()
t.Data = string(name)
t.Data = atom.String(name)
}
return t
}