mirror of
https://github.com/golang/go
synced 2024-11-21 20:44:39 -07:00
strings: implement a faster generic Replacer
This also fixes the semantics of some corner cases with the empty match. TODOs for genericReplacer in the tests are fixed. benchmark old ns/op new ns/op delta BenchmarkGenericNoMatch 71395 3132 -95.61% BenchmarkGenericMatch1 75610 20280 -73.18% BenchmarkGenericMatch2 837995 86725 -89.65% R=nigeltao, rsc CC=golang-dev https://golang.org/cl/6492076
This commit is contained in:
parent
a1a414e6f1
commit
0aad3cdc59
@ -7,3 +7,30 @@ package strings
|
||||
func (r *Replacer) Replacer() interface{} {
|
||||
return r.r
|
||||
}
|
||||
|
||||
func (r *Replacer) PrintTrie() string {
|
||||
gen := r.r.(*genericReplacer)
|
||||
return gen.printNode(&gen.root, 0)
|
||||
}
|
||||
|
||||
func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
|
||||
if t.priority > 0 {
|
||||
s += "+"
|
||||
} else {
|
||||
s += "-"
|
||||
}
|
||||
s += "\n"
|
||||
|
||||
if t.prefix != "" {
|
||||
s += Repeat(".", depth) + t.prefix
|
||||
s += r.printNode(t.next, depth+len(t.prefix))
|
||||
} else if t.table != nil {
|
||||
for b, m := range r.mapping {
|
||||
if int(m) != r.tableSize && t.table[m] != nil {
|
||||
s += Repeat(".", depth) + string([]byte{byte(b)})
|
||||
s += r.printNode(t.table[m], depth+1)
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -80,89 +80,264 @@ func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
|
||||
return r.r.WriteString(w, s)
|
||||
}
|
||||
|
||||
// genericReplacer is the fully generic (and least optimized) algorithm.
|
||||
// It's used as a fallback when nothing faster can be used.
|
||||
type genericReplacer struct {
|
||||
p []pair
|
||||
// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
|
||||
// and values may be empty. For example, the trie containing keys "ax", "ay",
|
||||
// "bcbc", "x" and "xy" could have eight nodes:
|
||||
//
|
||||
// n0 -
|
||||
// n1 a-
|
||||
// n2 .x+
|
||||
// n3 .y+
|
||||
// n4 b-
|
||||
// n5 .cbc+
|
||||
// n6 x+
|
||||
// n7 .y+
|
||||
//
|
||||
// n0 is the root node, and its children are n1, n4 and n6; n1's children are
|
||||
// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
|
||||
// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
|
||||
// (marked with a trailing "+") are complete keys.
|
||||
type trieNode struct {
|
||||
// value is the value of the trie node's key/value pair. It is empty if
|
||||
// this node is not a complete key.
|
||||
value string
|
||||
// priority is the priority (higher is more important) of the trie node's
|
||||
// key/value pair; keys are not necessarily matched shortest- or longest-
|
||||
// first. Priority is positive if this node is a complete key, and zero
|
||||
// otherwise. In the example above, positive/zero priorities are marked
|
||||
// with a trailing "+" or "-".
|
||||
priority int
|
||||
|
||||
// A trie node may have zero, one or more child nodes:
|
||||
// * if the remaining fields are zero, there are no children.
|
||||
// * if prefix and next are non-zero, there is one child in next.
|
||||
// * if table is non-zero, it defines all the children.
|
||||
//
|
||||
// Prefixes are preferred over tables when there is one child, but the
|
||||
// root node always uses a table for lookup efficiency.
|
||||
|
||||
// prefix is the difference in keys between this trie node and the next.
|
||||
// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
|
||||
// Node n5 has no children and so has zero prefix, next and table fields.
|
||||
prefix string
|
||||
next *trieNode
|
||||
|
||||
// table is a lookup table indexed by the next byte in the key, after
|
||||
// remapping that byte through genericReplacer.mapping to create a dense
|
||||
// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
|
||||
// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
|
||||
// genericReplacer.tableSize will be 5. Node n0's table will be
|
||||
// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
|
||||
// 'a', 'b' and 'x'.
|
||||
table []*trieNode
|
||||
}
|
||||
|
||||
type pair struct{ old, new string }
|
||||
|
||||
func makeGenericReplacer(oldnew []string) *genericReplacer {
|
||||
gen := &genericReplacer{
|
||||
p: make([]pair, len(oldnew)/2),
|
||||
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
|
||||
if key == "" {
|
||||
if t.priority == 0 {
|
||||
t.value = val
|
||||
t.priority = priority
|
||||
}
|
||||
return
|
||||
}
|
||||
for i := 0; i < len(oldnew); i += 2 {
|
||||
gen.p[i/2] = pair{oldnew[i], oldnew[i+1]}
|
||||
}
|
||||
return gen
|
||||
}
|
||||
|
||||
type appendSliceWriter struct {
|
||||
b []byte
|
||||
}
|
||||
|
||||
func (w *appendSliceWriter) Write(p []byte) (int, error) {
|
||||
w.b = append(w.b, p...)
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
func (r *genericReplacer) Replace(s string) string {
|
||||
// TODO(bradfitz): optimized version
|
||||
n, _ := r.WriteString(discard, s)
|
||||
w := appendSliceWriter{make([]byte, 0, n)}
|
||||
r.WriteString(&w, s)
|
||||
return string(w.b)
|
||||
}
|
||||
|
||||
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
|
||||
lastEmpty := false // the last replacement was of the empty string
|
||||
Input:
|
||||
// TODO(bradfitz): optimized version
|
||||
for i := 0; i < len(s); {
|
||||
for _, p := range r.p {
|
||||
if p.old == "" && lastEmpty {
|
||||
// Don't let old match twice in a row.
|
||||
// (it doesn't advance the input and
|
||||
// would otherwise loop forever)
|
||||
continue
|
||||
}
|
||||
if HasPrefix(s[i:], p.old) {
|
||||
if p.new != "" {
|
||||
wn, err := w.Write([]byte(p.new))
|
||||
n += wn
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
i += len(p.old)
|
||||
lastEmpty = p.old == ""
|
||||
continue Input
|
||||
if t.prefix != "" {
|
||||
// Need to split the prefix among multiple nodes.
|
||||
var n int // length of the longest common prefix
|
||||
for ; n < len(t.prefix) && n < len(key); n++ {
|
||||
if t.prefix[n] != key[n] {
|
||||
break
|
||||
}
|
||||
}
|
||||
wn, err := w.Write([]byte{s[i]})
|
||||
n += wn
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
i++
|
||||
}
|
||||
|
||||
// Final empty match at end.
|
||||
for _, p := range r.p {
|
||||
if p.old == "" {
|
||||
if p.new != "" {
|
||||
wn, err := w.Write([]byte(p.new))
|
||||
n += wn
|
||||
if err != nil {
|
||||
return n, err
|
||||
if n == len(t.prefix) {
|
||||
t.next.add(key[n:], val, priority, r)
|
||||
} else if n == 0 {
|
||||
// First byte differs, start a new lookup table here. Looking up
|
||||
// what is currently t.prefix[0] will lead to prefixNode, and
|
||||
// looking up key[0] will lead to keyNode.
|
||||
var prefixNode *trieNode
|
||||
if len(t.prefix) == 1 {
|
||||
prefixNode = t.next
|
||||
} else {
|
||||
prefixNode = &trieNode{
|
||||
prefix: t.prefix[1:],
|
||||
next: t.next,
|
||||
}
|
||||
}
|
||||
keyNode := new(trieNode)
|
||||
t.table = make([]*trieNode, r.tableSize)
|
||||
t.table[r.mapping[t.prefix[0]]] = prefixNode
|
||||
t.table[r.mapping[key[0]]] = keyNode
|
||||
t.prefix = ""
|
||||
t.next = nil
|
||||
keyNode.add(key[1:], val, priority, r)
|
||||
} else {
|
||||
// Insert new node after the common section of the prefix.
|
||||
next := &trieNode{
|
||||
prefix: t.prefix[n:],
|
||||
next: t.next,
|
||||
}
|
||||
t.prefix = t.prefix[:n]
|
||||
t.next = next
|
||||
next.add(key[n:], val, priority, r)
|
||||
}
|
||||
} else if t.table != nil {
|
||||
// Insert into existing table.
|
||||
m := r.mapping[key[0]]
|
||||
if t.table[m] == nil {
|
||||
t.table[m] = new(trieNode)
|
||||
}
|
||||
t.table[m].add(key[1:], val, priority, r)
|
||||
} else {
|
||||
t.prefix = key
|
||||
t.next = new(trieNode)
|
||||
t.next.add("", val, priority, r)
|
||||
}
|
||||
}
|
||||
|
||||
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
|
||||
// Iterate down the trie to the end, and grab the value and keylen with
|
||||
// the highest priority.
|
||||
bestPriority := 0
|
||||
node := &r.root
|
||||
n := 0
|
||||
for node != nil {
|
||||
if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
|
||||
bestPriority = node.priority
|
||||
val = node.value
|
||||
keylen = n
|
||||
found = true
|
||||
}
|
||||
|
||||
if s == "" {
|
||||
break
|
||||
}
|
||||
if node.table != nil {
|
||||
index := r.mapping[s[0]]
|
||||
if int(index) == r.tableSize {
|
||||
break
|
||||
}
|
||||
node = node.table[index]
|
||||
s = s[1:]
|
||||
n++
|
||||
} else if node.prefix != "" && HasPrefix(s, node.prefix) {
|
||||
n += len(node.prefix)
|
||||
s = s[len(node.prefix):]
|
||||
node = node.next
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
return n, nil
|
||||
// genericReplacer is the fully generic algorithm.
|
||||
// It's used as a fallback when nothing faster can be used.
|
||||
type genericReplacer struct {
|
||||
root trieNode
|
||||
// tableSize is the size of a trie node's lookup table. It is the number
|
||||
// of unique key bytes.
|
||||
tableSize int
|
||||
// mapping maps from key bytes to a dense index for trieNode.table.
|
||||
mapping [256]byte
|
||||
}
|
||||
|
||||
func makeGenericReplacer(oldnew []string) *genericReplacer {
|
||||
r := new(genericReplacer)
|
||||
// Find each byte used, then assign them each an index.
|
||||
for i := 0; i < len(oldnew); i += 2 {
|
||||
key := oldnew[i]
|
||||
for j := 0; j < len(key); j++ {
|
||||
r.mapping[key[j]] = 1
|
||||
}
|
||||
}
|
||||
|
||||
for _, b := range r.mapping {
|
||||
r.tableSize += int(b)
|
||||
}
|
||||
|
||||
var index byte
|
||||
for i, b := range r.mapping {
|
||||
if b == 0 {
|
||||
r.mapping[i] = byte(r.tableSize)
|
||||
} else {
|
||||
r.mapping[i] = index
|
||||
index++
|
||||
}
|
||||
}
|
||||
// Ensure root node uses a lookup table (for performance).
|
||||
r.root.table = make([]*trieNode, r.tableSize)
|
||||
|
||||
for i := 0; i < len(oldnew); i += 2 {
|
||||
r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
type appendSliceWriter []byte
|
||||
|
||||
// Write writes to the buffer to satisfy io.Writer.
|
||||
func (w *appendSliceWriter) Write(p []byte) (int, error) {
|
||||
*w = append(*w, p...)
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
// WriteString writes to the buffer without string->[]byte->string allocations.
|
||||
func (w *appendSliceWriter) WriteString(s string) (int, error) {
|
||||
*w = append(*w, s...)
|
||||
return len(s), nil
|
||||
}
|
||||
|
||||
type stringWriter struct {
|
||||
w io.Writer
|
||||
}
|
||||
|
||||
func (w stringWriter) WriteString(s string) (int, error) {
|
||||
return w.w.Write([]byte(s))
|
||||
}
|
||||
|
||||
func (r *genericReplacer) Replace(s string) string {
|
||||
buf := make(appendSliceWriter, 0, len(s))
|
||||
r.WriteString(&buf, s)
|
||||
return string(buf)
|
||||
}
|
||||
|
||||
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
|
||||
sw, ok := w.(interface {
|
||||
WriteString(string) (int, error)
|
||||
})
|
||||
if !ok {
|
||||
sw = stringWriter{w}
|
||||
}
|
||||
|
||||
var last, wn int
|
||||
var prevMatchEmpty bool
|
||||
for i := 0; i <= len(s); {
|
||||
// Ignore the empty match iff the previous loop found the empty match.
|
||||
val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
|
||||
prevMatchEmpty = match && keylen == 0
|
||||
if match {
|
||||
wn, err = sw.WriteString(s[last:i])
|
||||
n += wn
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
wn, err = sw.WriteString(val)
|
||||
n += wn
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
i += keylen
|
||||
last = i
|
||||
continue
|
||||
}
|
||||
i++
|
||||
}
|
||||
if last != len(s) {
|
||||
wn, err = sw.WriteString(s[last:])
|
||||
n += wn
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// byteReplacer is the implementation that's used when all the "old"
|
||||
@ -305,12 +480,3 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// strings is too low-level to import io/ioutil
|
||||
var discard io.Writer = devNull(0)
|
||||
|
||||
type devNull int
|
||||
|
||||
func (devNull) Write(p []byte) (int, error) {
|
||||
return len(p), nil
|
||||
}
|
||||
|
@ -219,21 +219,29 @@ func TestReplacer(t *testing.T) {
|
||||
|
||||
blankToX1 := NewReplacer("", "X")
|
||||
blankToX2 := NewReplacer("", "X", "", "")
|
||||
blankToXOToO := NewReplacer("", "X", "o", "O")
|
||||
blankHighPriority := NewReplacer("", "X", "o", "O")
|
||||
blankLowPriority := NewReplacer("o", "O", "", "X")
|
||||
blankNoOp1 := NewReplacer("", "")
|
||||
blankNoOp2 := NewReplacer("", "", "", "A")
|
||||
blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z")
|
||||
testCases = append(testCases,
|
||||
testCase{blankToX1, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
|
||||
testCase{blankToX1, "foo", "XfXoXoX"},
|
||||
testCase{blankToX1, "", "X"},
|
||||
|
||||
testCase{blankToX2, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
|
||||
testCase{blankToX2, "foo", "XfXoXoX"},
|
||||
testCase{blankToX2, "", "X"},
|
||||
|
||||
testCase{blankToXOToO, "oo", "XOXOX"},
|
||||
testCase{blankToXOToO, "ii", "XiiX"}, // TODO: should this be "XiXiX"?
|
||||
testCase{blankToXOToO, "iooi", "XiOXOXiX"}, // TODO: should this be "XiXOXOXiX"?
|
||||
testCase{blankToXOToO, "", "X"},
|
||||
testCase{blankHighPriority, "oo", "XOXOX"},
|
||||
testCase{blankHighPriority, "ii", "XiXiX"},
|
||||
testCase{blankHighPriority, "oiio", "XOXiXiXOX"},
|
||||
testCase{blankHighPriority, "iooi", "XiXOXOXiX"},
|
||||
testCase{blankHighPriority, "", "X"},
|
||||
|
||||
testCase{blankLowPriority, "oo", "OOX"},
|
||||
testCase{blankLowPriority, "ii", "XiXiX"},
|
||||
testCase{blankLowPriority, "oiio", "OXiXiOX"},
|
||||
testCase{blankLowPriority, "iooi", "XiOOXiX"},
|
||||
testCase{blankLowPriority, "", "X"},
|
||||
|
||||
testCase{blankNoOp1, "foo", "foo"},
|
||||
testCase{blankNoOp1, "", ""},
|
||||
@ -242,7 +250,7 @@ func TestReplacer(t *testing.T) {
|
||||
testCase{blankNoOp2, "", ""},
|
||||
|
||||
testCase{blankFoo, "foobarfoobaz", "XRXZX"},
|
||||
testCase{blankFoo, "foobar-foobaz", "XRX-ZX"}, // TODO: should this be "XRX-XZX"?
|
||||
testCase{blankFoo, "foobar-foobaz", "XRX-XZX"},
|
||||
testCase{blankFoo, "", "X"},
|
||||
)
|
||||
|
||||
@ -298,6 +306,78 @@ func TestPickAlgorithm(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestGenericTrieBuilding verifies the structure of the generated trie. There
|
||||
// is one node per line, and the key ending with the current line is in the
|
||||
// trie if it ends with a "+".
|
||||
func TestGenericTrieBuilding(t *testing.T) {
|
||||
testCases := []struct{ in, out string }{
|
||||
{"abc;abdef;abdefgh;xx;xy;z", `-
|
||||
a-
|
||||
.b-
|
||||
..c+
|
||||
..d-
|
||||
...ef+
|
||||
.....gh+
|
||||
x-
|
||||
.x+
|
||||
.y+
|
||||
z+
|
||||
`},
|
||||
{"abracadabra;abracadabrakazam;abraham;abrasion", `-
|
||||
a-
|
||||
.bra-
|
||||
....c-
|
||||
.....adabra+
|
||||
...........kazam+
|
||||
....h-
|
||||
.....am+
|
||||
....s-
|
||||
.....ion+
|
||||
`},
|
||||
{"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `-
|
||||
X+
|
||||
Y+
|
||||
a+
|
||||
.a+
|
||||
..a+
|
||||
i+
|
||||
l-
|
||||
.ong+
|
||||
....er+
|
||||
......st+
|
||||
x+
|
||||
.x+
|
||||
`},
|
||||
{"foo;;foo;foo1", `+
|
||||
f-
|
||||
.oo+
|
||||
...1+
|
||||
`},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
keys := Split(tc.in, ";")
|
||||
args := make([]string, len(keys)*2)
|
||||
for i, key := range keys {
|
||||
args[i*2] = key
|
||||
}
|
||||
|
||||
got := NewReplacer(args...).PrintTrie()
|
||||
// Remove tabs from tc.out
|
||||
wantbuf := make([]byte, 0, len(tc.out))
|
||||
for i := 0; i < len(tc.out); i++ {
|
||||
if tc.out[i] != '\t' {
|
||||
wantbuf = append(wantbuf, tc.out[i])
|
||||
}
|
||||
}
|
||||
want := string(wantbuf)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkGenericNoMatch(b *testing.B) {
|
||||
str := Repeat("A", 100) + Repeat("B", 100)
|
||||
generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic
|
||||
|
Loading…
Reference in New Issue
Block a user