1
0
mirror of https://github.com/golang/go synced 2024-11-26 08:58:09 -07:00

strings: implement a faster single-string Replacer

The string searching is implemented separately so other functions
may make use of it in the future.

benchmark                        old ns/op    new ns/op    delta
BenchmarkSingleMaxSkipping          125889         2474  -98.03%
BenchmarkSingleLongSuffixFail        16252         1996  -87.72%
BenchmarkSingleMatch                260793       136266  -47.75%

benchmark                         old MB/s     new MB/s  speedup
BenchmarkSingleMaxSkipping           79.43      4041.57   50.88x
BenchmarkSingleLongSuffixFail        61.65       501.81    8.14x
BenchmarkSingleMatch                 57.52       110.08    1.91x

R=nigeltao
CC=golang-dev
https://golang.org/cl/6545049
This commit is contained in:
Eric Roshan-Eisner 2012-09-28 12:34:18 +10:00 committed by Nigel Tao
parent 4bf6249ba5
commit 631a0e71c1
5 changed files with 330 additions and 8 deletions

View File

@ -34,3 +34,12 @@ func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
}
return
}
func StringFind(pattern, text string) int {
return makeStringFinder(pattern).next(text)
}
func DumpTables(pattern string) ([]int, []int) {
finder := makeStringFinder(pattern)
return finder.badCharSkip[:], finder.goodSuffixSkip
}

View File

@ -33,6 +33,10 @@ func NewReplacer(oldnew ...string) *Replacer {
panic("strings.NewReplacer: odd argument count")
}
if len(oldnew) == 2 && len(oldnew[0]) > 1 {
return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])}
}
allNewBytes := true
for i := 0; i < len(oldnew); i += 2 {
if len(oldnew[i]) != 1 {
@ -288,6 +292,10 @@ func (w *appendSliceWriter) WriteString(s string) (int, error) {
return len(s), nil
}
type stringWriterIface interface {
WriteString(string) (int, error)
}
type stringWriter struct {
w io.Writer
}
@ -296,6 +304,14 @@ func (w stringWriter) WriteString(s string) (int, error) {
return w.w.Write([]byte(s))
}
func getStringWriter(w io.Writer) stringWriterIface {
sw, ok := w.(stringWriterIface)
if !ok {
sw = stringWriter{w}
}
return sw
}
func (r *genericReplacer) Replace(s string) string {
buf := make(appendSliceWriter, 0, len(s))
r.WriteString(&buf, s)
@ -303,13 +319,7 @@ func (r *genericReplacer) Replace(s string) string {
}
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
sw, ok := w.(interface {
WriteString(string) (int, error)
})
if !ok {
sw = stringWriter{w}
}
sw := getStringWriter(w)
var last, wn int
var prevMatchEmpty bool
for i := 0; i <= len(s); {
@ -340,6 +350,62 @@ func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error)
return
}
// singleStringReplacer is the implementation that's used when there is only
// one string to replace (and that string has more than one byte).
type singleStringReplacer struct {
finder *stringFinder
// value is the new string that replaces that pattern when it's found.
value string
}
func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
}
func (r *singleStringReplacer) Replace(s string) string {
var buf []byte
i := 0
for {
match := r.finder.next(s[i:])
if match == -1 {
break
}
buf = append(buf, s[i:i+match]...)
buf = append(buf, r.value...)
i += match + len(r.finder.pattern)
}
if buf == nil {
return s
}
buf = append(buf, s[i:]...)
return string(buf)
}
func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
sw := getStringWriter(w)
var i, wn int
for {
match := r.finder.next(s[i:])
if match == -1 {
break
}
wn, err = sw.WriteString(s[i : i+match])
n += wn
if err != nil {
return
}
wn, err = sw.WriteString(r.value)
n += wn
if err != nil {
return
}
i += match + len(r.finder.pattern)
}
wn, err = sw.WriteString(s[i:])
n += wn
return
}
// byteReplacer is the implementation that's used when all the "old"
// and "new" values are single ASCII bytes.
type byteReplacer struct {

View File

@ -254,6 +254,17 @@ func TestReplacer(t *testing.T) {
testCase{blankFoo, "", "X"},
)
// single string replacer
abcMatcher := NewReplacer("abc", "[match]")
testCases = append(testCases,
testCase{abcMatcher, "", ""},
testCase{abcMatcher, "ab", "ab"},
testCase{abcMatcher, "abcd", "[match]d"},
testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"},
)
// No-arg test cases.
nop := NewReplacer()
@ -294,8 +305,9 @@ func TestPickAlgorithm(t *testing.T) {
}{
{capitalLetters, "*strings.byteReplacer"},
{htmlEscaper, "*strings.byteStringReplacer"},
{NewReplacer("12", "123"), "*strings.genericReplacer"},
{NewReplacer("12", "123"), "*strings.singleStringReplacer"},
{NewReplacer("1", "12"), "*strings.byteStringReplacer"},
{NewReplacer("", "X"), "*strings.genericReplacer"},
{NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"},
}
for i, tc := range testCases {
@ -401,6 +413,27 @@ func BenchmarkGenericMatch2(b *testing.B) {
}
}
func benchmarkSingleString(b *testing.B, pattern, text string) {
r := NewReplacer(pattern, "[match]")
b.SetBytes(int64(len(text)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
r.Replace(text)
}
}
func BenchmarkSingleMaxSkipping(b *testing.B) {
benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000))
}
func BenchmarkSingleLongSuffixFail(b *testing.B) {
benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002))
}
func BenchmarkSingleMatch(b *testing.B) {
benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000))
}
func BenchmarkByteByteNoMatch(b *testing.B) {
str := Repeat("A", 100) + Repeat("B", 100)
for i := 0; i < b.N; i++ {

124
src/pkg/strings/search.go Normal file
View File

@ -0,0 +1,124 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
// stringFinder efficiently finds strings in a source text. It's implemented
// using the Boyer-Moore string search algorithm:
// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
// document uses 1-based indexing)
type stringFinder struct {
// pattern is the string that we are searching for in the text.
pattern string
// badCharSkip[b] contains the distance between the last byte of pattern
// and the rightmost occurrence of b in pattern. If b is not in pattern,
// badCharSkip[b] is len(pattern).
//
// Whenever a mismatch is found with byte b in the text, we can safely
// shift the matching frame at least badCharSkip[b] until the next time
// the matching char could be in alignment.
badCharSkip [256]int
// goodSuffixSkip[i] defines how far we can shift the matching frame given
// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
// not. There are two cases to consider:
//
// 1. The matched suffix occurs elsewhere in pattern (with a different
// byte preceding it that we might possibly match). In this case, we can
// shift the matching frame to align with the next suffix chunk. For
// example, the pattern "mississi" has the suffix "issi" next occurring
// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
// shift+len(suffix) == 3+4 == 7.
//
// 2. If the matched suffix does not occur elsewhere in pattern, then the
// matching frame may share part of its prefix with the end of the
// matching suffix. In this case, goodSuffixSkip[i] will contain how far
// to shift the frame to align this portion of the prefix to the
// suffix. For example, in the pattern "abcxxxabc", when the first
// mismatch from the back is found to be in position 3, the matching
// suffix "xxabc" is not found elsewhere in the pattern. However, its
// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
goodSuffixSkip []int
}
func makeStringFinder(pattern string) *stringFinder {
f := &stringFinder{
pattern: pattern,
goodSuffixSkip: make([]int, len(pattern)),
}
// last is the index of the last character in the pattern.
last := len(pattern) - 1
// Build bad character table.
// Bytes not in the pattern can skip one pattern's length.
for i := range f.badCharSkip {
f.badCharSkip[i] = len(pattern)
}
// The loop condition is < instead of <= so that the last byte does not
// have a zero distance to itself. Finding this byte out of place implies
// that it is not in the last position.
for i := 0; i < last; i++ {
f.badCharSkip[pattern[i]] = last - i
}
// Build good suffix table.
// First pass: set each value to the next index which starts a prefix of
// pattern.
lastPrefix := last
for i := last; i >= 0; i-- {
if HasPrefix(pattern, pattern[i+1:]) {
lastPrefix = i + 1
}
// lastPrefix is the shift, and (last-i) is len(suffix).
f.goodSuffixSkip[i] = lastPrefix + last - i
}
// Second pass: find repeats of pattern's suffix starting from the front.
for i := 0; i < last; i++ {
lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
// (last-i) is the shift, and lenSuffix is len(suffix).
f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
}
}
return f
}
func longestCommonSuffix(a, b string) (i int) {
for ; i < len(a) && i < len(b); i++ {
if a[len(a)-1-i] != b[len(b)-1-i] {
break
}
}
return
}
// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
func (f *stringFinder) next(text string) int {
i := len(f.pattern) - 1
for i < len(text) {
// Compare backwards from the end until the first unmatching character.
j := len(f.pattern) - 1
for j >= 0 && text[i] == f.pattern[j] {
i--
j--
}
if j < 0 {
return i + 1 // match
}
i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
}
return -1
}
func max(a, b int) int {
if a > b {
return a
}
return b
}

View File

@ -0,0 +1,90 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings_test
import (
"reflect"
. "strings"
"testing"
)
func TestFinderNext(t *testing.T) {
testCases := []struct {
pat, text string
index int
}{
{"", "", 0},
{"", "abc", 0},
{"abc", "", -1},
{"abc", "abc", 0},
{"d", "abcdefg", 3},
{"nan", "banana", 2},
{"pan", "anpanman", 2},
{"nnaaman", "anpanmanam", -1},
{"abcd", "abc", -1},
{"abcd", "bcd", -1},
{"bcd", "abcd", 1},
{"abc", "acca", -1},
{"aa", "aaa", 0},
{"baa", "aaaaa", -1},
{"at that", "which finally halts. at that point", 22},
}
for _, tc := range testCases {
got := StringFind(tc.pat, tc.text)
want := tc.index
if got != want {
t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want)
}
}
}
func TestFinderCreation(t *testing.T) {
testCases := []struct {
pattern string
bad [256]int
suf []int
}{
{
"abc",
[256]int{'a': 2, 'b': 1, 'c': 3},
[]int{5, 4, 1},
},
{
"mississi",
[256]int{'i': 3, 'm': 7, 's': 1},
[]int{15, 14, 13, 7, 11, 10, 7, 1},
},
// From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
{
"abcxxxabc",
[256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3},
[]int{14, 13, 12, 11, 10, 9, 11, 10, 1},
},
{
"abyxcdeyx",
[256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5},
[]int{17, 16, 15, 14, 13, 12, 7, 10, 1},
},
}
for _, tc := range testCases {
bad, good := DumpTables(tc.pattern)
for i, got := range bad {
want := tc.bad[i]
if want == 0 {
want = len(tc.pattern)
}
if got != want {
t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want)
}
}
if !reflect.DeepEqual(good, tc.suf) {
t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf)
}
}
}