mirror of
https://github.com/golang/go
synced 2024-11-26 08:58:09 -07:00
strings: implement a faster single-string Replacer
The string searching is implemented separately so other functions may make use of it in the future.

benchmark                        old ns/op    new ns/op    delta
BenchmarkSingleMaxSkipping          125889         2474  -98.03%
BenchmarkSingleLongSuffixFail        16252         1996  -87.72%
BenchmarkSingleMatch                260793       136266  -47.75%

benchmark                         old MB/s     new MB/s  speedup
BenchmarkSingleMaxSkipping           79.43      4041.57   50.88x
BenchmarkSingleLongSuffixFail        61.65       501.81    8.14x
BenchmarkSingleMatch                 57.52       110.08    1.91x

R=nigeltao
CC=golang-dev
https://golang.org/cl/6545049
This commit is contained in:
parent
4bf6249ba5
commit
631a0e71c1
@ -34,3 +34,12 @@ func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func StringFind(pattern, text string) int {
|
||||
return makeStringFinder(pattern).next(text)
|
||||
}
|
||||
|
||||
func DumpTables(pattern string) ([]int, []int) {
|
||||
finder := makeStringFinder(pattern)
|
||||
return finder.badCharSkip[:], finder.goodSuffixSkip
|
||||
}
|
||||
|
@ -33,6 +33,10 @@ func NewReplacer(oldnew ...string) *Replacer {
|
||||
panic("strings.NewReplacer: odd argument count")
|
||||
}
|
||||
|
||||
if len(oldnew) == 2 && len(oldnew[0]) > 1 {
|
||||
return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])}
|
||||
}
|
||||
|
||||
allNewBytes := true
|
||||
for i := 0; i < len(oldnew); i += 2 {
|
||||
if len(oldnew[i]) != 1 {
|
||||
@ -288,6 +292,10 @@ func (w *appendSliceWriter) WriteString(s string) (int, error) {
|
||||
return len(s), nil
|
||||
}
|
||||
|
||||
// stringWriterIface is satisfied by any writer that can accept a string
// directly through a WriteString method.
type stringWriterIface interface {
	WriteString(s string) (n int, err error)
}
|
||||
|
||||
// stringWriter wraps an io.Writer so that it can be used where a WriteString
// method is required.
type stringWriter struct {
	// w is the underlying writer that receives the data.
	w io.Writer
}
|
||||
@ -296,6 +304,14 @@ func (w stringWriter) WriteString(s string) (int, error) {
|
||||
return w.w.Write([]byte(s))
|
||||
}
|
||||
|
||||
func getStringWriter(w io.Writer) stringWriterIface {
|
||||
sw, ok := w.(stringWriterIface)
|
||||
if !ok {
|
||||
sw = stringWriter{w}
|
||||
}
|
||||
return sw
|
||||
}
|
||||
|
||||
func (r *genericReplacer) Replace(s string) string {
|
||||
buf := make(appendSliceWriter, 0, len(s))
|
||||
r.WriteString(&buf, s)
|
||||
@ -303,13 +319,7 @@ func (r *genericReplacer) Replace(s string) string {
|
||||
}
|
||||
|
||||
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
|
||||
sw, ok := w.(interface {
|
||||
WriteString(string) (int, error)
|
||||
})
|
||||
if !ok {
|
||||
sw = stringWriter{w}
|
||||
}
|
||||
|
||||
sw := getStringWriter(w)
|
||||
var last, wn int
|
||||
var prevMatchEmpty bool
|
||||
for i := 0; i <= len(s); {
|
||||
@ -340,6 +350,62 @@ func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error)
|
||||
return
|
||||
}
|
||||
|
||||
// singleStringReplacer is the implementation that's used when there is only
|
||||
// one string to replace (and that string has more than one byte).
|
||||
type singleStringReplacer struct {
|
||||
finder *stringFinder
|
||||
// value is the new string that replaces that pattern when it's found.
|
||||
value string
|
||||
}
|
||||
|
||||
func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
|
||||
return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
|
||||
}
|
||||
|
||||
func (r *singleStringReplacer) Replace(s string) string {
|
||||
var buf []byte
|
||||
i := 0
|
||||
for {
|
||||
match := r.finder.next(s[i:])
|
||||
if match == -1 {
|
||||
break
|
||||
}
|
||||
buf = append(buf, s[i:i+match]...)
|
||||
buf = append(buf, r.value...)
|
||||
i += match + len(r.finder.pattern)
|
||||
}
|
||||
if buf == nil {
|
||||
return s
|
||||
}
|
||||
buf = append(buf, s[i:]...)
|
||||
return string(buf)
|
||||
}
|
||||
|
||||
func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
|
||||
sw := getStringWriter(w)
|
||||
var i, wn int
|
||||
for {
|
||||
match := r.finder.next(s[i:])
|
||||
if match == -1 {
|
||||
break
|
||||
}
|
||||
wn, err = sw.WriteString(s[i : i+match])
|
||||
n += wn
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
wn, err = sw.WriteString(r.value)
|
||||
n += wn
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
i += match + len(r.finder.pattern)
|
||||
}
|
||||
wn, err = sw.WriteString(s[i:])
|
||||
n += wn
|
||||
return
|
||||
}
|
||||
|
||||
// byteReplacer is the implementation that's used when all the "old"
|
||||
// and "new" values are single ASCII bytes.
|
||||
type byteReplacer struct {
|
||||
|
@ -254,6 +254,17 @@ func TestReplacer(t *testing.T) {
|
||||
testCase{blankFoo, "", "X"},
|
||||
)
|
||||
|
||||
// single string replacer
|
||||
|
||||
abcMatcher := NewReplacer("abc", "[match]")
|
||||
|
||||
testCases = append(testCases,
|
||||
testCase{abcMatcher, "", ""},
|
||||
testCase{abcMatcher, "ab", "ab"},
|
||||
testCase{abcMatcher, "abcd", "[match]d"},
|
||||
testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"},
|
||||
)
|
||||
|
||||
// No-arg test cases.
|
||||
|
||||
nop := NewReplacer()
|
||||
@ -294,8 +305,9 @@ func TestPickAlgorithm(t *testing.T) {
|
||||
}{
|
||||
{capitalLetters, "*strings.byteReplacer"},
|
||||
{htmlEscaper, "*strings.byteStringReplacer"},
|
||||
{NewReplacer("12", "123"), "*strings.genericReplacer"},
|
||||
{NewReplacer("12", "123"), "*strings.singleStringReplacer"},
|
||||
{NewReplacer("1", "12"), "*strings.byteStringReplacer"},
|
||||
{NewReplacer("", "X"), "*strings.genericReplacer"},
|
||||
{NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"},
|
||||
}
|
||||
for i, tc := range testCases {
|
||||
@ -401,6 +413,27 @@ func BenchmarkGenericMatch2(b *testing.B) {
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkSingleString(b *testing.B, pattern, text string) {
|
||||
r := NewReplacer(pattern, "[match]")
|
||||
b.SetBytes(int64(len(text)))
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
r.Replace(text)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSingleMaxSkipping(b *testing.B) {
|
||||
benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000))
|
||||
}
|
||||
|
||||
func BenchmarkSingleLongSuffixFail(b *testing.B) {
|
||||
benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002))
|
||||
}
|
||||
|
||||
func BenchmarkSingleMatch(b *testing.B) {
|
||||
benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000))
|
||||
}
|
||||
|
||||
func BenchmarkByteByteNoMatch(b *testing.B) {
|
||||
str := Repeat("A", 100) + Repeat("B", 100)
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
124
src/pkg/strings/search.go
Normal file
124
src/pkg/strings/search.go
Normal file
@ -0,0 +1,124 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package strings
|
||||
|
||||
// stringFinder locates a fixed pattern inside a source text using the
// Boyer-Moore string search algorithm. References:
// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: that paper
// uses 1-based indexing)
type stringFinder struct {
	// pattern is the string being searched for.
	pattern string

	// badCharSkip[b] is the distance from the rightmost occurrence of byte b
	// in pattern to the last byte of pattern. For a byte that does not occur
	// in pattern the entry is len(pattern).
	//
	// When a mismatch is found against text byte b, the matching frame can
	// be moved forward by at least badCharSkip[b] before b could line up
	// with an equal byte of the pattern again.
	badCharSkip [256]int

	// goodSuffixSkip[i] is how far the matching frame can move when the
	// suffix pattern[i+1:] matched but pattern[i] did not. Two situations
	// are covered:
	//
	// 1. The matched suffix appears again earlier in pattern, preceded by a
	// different byte (one that might match). The frame is shifted so that
	// the earlier copy lines up with the matched text. Example: in
	// "mississi" the suffix "issi" next occurs (going right to left) at
	// index 1, so goodSuffixSkip[3] == shift+len(suffix) == 3+4 == 7.
	//
	// 2. The matched suffix does not appear again in pattern, but the end of
	// the suffix may coincide with a prefix of pattern. The frame is shifted
	// so that this prefix lines up with the end of the matched text.
	// Example: in "abcxxxabc", with the first mismatch (from the back) at
	// position 3, the matched suffix "xxabc" occurs nowhere else. Its
	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
	goodSuffixSkip []int
}
|
||||
|
||||
func makeStringFinder(pattern string) *stringFinder {
|
||||
f := &stringFinder{
|
||||
pattern: pattern,
|
||||
goodSuffixSkip: make([]int, len(pattern)),
|
||||
}
|
||||
// last is the index of the last character in the pattern.
|
||||
last := len(pattern) - 1
|
||||
|
||||
// Build bad character table.
|
||||
// Bytes not in the pattern can skip one pattern's length.
|
||||
for i := range f.badCharSkip {
|
||||
f.badCharSkip[i] = len(pattern)
|
||||
}
|
||||
// The loop condition is < instead of <= so that the last byte does not
|
||||
// have a zero distance to itself. Finding this byte out of place implies
|
||||
// that it is not in the last position.
|
||||
for i := 0; i < last; i++ {
|
||||
f.badCharSkip[pattern[i]] = last - i
|
||||
}
|
||||
|
||||
// Build good suffix table.
|
||||
// First pass: set each value to the next index which starts a prefix of
|
||||
// pattern.
|
||||
lastPrefix := last
|
||||
for i := last; i >= 0; i-- {
|
||||
if HasPrefix(pattern, pattern[i+1:]) {
|
||||
lastPrefix = i + 1
|
||||
}
|
||||
// lastPrefix is the shift, and (last-i) is len(suffix).
|
||||
f.goodSuffixSkip[i] = lastPrefix + last - i
|
||||
}
|
||||
// Second pass: find repeats of pattern's suffix starting from the front.
|
||||
for i := 0; i < last; i++ {
|
||||
lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
|
||||
if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
|
||||
// (last-i) is the shift, and lenSuffix is len(suffix).
|
||||
f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
|
||||
}
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// longestCommonSuffix returns the number of trailing bytes that a and b have
// in common.
func longestCommonSuffix(a, b string) (i int) {
	ai, bi := len(a)-1, len(b)-1
	for ai >= 0 && bi >= 0 && a[ai] == b[bi] {
		ai--
		bi--
		i++
	}
	return i
}
|
||||
|
||||
// next returns the index in text of the first occurrence of the pattern. If
|
||||
// the pattern is not found, it returns -1.
|
||||
func (f *stringFinder) next(text string) int {
|
||||
i := len(f.pattern) - 1
|
||||
for i < len(text) {
|
||||
// Compare backwards from the end until the first unmatching character.
|
||||
j := len(f.pattern) - 1
|
||||
for j >= 0 && text[i] == f.pattern[j] {
|
||||
i--
|
||||
j--
|
||||
}
|
||||
if j < 0 {
|
||||
return i + 1 // match
|
||||
}
|
||||
i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
90
src/pkg/strings/search_test.go
Normal file
90
src/pkg/strings/search_test.go
Normal file
@ -0,0 +1,90 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package strings_test
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
. "strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestFinderNext(t *testing.T) {
|
||||
testCases := []struct {
|
||||
pat, text string
|
||||
index int
|
||||
}{
|
||||
{"", "", 0},
|
||||
{"", "abc", 0},
|
||||
{"abc", "", -1},
|
||||
{"abc", "abc", 0},
|
||||
{"d", "abcdefg", 3},
|
||||
{"nan", "banana", 2},
|
||||
{"pan", "anpanman", 2},
|
||||
{"nnaaman", "anpanmanam", -1},
|
||||
{"abcd", "abc", -1},
|
||||
{"abcd", "bcd", -1},
|
||||
{"bcd", "abcd", 1},
|
||||
{"abc", "acca", -1},
|
||||
{"aa", "aaa", 0},
|
||||
{"baa", "aaaaa", -1},
|
||||
{"at that", "which finally halts. at that point", 22},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
got := StringFind(tc.pat, tc.text)
|
||||
want := tc.index
|
||||
if got != want {
|
||||
t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFinderCreation(t *testing.T) {
|
||||
testCases := []struct {
|
||||
pattern string
|
||||
bad [256]int
|
||||
suf []int
|
||||
}{
|
||||
{
|
||||
"abc",
|
||||
[256]int{'a': 2, 'b': 1, 'c': 3},
|
||||
[]int{5, 4, 1},
|
||||
},
|
||||
{
|
||||
"mississi",
|
||||
[256]int{'i': 3, 'm': 7, 's': 1},
|
||||
[]int{15, 14, 13, 7, 11, 10, 7, 1},
|
||||
},
|
||||
// From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
|
||||
{
|
||||
"abcxxxabc",
|
||||
[256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3},
|
||||
[]int{14, 13, 12, 11, 10, 9, 11, 10, 1},
|
||||
},
|
||||
{
|
||||
"abyxcdeyx",
|
||||
[256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5},
|
||||
[]int{17, 16, 15, 14, 13, 12, 7, 10, 1},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
bad, good := DumpTables(tc.pattern)
|
||||
|
||||
for i, got := range bad {
|
||||
want := tc.bad[i]
|
||||
if want == 0 {
|
||||
want = len(tc.pattern)
|
||||
}
|
||||
if got != want {
|
||||
t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want)
|
||||
}
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(good, tc.suf) {
|
||||
t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user