From 631a0e71c1eb1e85c3e745153a6575a82189ef3e Mon Sep 17 00:00:00 2001 From: Eric Roshan-Eisner Date: Fri, 28 Sep 2012 12:34:18 +1000 Subject: [PATCH] strings: implement a faster single-string Replacer The string searching is implemented separately so other functions may make use of it in the future. benchmark old ns/op new ns/op delta BenchmarkSingleMaxSkipping 125889 2474 -98.03% BenchmarkSingleLongSuffixFail 16252 1996 -87.72% BenchmarkSingleMatch 260793 136266 -47.75% benchmark old MB/s new MB/s speedup BenchmarkSingleMaxSkipping 79.43 4041.57 50.88x BenchmarkSingleLongSuffixFail 61.65 501.81 8.14x BenchmarkSingleMatch 57.52 110.08 1.91x R=nigeltao CC=golang-dev https://golang.org/cl/6545049 --- src/pkg/strings/export_test.go | 9 +++ src/pkg/strings/replace.go | 80 +++++++++++++++++++-- src/pkg/strings/replace_test.go | 35 ++++++++- src/pkg/strings/search.go | 124 ++++++++++++++++++++++++++++++++ src/pkg/strings/search_test.go | 90 +++++++++++++++++++++++ 5 files changed, 330 insertions(+), 8 deletions(-) create mode 100644 src/pkg/strings/search.go create mode 100644 src/pkg/strings/search_test.go diff --git a/src/pkg/strings/export_test.go b/src/pkg/strings/export_test.go index 3a8c8514065..17c806aa563 100644 --- a/src/pkg/strings/export_test.go +++ b/src/pkg/strings/export_test.go @@ -34,3 +34,12 @@ func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) { } return } + +func StringFind(pattern, text string) int { + return makeStringFinder(pattern).next(text) +} + +func DumpTables(pattern string) ([]int, []int) { + finder := makeStringFinder(pattern) + return finder.badCharSkip[:], finder.goodSuffixSkip +} diff --git a/src/pkg/strings/replace.go b/src/pkg/strings/replace.go index d863eeb5c63..f63b1792c56 100644 --- a/src/pkg/strings/replace.go +++ b/src/pkg/strings/replace.go @@ -33,6 +33,10 @@ func NewReplacer(oldnew ...string) *Replacer { panic("strings.NewReplacer: odd argument count") } + if len(oldnew) == 2 && len(oldnew[0]) > 1 { + return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])} + } + allNewBytes := true for i := 0; i < len(oldnew); i += 2 { if len(oldnew[i]) != 1 { @@ -288,6 +292,10 @@ func (w *appendSliceWriter) WriteString(s string) (int, error) { return len(s), nil } +type stringWriterIface interface { + WriteString(string) (int, error) +} + type stringWriter struct { w io.Writer } @@ -296,6 +304,14 @@ func (w stringWriter) WriteString(s string) (int, error) { return w.w.Write([]byte(s)) } +func getStringWriter(w io.Writer) stringWriterIface { + sw, ok := w.(stringWriterIface) + if !ok { + sw = stringWriter{w} + } + return sw +} + func (r *genericReplacer) Replace(s string) string { buf := make(appendSliceWriter, 0, len(s)) r.WriteString(&buf, s) @@ -303,13 +319,7 @@ func (r *genericReplacer) Replace(s string) string { } func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { - sw, ok := w.(interface { - WriteString(string) (int, error) - }) - if !ok { - sw = stringWriter{w} - } - + sw := getStringWriter(w) var last, wn int var prevMatchEmpty bool for i := 0; i <= len(s); { @@ -340,6 +350,62 @@ func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) return } +// singleStringReplacer is the implementation that's used when there is only +// one string to replace (and that string has more than one byte). +type singleStringReplacer struct { + finder *stringFinder + // value is the new string that replaces that pattern when it's found. + value string +} + +func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer { + return &singleStringReplacer{finder: makeStringFinder(pattern), value: value} +} + +func (r *singleStringReplacer) Replace(s string) string { + var buf []byte + i := 0 + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + buf = append(buf, s[i:i+match]...) + buf = append(buf, r.value...) + i += match + len(r.finder.pattern) + } + if buf == nil { + return s + } + buf = append(buf, s[i:]...) + return string(buf) +} + +func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var i, wn int + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + wn, err = sw.WriteString(s[i : i+match]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(r.value) + n += wn + if err != nil { + return + } + i += match + len(r.finder.pattern) + } + wn, err = sw.WriteString(s[i:]) + n += wn + return +} + // byteReplacer is the implementation that's used when all the "old" // and "new" values are single ASCII bytes. type byteReplacer struct { diff --git a/src/pkg/strings/replace_test.go b/src/pkg/strings/replace_test.go index edc990c18b8..d33dea95b09 100644 --- a/src/pkg/strings/replace_test.go +++ b/src/pkg/strings/replace_test.go @@ -254,6 +254,17 @@ func TestReplacer(t *testing.T) { testCase{blankFoo, "", "X"}, ) + // single string replacer + + abcMatcher := NewReplacer("abc", "[match]") + + testCases = append(testCases, + testCase{abcMatcher, "", ""}, + testCase{abcMatcher, "ab", "ab"}, + testCase{abcMatcher, "abcd", "[match]d"}, + testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"}, + ) + // No-arg test cases. nop := NewReplacer() @@ -294,8 +305,9 @@ func TestPickAlgorithm(t *testing.T) { }{ {capitalLetters, "*strings.byteReplacer"}, {htmlEscaper, "*strings.byteStringReplacer"}, - {NewReplacer("12", "123"), "*strings.genericReplacer"}, + {NewReplacer("12", "123"), "*strings.singleStringReplacer"}, {NewReplacer("1", "12"), "*strings.byteStringReplacer"}, + {NewReplacer("", "X"), "*strings.genericReplacer"}, {NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"}, } for i, tc := range testCases { @@ -401,6 +413,27 @@ func BenchmarkGenericMatch2(b *testing.B) { } } +func benchmarkSingleString(b *testing.B, pattern, text string) { + r := NewReplacer(pattern, "[match]") + b.SetBytes(int64(len(text))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.Replace(text) + } +} + +func BenchmarkSingleMaxSkipping(b *testing.B) { + benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000)) +} + +func BenchmarkSingleLongSuffixFail(b *testing.B) { + benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002)) +} + +func BenchmarkSingleMatch(b *testing.B) { + benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000)) +} + func BenchmarkByteByteNoMatch(b *testing.B) { str := Repeat("A", 100) + Repeat("B", 100) for i := 0; i < b.N; i++ { diff --git a/src/pkg/strings/search.go b/src/pkg/strings/search.go new file mode 100644 index 00000000000..f77c879c572 --- /dev/null +++ b/src/pkg/strings/search.go @@ -0,0 +1,124 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// stringFinder efficiently finds strings in a source text. It's implemented +// using the Boyer-Moore string search algorithm: +// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm +// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged +// document uses 1-based indexing) +type stringFinder struct { + // pattern is the string that we are searching for in the text. + pattern string + + // badCharSkip[b] contains the distance between the last byte of pattern + // and the rightmost occurrence of b in pattern. If b is not in pattern, + // badCharSkip[b] is len(pattern). + // + // Whenever a mismatch is found with byte b in the text, we can safely + // shift the matching frame at least badCharSkip[b] until the next time + // the matching char could be in alignment. + badCharSkip [256]int + + // goodSuffixSkip[i] defines how far we can shift the matching frame given + // that the suffix pattern[i+1:] matches, but the byte pattern[i] does + // not. There are two cases to consider: + // + // 1. The matched suffix occurs elsewhere in pattern (with a different + // byte preceding it that we might possibly match). In this case, we can + // shift the matching frame to align with the next suffix chunk. For + // example, the pattern "mississi" has the suffix "issi" next occurring + // (in right-to-left order) at index 1, so goodSuffixSkip[3] == + // shift+len(suffix) == 3+4 == 7. + // + // 2. If the matched suffix does not occur elsewhere in pattern, then the + // matching frame may share part of its prefix with the end of the + // matching suffix. In this case, goodSuffixSkip[i] will contain how far + // to shift the frame to align this portion of the prefix to the + // suffix. For example, in the pattern "abcxxxabc", when the first + // mismatch from the back is found to be in position 3, the matching + // suffix "xxabc" is not found elsewhere in the pattern. However, its + // rightmost "abc" (at position 6) is a prefix of the whole pattern, so + // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. + goodSuffixSkip []int +} + +func makeStringFinder(pattern string) *stringFinder { + f := &stringFinder{ + pattern: pattern, + goodSuffixSkip: make([]int, len(pattern)), + } + // last is the index of the last character in the pattern. + last := len(pattern) - 1 + + // Build bad character table. + // Bytes not in the pattern can skip one pattern's length. + for i := range f.badCharSkip { + f.badCharSkip[i] = len(pattern) + } + // The loop condition is < instead of <= so that the last byte does not + // have a zero distance to itself. Finding this byte out of place implies + // that it is not in the last position. + for i := 0; i < last; i++ { + f.badCharSkip[pattern[i]] = last - i + } + + // Build good suffix table. + // First pass: set each value to the next index which starts a prefix of + // pattern. + lastPrefix := last + for i := last; i >= 0; i-- { + if HasPrefix(pattern, pattern[i+1:]) { + lastPrefix = i + 1 + } + // lastPrefix is the shift, and (last-i) is len(suffix). + f.goodSuffixSkip[i] = lastPrefix + last - i + } + // Second pass: find repeats of pattern's suffix starting from the front. + for i := 0; i < last; i++ { + lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) + if pattern[i-lenSuffix] != pattern[last-lenSuffix] { + // (last-i) is the shift, and lenSuffix is len(suffix). + f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i + } + } + + return f +} + +func longestCommonSuffix(a, b string) (i int) { + for ; i < len(a) && i < len(b); i++ { + if a[len(a)-1-i] != b[len(b)-1-i] { + break + } + } + return +} + +// next returns the index in text of the first occurrence of the pattern. If +// the pattern is not found, it returns -1. +func (f *stringFinder) next(text string) int { + i := len(f.pattern) - 1 + for i < len(text) { + // Compare backwards from the end until the first unmatching character. + j := len(f.pattern) - 1 + for j >= 0 && text[i] == f.pattern[j] { + i-- + j-- + } + if j < 0 { + return i + 1 // match + } + i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) + } + return -1 +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/src/pkg/strings/search_test.go b/src/pkg/strings/search_test.go new file mode 100644 index 00000000000..966c05e65ad --- /dev/null +++ b/src/pkg/strings/search_test.go @@ -0,0 +1,90 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "reflect" + . "strings" + "testing" +) + +func TestFinderNext(t *testing.T) { + testCases := []struct { + pat, text string + index int + }{ + {"", "", 0}, + {"", "abc", 0}, + {"abc", "", -1}, + {"abc", "abc", 0}, + {"d", "abcdefg", 3}, + {"nan", "banana", 2}, + {"pan", "anpanman", 2}, + {"nnaaman", "anpanmanam", -1}, + {"abcd", "abc", -1}, + {"abcd", "bcd", -1}, + {"bcd", "abcd", 1}, + {"abc", "acca", -1}, + {"aa", "aaa", 0}, + {"baa", "aaaaa", -1}, + {"at that", "which finally halts. at that point", 22}, + } + + for _, tc := range testCases { + got := StringFind(tc.pat, tc.text) + want := tc.index + if got != want { + t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want) + } + } +} + +func TestFinderCreation(t *testing.T) { + testCases := []struct { + pattern string + bad [256]int + suf []int + }{ + { + "abc", + [256]int{'a': 2, 'b': 1, 'c': 3}, + []int{5, 4, 1}, + }, + { + "mississi", + [256]int{'i': 3, 'm': 7, 's': 1}, + []int{15, 14, 13, 7, 11, 10, 7, 1}, + }, + // From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf + { + "abcxxxabc", + [256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3}, + []int{14, 13, 12, 11, 10, 9, 11, 10, 1}, + }, + { + "abyxcdeyx", + [256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5}, + []int{17, 16, 15, 14, 13, 12, 7, 10, 1}, + }, + } + + for _, tc := range testCases { + bad, good := DumpTables(tc.pattern) + + for i, got := range bad { + want := tc.bad[i] + if want == 0 { + want = len(tc.pattern) + } + if got != want { + t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want) + } + } + + if !reflect.DeepEqual(good, tc.suf) { + t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf) + } + } +}