1
0
mirror of https://github.com/golang/go synced 2024-11-24 21:20:05 -07:00

regexp: fix performance bug, make anchored searches fail fast.

The bug was that for an anchored pattern such as ^x, the prefix
scan ignored the anchor, and could scan the whole file if there was
no x present.  The fix is to do prefix matching after the anchor;
the cost miniscule; the speedups huge.

R=rsc, gri
CC=golang-dev
https://golang.org/cl/3837042
This commit is contained in:
Rob Pike 2011-01-03 11:31:51 -08:00
parent 84fc1e20f1
commit c0d0d4ef05
2 changed files with 74 additions and 7 deletions

View File

@ -377,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) {
re.ReplaceAllString(x, "")
}
}
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^zbc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
}
re := MustCompile("^zbc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredShortMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^.bc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredLongMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
}
re := MustCompile("^.bc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}

View File

@ -571,15 +571,20 @@ func (re *Regexp) doParse() {
}
}
// Extract regular text from the beginning of the pattern.
// Extract regular text from the beginning of the pattern,
// possibly after a leading iBOT.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
var b []byte
var utf = make([]byte, utf8.UTFMax)
var inst *instr
// First instruction is start; skip that.
// First instruction is start; skip that. Also skip any initial iBOT.
inst = re.inst[0].next
for inst.kind == iBOT {
inst = inst.next
}
Loop:
for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
for ; inst.kind != iEnd; inst = inst.next {
// stop if this is not a char
if inst.kind != iChar {
break
@ -748,14 +753,30 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
if bytestr != nil {
end = len(bytestr)
}
anchored := re.inst[0].next.kind == iBOT
if anchored && pos > 0 {
return nil
}
// fast check for initial plain substring
prefixed := false // has this iteration begun by skipping a prefix?
if re.prefix != "" {
var advance int
if bytestr == nil {
advance = strings.Index(str[pos:], re.prefix)
advance := 0
if anchored {
if bytestr == nil {
if !strings.HasPrefix(str, re.prefix) {
return nil
}
} else {
if !bytes.HasPrefix(bytestr, re.prefixBytes) {
return nil
}
}
} else {
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
if bytestr == nil {
advance = strings.Index(str[pos:], re.prefix)
} else {
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
}
}
if advance == -1 {
return nil