regexp: fix performance bug, make anchored searches fail fast.

The bug was that for an anchored pattern such as ^x, the prefix scan ignored the anchor, and could scan the whole file if there was no x present. The fix is to do prefix matching after the anchor; the cost miniscule; the speedups huge. R=rsc, gri CC=golang-dev https://golang.org/cl/3837042
2024-11-24 21:20:05 -07:00 · 2011-01-03 11:31:51 -08:00 · 2011-01-03 11:31:51 -08:00 · c0d0d4ef05
commit c0d0d4ef05
parent 84fc1e20f1
2 changed files with 74 additions and 7 deletions
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@ -377,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) {
 		re.ReplaceAllString(x, "")
 	}
 }
+
+func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	re := MustCompile("^zbc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	for i := 0; i < 15; i++ {
+		x = append(x, x...)
+	}
+	re := MustCompile("^zbc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredShortMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	re := MustCompile("^.bc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredLongMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	for i := 0; i < 15; i++ {
+		x = append(x, x...)
+	}
+	re := MustCompile("^.bc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@ -571,15 +571,20 @@ func (re *Regexp) doParse() {
 	}
 }

-// Extract regular text from the beginning of the pattern.
+// Extract regular text from the beginning of the pattern,
+// possibly after a leading iBOT.
 // That text can be used by doExecute to speed up matching.
 func (re *Regexp) setPrefix() {
 	var b []byte
 	var utf = make([]byte, utf8.UTFMax)
 	var inst *instr
-	// First instruction is start; skip that.
+	// First instruction is start; skip that.  Also skip any initial iBOT.
+	inst = re.inst[0].next
+	for inst.kind == iBOT {
+		inst = inst.next
+	}
 Loop:
-	for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
+	for ; inst.kind != iEnd; inst = inst.next {
 		// stop if this is not a char
 		if inst.kind != iChar {
 			break
@ -748,14 +753,30 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
 	if bytestr != nil {
 		end = len(bytestr)
 	}
+	anchored := re.inst[0].next.kind == iBOT
+	if anchored && pos > 0 {
+		return nil
+	}
 	// fast check for initial plain substring
 	prefixed := false // has this iteration begun by skipping a prefix?
 	if re.prefix != "" {
-		var advance int
-		if bytestr == nil {
-			advance = strings.Index(str[pos:], re.prefix)
+		advance := 0
+		if anchored {
+			if bytestr == nil {
+				if !strings.HasPrefix(str, re.prefix) {
+					return nil
+				}
+			} else {
+				if !bytes.HasPrefix(bytestr, re.prefixBytes) {
+					return nil
+				}
+			}
 		} else {
-			advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+			if bytestr == nil {
+				advance = strings.Index(str[pos:], re.prefix)
+			} else {
+				advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+			}
 		}
 		if advance == -1 {
 			return nil