From aae02a1855b162bdd0a1cce7707f0bc8054834d1 Mon Sep 17 00:00:00 2001 From: Kyle Consalus Date: Wed, 2 Jun 2010 23:04:44 -0700 Subject: [PATCH] Optimization to regexp _CharClass: keep track of overall range of charclass to avoid unnecessarily iterating over ranges. Also, use the fact that IntVector is an []int to avoid method calls. On my machine, this brings us from ~27500 ns/op to ~17500 ns/op in the benchmark I've added (it is also faster in the case where a range check doesn't help, added a benchmark for this too.) I'd also like to propose that "[]", and "[^]" be disallowed. They aren't useful as far as I can tell, they aren't widely supported, and they make reasoning about character classes a bit more complicated. R=r CC=golang-dev https://golang.org/cl/1495041 --- src/pkg/regexp/all_test.go | 36 ++++++++++++++++++++++++++++++++---- src/pkg/regexp/regexp.go | 22 ++++++++++++++++------ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 62dad3aa01..fd7ee2acb6 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -519,7 +519,7 @@ var numSubexpCases = []numSubexpCase{ func TestNumSubexp(t *testing.T) { for _, c := range numSubexpCases { - re, _ := Compile(c.input) + re := MustCompile(c.input) n := re.NumSubexp() if n != c.expected { t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) @@ -530,7 +530,7 @@ func TestNumSubexp(t *testing.T) { func BenchmarkLiteral(b *testing.B) { x := strings.Repeat("x", 50) b.StopTimer() - re, _ := Compile(x) + re := MustCompile(x) b.StartTimer() for i := 0; i < b.N; i++ { if !re.MatchString(x) { @@ -543,7 +543,35 @@ func BenchmarkLiteral(b *testing.B) { func BenchmarkNotLiteral(b *testing.B) { x := strings.Repeat("x", 49) b.StopTimer() - re, _ := Compile("^" + x) + re := MustCompile("^" + x) + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass(b *testing.B) { + b.StopTimer() + x := strings.Repeat("xxxx", 20) + "w" + re := MustCompile("[abcdw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass_InRange(b *testing.B) { + b.StopTimer() + // 'b' is betwen 'a' and 'c', so the charclass + // range checking is no help here. + x := strings.Repeat("bbbb", 20) + "c" + re := MustCompile("[ac]") b.StartTimer() for i := 0; i < b.N; i++ { if !re.MatchString(x) { @@ -556,7 +584,7 @@ func BenchmarkNotLiteral(b *testing.B) { func BenchmarkReplaceAll(b *testing.B) { x := "abcdefghijklmnopqrstuvwxyz" b.StopTimer() - re, _ := Compile("[cjrw]") + re := MustCompile("[cjrw]") b.StartTimer() for i := 0; i < b.N; i++ { re.ReplaceAllString(x, "") diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index cdd5cacdda..edf91531de 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -152,10 +152,10 @@ func newChar(char int) *_Char { type _CharClass struct { common - char int negate bool // is character class negated? ([^a-z]) // vector of int, stored pairwise: [a-z] is (a,z); x is (x,x): - ranges *vector.IntVector + ranges *vector.IntVector + cmin, cmax int } func (cclass *_CharClass) kind() int { return _CHARCLASS } @@ -180,13 +180,21 @@ func (cclass *_CharClass) addRange(a, b int) { // range is a through b inclusive cclass.ranges.Push(a) cclass.ranges.Push(b) + if a < cclass.cmin { + cclass.cmin = a + } + if b > cclass.cmax { + cclass.cmax = b + } } func (cclass *_CharClass) matches(c int) bool { - for i := 0; i < cclass.ranges.Len(); i = i + 2 { - min := cclass.ranges.At(i) - max := cclass.ranges.At(i + 1) - if min <= c && c <= max { + if c < cclass.cmin || c > cclass.cmax { + return cclass.negate + } + ranges := []int(*cclass.ranges) + for i := 0; i < len(ranges); i = i + 2 { + if ranges[i] <= c && c <= ranges[i+1] { return !cclass.negate } } @@ -196,6 +204,8 @@ func (cclass *_CharClass) matches(c int) bool { func newCharClass() *_CharClass { c := new(_CharClass) c.ranges = new(vector.IntVector) + c.cmin = 0x10FFFF + 1 // MaxRune + 1 + c.cmax = -1 return c }