From 0263103a052b7964aaa283c81550f639da436d52 Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Sat, 14 Nov 2009 12:23:24 -0800 Subject: [PATCH] move evaluation of null-matching instructions one iteration earlier. performance hit of about 20% but more intuitive results for submatches. we need a good regexp package at some point. Fixes #110. R=rsc CC=golang-dev https://golang.org/cl/152131 --- src/pkg/regexp/all_test.go | 16 +++----- src/pkg/regexp/regexp.go | 78 +++++++++++++++++++++++--------------- test/bench/timing.log | 8 ++++ 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 04453a9d5f5..a9f23d70c5c 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -66,6 +66,8 @@ var matches = []tester{ tester{`b`, "abc", vec{1, 2}}, tester{`.`, "a", vec{0, 1}}, tester{`.*`, "abcdef", vec{0, 6}}, + tester{`^`, "abcde", vec{0, 0}}, + tester{`$`, "abcde", vec{5, 5}}, tester{`^abcd$`, "abcd", vec{0, 4}}, tester{`^bcd'`, "abcdef", vec{}}, tester{`^abcd$`, "abcde", vec{}}, @@ -86,6 +88,7 @@ var matches = []tester{ tester{`((a|b|c)*(d))`, "abcd", vec{0, 4, 0, 4, 2, 3, 3, 4}}, tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}}, tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}}, + tester{`(.*).*`, "ab", vec{0, 2, 0, 2}}, } func compileTest(t *testing.T, expr string, error os.Error) *Regexp { @@ -182,12 +185,12 @@ func matchTest(t *testing.T, expr string, str string, match []int) { } m := re.MatchString(str); if m != (len(match) > 0) { - t.Errorf("MatchString failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0) + t.Errorf("MatchString failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0) } // now try bytes m = re.Match(strings.Bytes(str)); if m != (len(match) > 0) { - t.Errorf("Match failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0) + t.Errorf("Match failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0) } } @@ -377,14 +380,7 @@ var matchCases = []matchCase{ } func printStringSlice(t *testing.T, s []string) { - l := len(s); - if l == 0 { - t.Log("\t") - } else { - for i := 0; i < l; i++ { - t.Logf("\t%q", s[i]) - } - } + t.Logf("%#v", s) } func TestAllMatches(t *testing.T) { diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index ab9465b7df0..89300be96cb 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -639,15 +639,37 @@ type state struct { // Append new state to to-do list. Leftmost-longest wins so avoid // adding a state that's already active. -func addState(s []state, inst instr, match []int) []state { +func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state { + switch inst.kind() { + case _BOT: + if pos == 0 { + s = re.addState(s, inst.next(), match, pos, end) + } + return s; + case _EOT: + if pos == end { + s = re.addState(s, inst.next(), match, pos, end) + } + return s; + case _BRA: + n := inst.(*_Bra).n; + match[2*n] = pos; + s = re.addState(s, inst.next(), match, pos, end); + return s; + case _EBRA: + n := inst.(*_Ebra).n; + match[2*n+1] = pos; + s = re.addState(s, inst.next(), match, pos, end); + return s; + } index := inst.index(); l := len(s); - pos := match[0]; + begin := match[0]; // TODO: Once the state is a vector and we can do insert, have inputs always // go in order correctly and this "earlier" test is never necessary, for i := 0; i < l; i++ { if s[i].inst.index() == index && // same instruction - s[i].match[0] < pos { // earlier match already going; lefmost wins + s[i].match[0] <= begin { // earlier match already going; lefmost wins return s } } @@ -661,6 +683,19 @@ func addState(s []state, inst instr, match []int) []state { s = s[0 : l+1]; s[l].inst = inst; s[l].match = match; + if inst.kind() == _ALT { + s1 := make([]int, 2*(re.nbra+1)); + for i := 0; i < len(s1); i++ { + s1[i] = match[i] + } + s = re.addState(s, inst.(*_Alt).left, s1, pos, end); + // give other branch a copy of this match vector + s1 = make([]int, 2*(re.nbra+1)); + for i := 0; i < len(s1); i++ { + s1[i] = match[i] + } + s = re.addState(s, inst.next(), s1, pos, end); + } return s; } @@ -685,11 +720,11 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac" } match[0] = pos; - s[out] = addState(s[out], re.start.next(), match); + s[out] = re.addState(s[out], re.start.next(), match, pos, end); } in, out = out, in; // old out state is new in state s[out] = s[out][0:0]; // clear out state - if len(s[in]) == 0 { + if found && len(s[in]) == 0 { // machine has completed break } @@ -702,56 +737,38 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { c, charwidth = utf8.DecodeRune(bytes[pos:end]) } } + pos += charwidth; for i := 0; i < len(s[in]); i++ { st := s[in][i]; switch s[in][i].inst.kind() { case _BOT: - if pos == 0 { - s[in] = addState(s[in], st.inst.next(), st.match) - } case _EOT: - if pos == end { - s[in] = addState(s[in], st.inst.next(), st.match) - } case _CHAR: if c == st.inst.(*_Char).char { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) } case _CHARCLASS: if st.inst.(*_CharClass).matches(c) { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) } case _ANY: if c != endOfFile { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) } case _NOTNL: if c != endOfFile && c != '\n' { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) } case _BRA: - n := st.inst.(*_Bra).n; - st.match[2*n] = pos; - s[in] = addState(s[in], st.inst.next(), st.match); case _EBRA: - n := st.inst.(*_Ebra).n; - st.match[2*n+1] = pos; - s[in] = addState(s[in], st.inst.next(), st.match); case _ALT: - s[in] = addState(s[in], st.inst.(*_Alt).left, st.match); - // give other branch a copy of this match vector - s1 := make([]int, 2*(re.nbra+1)); - for i := 0; i < len(s1); i++ { - s1[i] = st.match[i] - } - s[in] = addState(s[in], st.inst.next(), s1); case _END: // choose leftmost longest if !found || // first st.match[0] < final.match[0] || // leftmost - (st.match[0] == final.match[0] && pos > final.match[1]) { // longest + (st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest final = st; - final.match[1] = pos; + final.match[1] = pos - charwidth; } found = true; default: @@ -759,7 +776,6 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { panic("unknown instruction in execute"); } } - pos += charwidth; } return final.match; } diff --git a/test/bench/timing.log b/test/bench/timing.log index edd40e25079..99fbba55d0f 100644 --- a/test/bench/timing.log +++ b/test/bench/timing.log @@ -398,3 +398,11 @@ nbody -n 50000000 gccgo -O2 nbody.go 121.03u 0.00s 121.04r gc nbody 30.26u 0.00s 30.27r # -65% *** gc_B nbody 30.20u 0.02s 30.22r # -72% *** + +Nov 13 2009 + +# fix bug in regexp; take performance hit. good regexps will come in time. +regex-dna 100000 + gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.94r + gc regex-dna 29.78u 0.03s 29.83r + gc_B regex-dna 32.63u 0.03s 32.74r