1
0
mirror of https://github.com/golang/go synced 2024-11-25 05:47:57 -07:00

move evaluation of null-matching instructions one iteration earlier.

performance hit of about 20% but more intuitive results for submatches.
we need a good regexp package at some point.

Fixes #110.

R=rsc
CC=golang-dev
https://golang.org/cl/152131
This commit is contained in:
Rob Pike 2009-11-14 12:23:24 -08:00
parent bad9738be6
commit 0263103a05
3 changed files with 61 additions and 41 deletions

View File

@ -66,6 +66,8 @@ var matches = []tester{
tester{`b`, "abc", vec{1, 2}},
tester{`.`, "a", vec{0, 1}},
tester{`.*`, "abcdef", vec{0, 6}},
tester{`^`, "abcde", vec{0, 0}},
tester{`$`, "abcde", vec{5, 5}},
tester{`^abcd$`, "abcd", vec{0, 4}},
tester{`^bcd'`, "abcdef", vec{}},
tester{`^abcd$`, "abcde", vec{}},
@ -86,6 +88,7 @@ var matches = []tester{
tester{`((a|b|c)*(d))`, "abcd", vec{0, 4, 0, 4, 2, 3, 3, 4}},
tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}},
tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}},
tester{`(.*).*`, "ab", vec{0, 2, 0, 2}},
}
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
@ -182,12 +185,12 @@ func matchTest(t *testing.T, expr string, str string, match []int) {
}
m := re.MatchString(str);
if m != (len(match) > 0) {
t.Errorf("MatchString failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
t.Errorf("MatchString failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
}
// now try bytes
m = re.Match(strings.Bytes(str));
if m != (len(match) > 0) {
t.Errorf("Match failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
t.Errorf("Match failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
}
}
@ -377,14 +380,7 @@ var matchCases = []matchCase{
}
func printStringSlice(t *testing.T, s []string) {
l := len(s);
if l == 0 {
t.Log("\t<empty>")
} else {
for i := 0; i < l; i++ {
t.Logf("\t%q", s[i])
}
}
t.Logf("%#v", s)
}
func TestAllMatches(t *testing.T) {

View File

@ -639,15 +639,37 @@ type state struct {
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
func addState(s []state, inst instr, match []int) []state {
func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
switch inst.kind() {
case _BOT:
if pos == 0 {
s = re.addState(s, inst.next(), match, pos, end)
}
return s;
case _EOT:
if pos == end {
s = re.addState(s, inst.next(), match, pos, end)
}
return s;
case _BRA:
n := inst.(*_Bra).n;
match[2*n] = pos;
s = re.addState(s, inst.next(), match, pos, end);
return s;
case _EBRA:
n := inst.(*_Ebra).n;
match[2*n+1] = pos;
s = re.addState(s, inst.next(), match, pos, end);
return s;
}
index := inst.index();
l := len(s);
pos := match[0];
begin := match[0];
// TODO: Once the state is a vector and we can do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary,
for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction
s[i].match[0] < pos { // earlier match already going; lefmost wins
s[i].match[0] <= begin { // earlier match already going; lefmost wins
return s
}
}
@ -661,6 +683,19 @@ func addState(s []state, inst instr, match []int) []state {
s = s[0 : l+1];
s[l].inst = inst;
s[l].match = match;
if inst.kind() == _ALT {
s1 := make([]int, 2*(re.nbra+1));
for i := 0; i < len(s1); i++ {
s1[i] = match[i]
}
s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
// give other branch a copy of this match vector
s1 = make([]int, 2*(re.nbra+1));
for i := 0; i < len(s1); i++ {
s1[i] = match[i]
}
s = re.addState(s, inst.next(), s1, pos, end);
}
return s;
}
@ -685,11 +720,11 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos;
s[out] = addState(s[out], re.start.next(), match);
s[out] = re.addState(s[out], re.start.next(), match, pos, end);
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
if len(s[in]) == 0 {
if found && len(s[in]) == 0 {
// machine has completed
break
}
@ -702,56 +737,38 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
c, charwidth = utf8.DecodeRune(bytes[pos:end])
}
}
pos += charwidth;
for i := 0; i < len(s[in]); i++ {
st := s[in][i];
switch s[in][i].inst.kind() {
case _BOT:
if pos == 0 {
s[in] = addState(s[in], st.inst.next(), st.match)
}
case _EOT:
if pos == end {
s[in] = addState(s[in], st.inst.next(), st.match)
}
case _CHAR:
if c == st.inst.(*_Char).char {
s[out] = addState(s[out], st.inst.next(), st.match)
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) {
s[out] = addState(s[out], st.inst.next(), st.match)
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _ANY:
if c != endOfFile {
s[out] = addState(s[out], st.inst.next(), st.match)
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _NOTNL:
if c != endOfFile && c != '\n' {
s[out] = addState(s[out], st.inst.next(), st.match)
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _BRA:
n := st.inst.(*_Bra).n;
st.match[2*n] = pos;
s[in] = addState(s[in], st.inst.next(), st.match);
case _EBRA:
n := st.inst.(*_Ebra).n;
st.match[2*n+1] = pos;
s[in] = addState(s[in], st.inst.next(), st.match);
case _ALT:
s[in] = addState(s[in], st.inst.(*_Alt).left, st.match);
// give other branch a copy of this match vector
s1 := make([]int, 2*(re.nbra+1));
for i := 0; i < len(s1); i++ {
s1[i] = st.match[i]
}
s[in] = addState(s[in], st.inst.next(), s1);
case _END:
// choose leftmost longest
if !found || // first
st.match[0] < final.match[0] || // leftmost
(st.match[0] == final.match[0] && pos > final.match[1]) { // longest
(st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
final = st;
final.match[1] = pos;
final.match[1] = pos - charwidth;
}
found = true;
default:
@ -759,7 +776,6 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
panic("unknown instruction in execute");
}
}
pos += charwidth;
}
return final.match;
}

View File

@ -398,3 +398,11 @@ nbody -n 50000000
gccgo -O2 nbody.go 121.03u 0.00s 121.04r
gc nbody 30.26u 0.00s 30.27r # -65% ***
gc_B nbody 30.20u 0.02s 30.22r # -72% ***
Nov 13 2009
# fix bug in regexp; take performance hit. good regexps will come in time.
regex-dna 100000
gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.94r
gc regex-dna 29.78u 0.03s 29.83r
gc_B regex-dna 32.63u 0.03s 32.74r