mirror of
https://github.com/golang/go
synced 2024-11-25 03:07:56 -07:00
move evaluation of null-matching instructions one iteration earlier.
performance hit of about 20% but more intuitive results for submatches. we need a good regexp package at some point. Fixes #110. R=rsc CC=golang-dev https://golang.org/cl/152131
This commit is contained in:
parent
bad9738be6
commit
0263103a05
@ -66,6 +66,8 @@ var matches = []tester{
|
||||
tester{`b`, "abc", vec{1, 2}},
|
||||
tester{`.`, "a", vec{0, 1}},
|
||||
tester{`.*`, "abcdef", vec{0, 6}},
|
||||
tester{`^`, "abcde", vec{0, 0}},
|
||||
tester{`$`, "abcde", vec{5, 5}},
|
||||
tester{`^abcd$`, "abcd", vec{0, 4}},
|
||||
tester{`^bcd'`, "abcdef", vec{}},
|
||||
tester{`^abcd$`, "abcde", vec{}},
|
||||
@ -86,6 +88,7 @@ var matches = []tester{
|
||||
tester{`((a|b|c)*(d))`, "abcd", vec{0, 4, 0, 4, 2, 3, 3, 4}},
|
||||
tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}},
|
||||
tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}},
|
||||
tester{`(.*).*`, "ab", vec{0, 2, 0, 2}},
|
||||
}
|
||||
|
||||
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
|
||||
@ -182,12 +185,12 @@ func matchTest(t *testing.T, expr string, str string, match []int) {
|
||||
}
|
||||
m := re.MatchString(str);
|
||||
if m != (len(match) > 0) {
|
||||
t.Errorf("MatchString failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
|
||||
t.Errorf("MatchString failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
|
||||
}
|
||||
// now try bytes
|
||||
m = re.Match(strings.Bytes(str));
|
||||
if m != (len(match) > 0) {
|
||||
t.Errorf("Match failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
|
||||
t.Errorf("Match failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
|
||||
}
|
||||
}
|
||||
|
||||
@ -377,14 +380,7 @@ var matchCases = []matchCase{
|
||||
}
|
||||
|
||||
func printStringSlice(t *testing.T, s []string) {
|
||||
l := len(s);
|
||||
if l == 0 {
|
||||
t.Log("\t<empty>")
|
||||
} else {
|
||||
for i := 0; i < l; i++ {
|
||||
t.Logf("\t%q", s[i])
|
||||
}
|
||||
}
|
||||
t.Logf("%#v", s)
|
||||
}
|
||||
|
||||
func TestAllMatches(t *testing.T) {
|
||||
|
@ -639,15 +639,37 @@ type state struct {
|
||||
|
||||
// Append new state to to-do list. Leftmost-longest wins so avoid
|
||||
// adding a state that's already active.
|
||||
func addState(s []state, inst instr, match []int) []state {
|
||||
func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
|
||||
switch inst.kind() {
|
||||
case _BOT:
|
||||
if pos == 0 {
|
||||
s = re.addState(s, inst.next(), match, pos, end)
|
||||
}
|
||||
return s;
|
||||
case _EOT:
|
||||
if pos == end {
|
||||
s = re.addState(s, inst.next(), match, pos, end)
|
||||
}
|
||||
return s;
|
||||
case _BRA:
|
||||
n := inst.(*_Bra).n;
|
||||
match[2*n] = pos;
|
||||
s = re.addState(s, inst.next(), match, pos, end);
|
||||
return s;
|
||||
case _EBRA:
|
||||
n := inst.(*_Ebra).n;
|
||||
match[2*n+1] = pos;
|
||||
s = re.addState(s, inst.next(), match, pos, end);
|
||||
return s;
|
||||
}
|
||||
index := inst.index();
|
||||
l := len(s);
|
||||
pos := match[0];
|
||||
begin := match[0];
|
||||
// TODO: Once the state is a vector and we can do insert, have inputs always
|
||||
// go in order correctly and this "earlier" test is never necessary,
|
||||
for i := 0; i < l; i++ {
|
||||
if s[i].inst.index() == index && // same instruction
|
||||
s[i].match[0] < pos { // earlier match already going; lefmost wins
|
||||
s[i].match[0] <= begin { // earlier match already going; lefmost wins
|
||||
return s
|
||||
}
|
||||
}
|
||||
@ -661,6 +683,19 @@ func addState(s []state, inst instr, match []int) []state {
|
||||
s = s[0 : l+1];
|
||||
s[l].inst = inst;
|
||||
s[l].match = match;
|
||||
if inst.kind() == _ALT {
|
||||
s1 := make([]int, 2*(re.nbra+1));
|
||||
for i := 0; i < len(s1); i++ {
|
||||
s1[i] = match[i]
|
||||
}
|
||||
s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
|
||||
// give other branch a copy of this match vector
|
||||
s1 = make([]int, 2*(re.nbra+1));
|
||||
for i := 0; i < len(s1); i++ {
|
||||
s1[i] = match[i]
|
||||
}
|
||||
s = re.addState(s, inst.next(), s1, pos, end);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -685,11 +720,11 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
|
||||
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
|
||||
}
|
||||
match[0] = pos;
|
||||
s[out] = addState(s[out], re.start.next(), match);
|
||||
s[out] = re.addState(s[out], re.start.next(), match, pos, end);
|
||||
}
|
||||
in, out = out, in; // old out state is new in state
|
||||
s[out] = s[out][0:0]; // clear out state
|
||||
if len(s[in]) == 0 {
|
||||
if found && len(s[in]) == 0 {
|
||||
// machine has completed
|
||||
break
|
||||
}
|
||||
@ -702,56 +737,38 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
|
||||
c, charwidth = utf8.DecodeRune(bytes[pos:end])
|
||||
}
|
||||
}
|
||||
pos += charwidth;
|
||||
for i := 0; i < len(s[in]); i++ {
|
||||
st := s[in][i];
|
||||
switch s[in][i].inst.kind() {
|
||||
case _BOT:
|
||||
if pos == 0 {
|
||||
s[in] = addState(s[in], st.inst.next(), st.match)
|
||||
}
|
||||
case _EOT:
|
||||
if pos == end {
|
||||
s[in] = addState(s[in], st.inst.next(), st.match)
|
||||
}
|
||||
case _CHAR:
|
||||
if c == st.inst.(*_Char).char {
|
||||
s[out] = addState(s[out], st.inst.next(), st.match)
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _CHARCLASS:
|
||||
if st.inst.(*_CharClass).matches(c) {
|
||||
s[out] = addState(s[out], st.inst.next(), st.match)
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _ANY:
|
||||
if c != endOfFile {
|
||||
s[out] = addState(s[out], st.inst.next(), st.match)
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _NOTNL:
|
||||
if c != endOfFile && c != '\n' {
|
||||
s[out] = addState(s[out], st.inst.next(), st.match)
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _BRA:
|
||||
n := st.inst.(*_Bra).n;
|
||||
st.match[2*n] = pos;
|
||||
s[in] = addState(s[in], st.inst.next(), st.match);
|
||||
case _EBRA:
|
||||
n := st.inst.(*_Ebra).n;
|
||||
st.match[2*n+1] = pos;
|
||||
s[in] = addState(s[in], st.inst.next(), st.match);
|
||||
case _ALT:
|
||||
s[in] = addState(s[in], st.inst.(*_Alt).left, st.match);
|
||||
// give other branch a copy of this match vector
|
||||
s1 := make([]int, 2*(re.nbra+1));
|
||||
for i := 0; i < len(s1); i++ {
|
||||
s1[i] = st.match[i]
|
||||
}
|
||||
s[in] = addState(s[in], st.inst.next(), s1);
|
||||
case _END:
|
||||
// choose leftmost longest
|
||||
if !found || // first
|
||||
st.match[0] < final.match[0] || // leftmost
|
||||
(st.match[0] == final.match[0] && pos > final.match[1]) { // longest
|
||||
(st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
|
||||
final = st;
|
||||
final.match[1] = pos;
|
||||
final.match[1] = pos - charwidth;
|
||||
}
|
||||
found = true;
|
||||
default:
|
||||
@ -759,7 +776,6 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
|
||||
panic("unknown instruction in execute");
|
||||
}
|
||||
}
|
||||
pos += charwidth;
|
||||
}
|
||||
return final.match;
|
||||
}
|
||||
|
@ -398,3 +398,11 @@ nbody -n 50000000
|
||||
gccgo -O2 nbody.go 121.03u 0.00s 121.04r
|
||||
gc nbody 30.26u 0.00s 30.27r # -65% ***
|
||||
gc_B nbody 30.20u 0.02s 30.22r # -72% ***
|
||||
|
||||
Nov 13 2009
|
||||
|
||||
# fix bug in regexp; take performance hit. good regexps will come in time.
|
||||
regex-dna 100000
|
||||
gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.94r
|
||||
gc regex-dna 29.78u 0.03s 29.83r
|
||||
gc_B regex-dna 32.63u 0.03s 32.74r
|
||||
|
Loading…
Reference in New Issue
Block a user