mirror of
https://github.com/golang/go
synced 2024-11-22 01:04:40 -07:00
add a match arena to regexp to avoid generating garbage.
simple regexps run 20x faster. the regex-dna benchmark goes 3x faster. R=rsc CC=golang-dev https://golang.org/cl/156108
This commit is contained in:
parent
b30f753dc3
commit
c62069cc1f
@ -60,6 +60,7 @@ type tester struct {
|
||||
}
|
||||
|
||||
var matches = []tester{
|
||||
tester{`^abcdefg`, "abcdefg", vec{0, 7}},
|
||||
tester{`a+`, "baaab", vec{1, 4}},
|
||||
tester{"abcd..", "abcdef", vec{0, 6}},
|
||||
tester{``, "", vec{0, 0}},
|
||||
@ -450,3 +451,29 @@ func TestAllMatches(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkLiteral(b *testing.B) {
|
||||
x := strings.Repeat("x", 50);
|
||||
b.StopTimer();
|
||||
re, _ := Compile(x);
|
||||
b.StartTimer();
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNotLiteral(b *testing.B) {
|
||||
x := strings.Repeat("x", 49);
|
||||
b.StopTimer();
|
||||
re, _ := Compile("^" + x);
|
||||
b.StartTimer();
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error {
|
||||
return p.error;
|
||||
}
|
||||
|
||||
// return regular text at the beginning of str
|
||||
// Extract regular text from the beginning of the pattern.
|
||||
// That text can be used by doExecute to speed up matching.
|
||||
func (re *Regexp) setPrefix() {
|
||||
var b []byte;
|
||||
var utf = make([]byte, utf8.UTFMax);
|
||||
@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp {
|
||||
return regexp;
|
||||
}
|
||||
|
||||
// The match arena allows us to reduce the garbage generated by tossing
|
||||
// match vectors away as we execute. Matches are ref counted and returned
|
||||
// to a free list when no longer active. Increases a simple benchmark by 22X.
|
||||
type matchArena struct {
|
||||
head *matchVec;
|
||||
len int; // length of match vector
|
||||
}
|
||||
|
||||
type matchVec struct {
|
||||
m []int; // pairs of bracketing submatches. 0th is start,end
|
||||
ref int;
|
||||
next *matchVec;
|
||||
}
|
||||
|
||||
func (a *matchArena) new() *matchVec {
|
||||
if a.head == nil {
|
||||
const N = 10;
|
||||
block := make([]matchVec, N);
|
||||
for i := 0; i < N; i++ {
|
||||
b := &block[i];
|
||||
b.next = a.head;
|
||||
a.head = b;
|
||||
}
|
||||
}
|
||||
m := a.head;
|
||||
a.head = m.next;
|
||||
m.ref = 0;
|
||||
if m.m == nil {
|
||||
m.m = make([]int, a.len)
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
func (a *matchArena) free(m *matchVec) {
|
||||
m.ref--;
|
||||
if m.ref == 0 {
|
||||
m.next = a.head;
|
||||
a.head = m;
|
||||
}
|
||||
}
|
||||
|
||||
func (a *matchArena) copy(m *matchVec) *matchVec {
|
||||
m1 := a.new();
|
||||
copy(m1.m, m.m);
|
||||
return m1;
|
||||
}
|
||||
|
||||
func (a *matchArena) noMatch() *matchVec {
|
||||
m := a.new();
|
||||
for i := range m.m {
|
||||
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
|
||||
}
|
||||
m.ref = 1;
|
||||
return m;
|
||||
}
|
||||
|
||||
type state struct {
|
||||
inst instr; // next instruction to execute
|
||||
match []int; // pairs of bracketing submatches. 0th is start,end
|
||||
match *matchVec;
|
||||
}
|
||||
|
||||
// Append new state to to-do list. Leftmost-longest wins so avoid
|
||||
// adding a state that's already active.
|
||||
func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
|
||||
// adding a state that's already active. The matchVec will be inc-ref'ed
|
||||
// if it is assigned to a state.
|
||||
func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state {
|
||||
switch inst.kind() {
|
||||
case _BOT:
|
||||
if pos == 0 {
|
||||
s = re.addState(s, inst.next(), match, pos, end)
|
||||
s = a.addState(s, inst.next(), match, pos, end)
|
||||
}
|
||||
return s;
|
||||
case _EOT:
|
||||
if pos == end {
|
||||
s = re.addState(s, inst.next(), match, pos, end)
|
||||
s = a.addState(s, inst.next(), match, pos, end)
|
||||
}
|
||||
return s;
|
||||
case _BRA:
|
||||
n := inst.(*_Bra).n;
|
||||
match[2*n] = pos;
|
||||
s = re.addState(s, inst.next(), match, pos, end);
|
||||
match.m[2*n] = pos;
|
||||
s = a.addState(s, inst.next(), match, pos, end);
|
||||
return s;
|
||||
case _EBRA:
|
||||
n := inst.(*_Ebra).n;
|
||||
match[2*n+1] = pos;
|
||||
s = re.addState(s, inst.next(), match, pos, end);
|
||||
match.m[2*n+1] = pos;
|
||||
s = a.addState(s, inst.next(), match, pos, end);
|
||||
return s;
|
||||
}
|
||||
index := inst.index();
|
||||
l := len(s);
|
||||
begin := match[0];
|
||||
// TODO: Once the state is a vector and we can do insert, have inputs always
|
||||
// go in order correctly and this "earlier" test is never necessary,
|
||||
begin := match.m[0];
|
||||
// TODO: If the state were a vector and we could do insert, have inputs always
|
||||
// go in order correctly and this "earlier" test is not necessary,
|
||||
for i := 0; i < l; i++ {
|
||||
if s[i].inst.index() == index && // same instruction
|
||||
s[i].match[0] <= begin { // earlier match already going; lefmost wins
|
||||
s[i].match.m[0] <= begin { // earlier match already going; lefmost wins
|
||||
return s
|
||||
}
|
||||
}
|
||||
@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
|
||||
s = s[0 : l+1];
|
||||
s[l].inst = inst;
|
||||
s[l].match = match;
|
||||
match.ref++;
|
||||
if inst.kind() == _ALT {
|
||||
s1 := make([]int, 2*(re.nbra+1));
|
||||
copy(s1, match);
|
||||
s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
|
||||
s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end);
|
||||
// give other branch a copy of this match vector
|
||||
s1 = make([]int, 2*(re.nbra+1));
|
||||
copy(s1, match);
|
||||
s = re.addState(s, inst.next(), s1, pos, end);
|
||||
s = a.addState(s, inst.next(), a.copy(match), pos, end);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
func noMatch(nbra int) []int {
|
||||
match := make([]int, 2*(nbra+1));
|
||||
for i := range match {
|
||||
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
|
||||
}
|
||||
return match;
|
||||
}
|
||||
|
||||
// Accepts either string or bytes - the logic is identical either way.
|
||||
// If bytes == nil, scan str.
|
||||
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
|
||||
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
|
||||
var s [2][]state;
|
||||
s[0] = make([]state, 10)[0:0];
|
||||
s[1] = make([]state, 10)[0:0];
|
||||
in, out := 0, 1;
|
||||
@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
|
||||
}
|
||||
pos += advance + len(re.prefix);
|
||||
}
|
||||
arena := &matchArena{nil, 2 * (re.nbra + 1)};
|
||||
for pos <= end {
|
||||
if !found {
|
||||
// prime the pump if we haven't seen a match yet
|
||||
match := noMatch(re.nbra);
|
||||
match[0] = pos;
|
||||
s[out] = re.addState(s[out], re.start.next(), match, pos, end);
|
||||
match := arena.noMatch();
|
||||
match.m[0] = pos;
|
||||
s[out] = arena.addState(s[out], re.start.next(), match, pos, end);
|
||||
arena.free(match); // if addState saved it, ref was incremented
|
||||
}
|
||||
in, out = out, in; // old out state is new in state
|
||||
s[out] = s[out][0:0]; // clear out state
|
||||
// clear out old state
|
||||
old := s[out];
|
||||
for _, state := range old {
|
||||
arena.free(state.match)
|
||||
}
|
||||
s[out] = old[0:0]; // truncate state vector
|
||||
if found && len(s[in]) == 0 {
|
||||
// machine has completed
|
||||
break
|
||||
@ -791,26 +845,25 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
|
||||
}
|
||||
}
|
||||
pos += charwidth;
|
||||
for i := 0; i < len(s[in]); i++ {
|
||||
st := s[in][i];
|
||||
switch s[in][i].inst.kind() {
|
||||
for _, st := range s[in] {
|
||||
switch st.inst.kind() {
|
||||
case _BOT:
|
||||
case _EOT:
|
||||
case _CHAR:
|
||||
if c == st.inst.(*_Char).char {
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _CHARCLASS:
|
||||
if st.inst.(*_CharClass).matches(c) {
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _ANY:
|
||||
if c != endOfFile {
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _NOTNL:
|
||||
if c != endOfFile && c != '\n' {
|
||||
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
|
||||
}
|
||||
case _BRA:
|
||||
case _EBRA:
|
||||
@ -818,10 +871,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
|
||||
case _END:
|
||||
// choose leftmost longest
|
||||
if !found || // first
|
||||
st.match[0] < final.match[0] || // leftmost
|
||||
(st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
|
||||
st.match.m[0] < final.match.m[0] || // leftmost
|
||||
(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
|
||||
if final.match != nil {
|
||||
arena.free(final.match)
|
||||
}
|
||||
final = st;
|
||||
final.match[1] = pos - charwidth;
|
||||
final.match.ref++;
|
||||
final.match.m[1] = pos - charwidth;
|
||||
}
|
||||
found = true;
|
||||
default:
|
||||
@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
|
||||
}
|
||||
}
|
||||
}
|
||||
// if match found, back up start of match by width of prefix.
|
||||
if re.prefix != "" && len(final.match) > 0 {
|
||||
final.match[0] -= len(re.prefix)
|
||||
if final.match == nil {
|
||||
return nil
|
||||
}
|
||||
return final.match;
|
||||
// if match found, back up start of match by width of prefix.
|
||||
if re.prefix != "" && len(final.match.m) > 0 {
|
||||
final.match.m[0] -= len(re.prefix)
|
||||
}
|
||||
return final.match.m;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user