From 75df21ceff286419ae5eb1863e2e7936a5780247 Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Tue, 14 Oct 2008 16:32:43 -0700 Subject: [PATCH] implement matching clean up interface equality hack still needs more tests; checking in for gccgo testing R=rsc DELTA=304 (261 added, 14 deleted, 29 changed) OCL=17128 CL=17135 --- usr/r/regexp/main.go | 131 +++++++++++++++++++++++++-- usr/r/regexp/regexp.go | 196 ++++++++++++++++++++++++++++++++++------- 2 files changed, 287 insertions(+), 40 deletions(-) diff --git a/usr/r/regexp/main.go b/usr/r/regexp/main.go index d723d87a9a1..25ec07ade2f 100644 --- a/usr/r/regexp/main.go +++ b/usr/r/regexp/main.go @@ -9,14 +9,131 @@ import ( "regexp"; ) -func main() { - str := "a*b*c*"; - if sys.argc() > 1 { - str = sys.argv(1); +var good_re = []string{ + `` +, `.` +, `^.$` +, `a` +, `a*` +, `a+` +, `a?` +, `a|b` +, `a*|b*` +, `(a*|b)(c*|d)` +, `[a-z]` +, `[a-abc-c\-\]\[]` +, `[a-z]+` +, `[]` +, `[abc]` +, `[^1234]` +} + +// TODO: nice to do this with a map but we don't have an iterator +type StringError struct { + re string; + err *os.Error; +} +var bad_re = []StringError{ + StringError{ `*`, regexp.ErrBareClosure }, + StringError{ `(abc`, regexp.ErrUnmatchedLpar }, + StringError{ `abc)`, regexp.ErrUnmatchedRpar }, + StringError{ `x[a-z`, regexp.ErrUnmatchedLbkt }, + StringError{ `abc]`, regexp.ErrUnmatchedRbkt }, + StringError{ `[z-a]`, regexp.ErrBadRange }, + StringError{ `abc\`, regexp.ErrExtraneousBackslash }, + StringError{ `a**`, regexp.ErrBadClosure }, + StringError{ `a*+`, regexp.ErrBadClosure }, + StringError{ `a??`, regexp.ErrBadClosure }, + StringError{ `*`, regexp.ErrBareClosure }, + StringError{ `\x`, regexp.ErrBadBackslash } +} + +type Vec [20]int; + +type Tester struct { + re string; + text string; + match Vec; +} + +var matches = []Tester { + Tester{ ``, "", Vec{0,0, -1,-1} }, + Tester{ `a`, "a", Vec{0,1, -1,-1} }, + Tester{ `b`, "abc", Vec{1,2, -1,-1} }, + Tester{ `.`, "a", Vec{0,1, -1,-1} }, + Tester{ `.*`, "abcdef", Vec{0,6, -1,-1} }, + Tester{ `^abcd$`, "abcd", Vec{0,4, -1,-1} }, + Tester{ `^bcd'`, "abcdef", Vec{-1,-1} }, + Tester{ `^abcd$`, "abcde", Vec{-1,-1} }, + Tester{ `a+`, "baaab", Vec{1, 4, -1,-1} }, + Tester{ `a*`, "baaab", Vec{0, 0, -1,-1} } +} + +func Compile(expr string, error *os.Error) regexp.Regexp { + re, err := regexp.Compile(expr); + if err != error { + print("compiling `", expr, "`; unexpected error: ", err.String(), "\n"); + sys.exit(1); } - re, err := regexp.Compile(str); - if err != nil { - print("error: ", err.String(), "\n"); + return re +} + +func MarkedLen(m *[] int) int { + if m == nil { + return 0 + } + var i int; + for i = 0; i < len(m) && m[i] >= 0; i = i+2 { + } + return i +} + +func PrintVec(m *[] int) { + l := MarkedLen(m); + for i := 0; i < l && m[i] >= 0; i = i+2 { + print(m[i], ",", m[i+1], " ") + } +} + +func Equal(m1, m2 *[]int) bool { + l := MarkedLen(m1); + if l != MarkedLen(m2) { + return false + } + for i := 0; i < l; i++ { + if m1[i] != m2[i] { + return false + } + } + return true +} + +func Match(expr string, str string, match *[]int) { + re := Compile(expr, nil); + m := re.Execute(str); + if !Equal(m, match) { + print("failure on `", expr, "` matching `", str, "`:\n"); + PrintVec(m); + print("\nshould be:\n"); + PrintVec(match); + print("\n"); sys.exit(1); } } + +func main() { + if sys.argc() > 1 { + Compile(sys.argv(1), nil); + sys.exit(0); + } + for i := 0; i < len(good_re); i++ { + Compile(good_re[i], nil); + } + for i := 0; i < len(bad_re); i++ { + Compile(bad_re[i].re, bad_re[i].err) + } + for i := 0; i < len(matches); i++ { + t := &matches[i]; + Match(t.re, t.text, &t.match) + } +} diff --git a/usr/r/regexp/regexp.go b/usr/r/regexp/regexp.go index c491b262adb..0a6fd311382 100644 --- a/usr/r/regexp/regexp.go +++ b/usr/r/regexp/regexp.go @@ -11,6 +11,8 @@ import ( "vector"; ) +export var debug = false; + export var ErrUnimplemented = os.NewError("unimplemented"); export var ErrInternal = os.NewError("internal error"); @@ -20,7 +22,6 @@ export var ErrUnmatchedLbkt = os.NewError("unmatched '['"); export var ErrUnmatchedRbkt = os.NewError("unmatched ']'"); export var ErrBadRange = os.NewError("bad range in character class"); export var ErrExtraneousBackslash = os.NewError("extraneous backslash"); -export var ErrEmpty = os.NewError("empty subexpression or alternation"); export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)"); export var ErrBareClosure = os.NewError("closure applies to nothing"); export var ErrBadBackslash = os.NewError("illegal backslash escape"); @@ -41,10 +42,11 @@ type RE struct { error *os.Error; // compile- or run-time error; nil if OK inst *vector.Vector; start Inst; + nbra int; // number of brackets in expression, for subexpressions } const ( - START // beginning of program: indexer to start + START // beginning of program = iota; END; // end of program: success BOT; // '^' beginning of text @@ -113,8 +115,8 @@ func (eot *Eot) Print() { print("eot") } // --- CHAR a regular character type Char struct { next Inst; - char int; index int; + char int; } func (char *Char) Type() int { return CHAR } @@ -143,7 +145,7 @@ type CharClass struct { ranges *vector.Vector; } -func (cclass *CharClass) Type() int { return CHAR } +func (cclass *CharClass) Type() int { return CHARCLASS } func (cclass *CharClass) Next() Inst { return cclass.next } func (cclass *CharClass) SetNext(i Inst) { cclass.next = i } func (cclass *CharClass) Index() int { return cclass.index } @@ -170,6 +172,17 @@ func (cclass *CharClass) AddRange(a, b CClassChar) { cclass.ranges.Append(b); } +func (cclass *CharClass) Matches(c int) bool { + for i := 0; i < cclass.ranges.Len(); i = i+2 { + min := cclass.ranges.At(i).(CClassChar); + max := cclass.ranges.At(i+1).(CClassChar); + if min <= c && c <= max { + return !cclass.negate + } + } + return cclass.negate +} + func NewCharClass() *CharClass { c := new(CharClass); c.ranges = vector.New(); @@ -210,7 +223,7 @@ type Ebra struct { n int; // subexpression number } -func (ebra *Ebra) Type() int { return BRA } +func (ebra *Ebra) Type() int { return EBRA } func (ebra *Ebra) Next() Inst { return ebra.next } func (ebra *Ebra) SetNext(i Inst) { ebra.next = i } func (ebra *Ebra) Index() int { return ebra.index } @@ -259,7 +272,6 @@ func (re *RE) Add(i Inst) Inst { type Parser struct { re *RE; - nbra int; // number of brackets in expression, for subexpressions nlpar int; // number of unclosed lpars pos int; ch int; @@ -314,16 +326,6 @@ func (p *Parser) Regexp() (start, end Inst) var NULL Inst type BUGinter interface{} -// same as i == NULL. TODO: remove when 6g lets me do i == NULL -func isNULL(i Inst) bool { - return sys.BUG_intereq(i.(BUGinter), NULL.(BUGinter)) -} - -// same as i == j. TODO: remove when 6g lets me do i == j -func isEQ(i,j Inst) bool { - return sys.BUG_intereq(i.(BUGinter), j.(BUGinter)) -} - func special(c int) bool { s := `\.+*?()|[]`; for i := 0; i < len(s); i++ { @@ -437,15 +439,15 @@ func (p *Parser) Term() (start, end Inst) { } p.nlpar--; p.nextc(); - p.nbra++; bra := new(Bra); p.re.Add(bra); ebra := new(Ebra); p.re.Add(ebra); - bra.n = p.nbra; - ebra.n = p.nbra; - if isNULL(start) { - if !isNULL(end) { p.re.Error(ErrInternal) } + p.re.nbra++; // increment first so first subexpr is \1 + bra.n = p.re.nbra; + ebra.n = p.re.nbra; + if start == NULL { + if end == NULL { p.re.Error(ErrInternal) } start = ebra } else { end.SetNext(ebra); @@ -476,7 +478,7 @@ func (p *Parser) Term() (start, end Inst) { func (p *Parser) Closure() (start, end Inst) { start, end = p.Term(); - if isNULL(start) { + if start == NULL { return start, end } switch p.c() { @@ -521,13 +523,13 @@ func (p *Parser) Concatenation() (start, end Inst) { for { nstart, nend := p.Closure(); switch { - case isNULL(nstart): // end of this concatenation - if isNULL(start) { // this is the empty string + case nstart == NULL: // end of this concatenation + if start == NULL { // this is the empty string nop := p.re.Add(new(Nop)); return nop, nop; } return start, end; - case isNULL(start): // this is first element of concatenation + case start == NULL: // this is first element of concatenation start, end = nstart, nend; default: end.SetNext(nstart); @@ -602,17 +604,20 @@ func (re *RE) DoParse() { re.start = start; e.SetNext(re.Add(new(End))); - re.Dump(); - println(); + if debug { + re.Dump(); + println(); + } re.EliminateNops(); - re.Dump(); - println(); - - re.Error(ErrUnimplemented); + if debug { + re.Dump(); + println(); + } } + func Compiler(str string, ch *chan *RE) { re := new(RE); re.expr = str; @@ -624,13 +629,138 @@ func Compiler(str string, ch *chan *RE) { // Public interface has only execute functionality (not yet implemented) export type Regexp interface { - // Execute() bool + Execute(s string) *[]int } -// compile in separate goroutine; wait for result +// Compile in separate goroutine; wait for result export func Compile(str string) (regexp Regexp, error *os.Error) { ch := new(chan *RE); go Compiler(str, ch); re := <-ch; return re, re.error } + +type State struct { + inst Inst; // next instruction to execute + match *[]int; // pairs of bracketing submatches. 0th is start,end +} + +// Append new state to to-do list. Leftmost-longest wins so avoid +// adding a state that's already active. +func AddState(s *[]State, inst Inst, match *[]int) *[]State { + index := inst.Index(); + l := len(s); + pos := match[0]; + // TODO: Once the state is a vector and we can do insert, have inputs always + // go in order correctly and this "earlier" test is never necessary, + for i := 0; i < l; i++ { + if s[i].inst.Index() == index && // same instruction + s[i].match[0] < pos { // earlier match already going; lefmost wins + return s + } + } + if l == cap(s) { + s1 := new([]State, 2*l)[0:l]; + for i := 0; i < l; i++ { + s1[i] = s[i]; + } + s = s1; + } + s = s[0:l+1]; + s[l].inst = inst; + s[l].match = match; + return s; +} + +func (re *RE) DoExecute(str string, pos int) *[]int { + var s [2]*[]State; // TODO: use a vector when State values (not ptrs) can be vector elements + s[0] = new([]State, 10)[0:0]; + s[1] = new([]State, 10)[0:0]; + in, out := 0, 1; + var final State; + found := false; + for pos <= len(str) { + if !found { + // prime the pump if we haven't seen a match yet + match := new([]int, 2*(re.nbra+1)); + match[0] = pos; + s[out] = AddState(s[out], re.start.Next(), match); + } + in, out = out, in; // old out state is new in state + s[out] = s[out][0:0]; // clear out state + if len(s[in]) == 0 { + // machine has completed + break; + } + c := EOF; + if pos < len(str) { + c = int(str[pos]) + } +//println("position ", pos, "char", string(c), "in", in, "out", out, "len in", len(s[in])); + for i := 0; i < len(s[in]); i++ { + state := s[in][i]; +//state.inst.Print(); print("\n"); + switch s[in][i].inst.Type() { + case BOT: + if pos == 0 { + s[in] = AddState(s[in], state.inst.Next(), state.match) + } + case EOT: + if pos == len(str) { + s[in] = AddState(s[in], state.inst.Next(), state.match) + } + case CHAR: + if c == state.inst.(*Char).char { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case CHARCLASS: + if state.inst.(*CharClass).Matches(c) { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case ANY: + if c != EOF { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case BRA: + n := state.inst.(*Bra).n; + state.match[2*n] = pos; + s[in] = AddState(s[in], state.inst.Next(), state.match); + case EBRA: + n := state.inst.(*Ebra).n; + state.match[2*n+1] = pos; + s[in] = AddState(s[in], state.inst.Next(), state.match); + case ALT: + s[in] = AddState(s[in], state.inst.(*Alt).left, state.match); + // give other branch a copy of this match vector + s1 := new([]int, 2*(re.nbra+1)); + for i := 0; i < len(s1); i++ { + s1[i] = state.match[i] + } + s[in] = AddState(s[in], state.inst.Next(), s1); + case END: + // choose leftmost longest + if !found || // first + state.match[0] < final.match[0] || // leftmost + (state.match[0] == final.match[0] && pos > final.match[1]) { // longest + final = state; + final.match[1] = pos; + } + found = true; + default: + state.inst.Print(); + panic("unknown instruction in execute"); + } + } + pos++; + } + if !found { + return nil + } +//if found { println("found: from ", final.match[0], "to", final.match[1] )} + return final.match; +} + + +func (re *RE) Execute(s string) *[]int { + return re.DoExecute(s, 0) +}