From 68e250516c472fb7bc16e896fe9ea0c49c0dc4a2 Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Tue, 11 Aug 2009 13:54:47 -0700 Subject: [PATCH] make a simpler regexp implementation with fewer dependencies and put it inside testing. remove "regexp." from regexp tests. R=rsc DELTA=1173 (1152 added, 1 deleted, 20 changed) OCL=33028 CL=33037 --- src/pkg/regexp/all_test.go | 28 +- src/pkg/testing/Makefile | 15 +- src/pkg/testing/regexp.go | 863 +++++++++++++++++++++++++++++++++ src/pkg/testing/regexp_test.go | 281 +++++++++++ src/pkg/testing/testing.go | 8 +- 5 files changed, 1174 insertions(+), 21 deletions(-) create mode 100644 src/pkg/testing/regexp.go create mode 100644 src/pkg/testing/regexp_test.go diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 72355e91b24..c985dad429a 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -37,18 +37,18 @@ type stringError struct { err os.Error; } var bad_re = []stringError{ - stringError{ `*`, regexp.ErrBareClosure }, - stringError{ `(abc`, regexp.ErrUnmatchedLpar }, - stringError{ `abc)`, regexp.ErrUnmatchedRpar }, - stringError{ `x[a-z`, regexp.ErrUnmatchedLbkt }, - stringError{ `abc]`, regexp.ErrUnmatchedRbkt }, - stringError{ `[z-a]`, regexp.ErrBadRange }, - stringError{ `abc\`, regexp.ErrExtraneousBackslash }, - stringError{ `a**`, regexp.ErrBadClosure }, - stringError{ `a*+`, regexp.ErrBadClosure }, - stringError{ `a??`, regexp.ErrBadClosure }, - stringError{ `*`, regexp.ErrBareClosure }, - stringError{ `\x`, regexp.ErrBadBackslash }, + stringError{ `*`, ErrBareClosure }, + stringError{ `(abc`, ErrUnmatchedLpar }, + stringError{ `abc)`, ErrUnmatchedRpar }, + stringError{ `x[a-z`, ErrUnmatchedLbkt }, + stringError{ `abc]`, ErrUnmatchedRbkt }, + stringError{ `[z-a]`, ErrBadRange }, + stringError{ `abc\`, ErrExtraneousBackslash }, + stringError{ `a**`, ErrBadClosure }, + stringError{ `a*+`, ErrBadClosure }, + stringError{ `a??`, ErrBadClosure }, + stringError{ `*`, ErrBareClosure }, + stringError{ `\x`, ErrBadBackslash }, } type vec []int; @@ -88,8 +88,8 @@ var matches = []tester { tester{ `a*(|(b))c*`, "aacc", vec{0,4, 2,2, -1,-1} }, } -func compileTest(t *testing.T, expr string, error os.Error) *regexp.Regexp { - re, err := regexp.Compile(expr); +func compileTest(t *testing.T, expr string, error os.Error) *Regexp { + re, err := Compile(expr); if err != error { t.Error("compiling `", expr, "`; unexpected error: ", err.String()); } diff --git a/src/pkg/testing/Makefile b/src/pkg/testing/Makefile index 493eba6f28d..eab8a4cf0ee 100644 --- a/src/pkg/testing/Makefile +++ b/src/pkg/testing/Makefile @@ -2,6 +2,7 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. + # DO NOT EDIT. Automatically generated by gobuild. # gobuild -m >Makefile @@ -20,7 +21,7 @@ test: packages coverage: packages gotest - 6cov -g `pwd` | grep -v '_test\.go:' + 6cov -g $$(pwd) | grep -v '_test\.go:' %.$O: %.go $(GC) -I_obj $*.go @@ -32,16 +33,23 @@ coverage: packages $(AS) $*.s O1=\ + regexp.$O\ + +O2=\ testing.$O\ -phases: a1 +phases: a1 a2 _obj$D/testing.a: phases a1: $(O1) - $(AR) grc _obj$D/testing.a testing.$O + $(AR) grc _obj$D/testing.a regexp.$O rm -f $(O1) +a2: $(O2) + $(AR) grc _obj$D/testing.a testing.$O + rm -f $(O2) + newpkg: clean mkdir -p _obj$D @@ -49,6 +57,7 @@ newpkg: clean $(O1): newpkg $(O2): a1 +$(O3): a2 nuke: clean rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/testing.a diff --git a/src/pkg/testing/regexp.go b/src/pkg/testing/regexp.go new file mode 100644 index 00000000000..ae698264f1f --- /dev/null +++ b/src/pkg/testing/regexp.go @@ -0,0 +1,863 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The testing package implements a simple regular expression library. +// It is a reduced version of the regular expression package suitable +// for use in tests; it avoids many dependencies. +// +// The syntax of the regular expressions accepted is: +// +// regexp: +// concatenation { '|' concatenation } +// concatenation: +// { closure } +// closure: +// term [ '*' | '+' | '?' ] +// term: +// '^' +// '$' +// '.' +// character +// '[' [ '^' ] character-ranges ']' +// '(' regexp ')' +// +package testing + +import ( + "runtime"; + "utf8"; +) + +var debug = false; + +// Error codes returned by failures to parse an expression. +var ( + ErrInternal = "internal error"; + ErrUnmatchedLpar = "unmatched ''"; + ErrUnmatchedRpar = "unmatched ''"; + ErrUnmatchedLbkt = "unmatched '['"; + ErrUnmatchedRbkt = "unmatched ']'"; + ErrBadRange = "bad range in character class"; + ErrExtraneousBackslash = "extraneous backslash"; + ErrBadClosure = "repeated closure **, ++, etc."; + ErrBareClosure = "closure applies to nothing"; + ErrBadBackslash = "illegal backslash escape"; +) + +// An instruction executed by the NFA +type instr interface { + kind() int; // the type of this instruction: _CHAR, _ANY, etc. + next() instr; // the instruction to execute after this one + setNext(i instr); + index() int; + setIndex(i int); + print(); +} + +// Fields and methods common to all instructions +type common struct { + _next instr; + _index int; +} + +func (c *common) next() instr { return c._next } +func (c *common) setNext(i instr) { c._next = i } +func (c *common) index() int { return c._index } +func (c *common) setIndex(i int) { c._index = i } + +// The representation of a compiled regular expression. +// The public interface is entirely through methods. +type Regexp struct { + expr string; // the original expression + ch chan<- *Regexp; // reply channel when we're done + error string; // compile- or run-time error; nil if OK + inst []instr; + start instr; + nbra int; // number of brackets in expression, for subexpressions +} + +const ( + _START // beginning of program + = iota; + _END; // end of program: success + _BOT; // '^' beginning of text + _EOT; // '$' end of text + _CHAR; // 'a' regular character + _CHARCLASS; // [a-z] character class + _ANY; // '.' any character including newline + _NOTNL; // [^\n] special case: any character but newline + _BRA; // '(' parenthesized expression + _EBRA; // ')'; end of '(' parenthesized expression + _ALT; // '|' alternation + _NOP; // do nothing; makes it easy to link without patching +) + +// --- START start of program +type _Start struct { + common +} + +func (start *_Start) kind() int { return _START } +func (start *_Start) print() { print("start") } + +// --- END end of program +type _End struct { + common +} + +func (end *_End) kind() int { return _END } +func (end *_End) print() { print("end") } + +// --- BOT beginning of text +type _Bot struct { + common +} + +func (bot *_Bot) kind() int { return _BOT } +func (bot *_Bot) print() { print("bot") } + +// --- EOT end of text +type _Eot struct { + common +} + +func (eot *_Eot) kind() int { return _EOT } +func (eot *_Eot) print() { print("eot") } + +// --- CHAR a regular character +type _Char struct { + common; + char int; +} + +func (char *_Char) kind() int { return _CHAR } +func (char *_Char) print() { print("char ", string(char.char)) } + +func newChar(char int) *_Char { + c := new(_Char); + c.char = char; + return c; +} + +// --- CHARCLASS [a-z] + +type _CharClass struct { + common; + char int; + negate bool; // is character class negated? ([^a-z]) + // stored pairwise: [a-z] is (a,z); x is (x,x): + ranges []int; +} + +func (cclass *_CharClass) kind() int { return _CHARCLASS } + +func (cclass *_CharClass) print() { + print("charclass"); + if cclass.negate { + print(" (negated)"); + } + for i := 0; i < len(cclass.ranges); i += 2 { + l := cclass.ranges[i]; + r := cclass.ranges[i+1]; + if l == r { + print(" [", string(l), "]"); + } else { + print(" [", string(l), "-", string(r), "]"); + } + } +} + +func (cclass *_CharClass) addRange(a, b int) { + // range is a through b inclusive + n := len(cclass.ranges); + if n >= cap(cclass.ranges) { + nr := make([]int, n, 2*n); + for i, j := range nr { + nr[i] = j + } + cclass.ranges = nr; + } + cclass.ranges = cclass.ranges[0:n+2]; + cclass.ranges[n] = a; + n++; + cclass.ranges[n] = b; + n++; +} + +func (cclass *_CharClass) matches(c int) bool { + for i := 0; i < len(cclass.ranges); i = i+2 { + min := cclass.ranges[i]; + max := cclass.ranges[i+1]; + if min <= c && c <= max { + return !cclass.negate + } + } + return cclass.negate +} + +func newCharClass() *_CharClass { + c := new(_CharClass); + c.ranges = make([]int, 0, 20); + return c; +} + +// --- ANY any character +type _Any struct { + common +} + +func (any *_Any) kind() int { return _ANY } +func (any *_Any) print() { print("any") } + +// --- NOTNL any character but newline +type _NotNl struct { + common +} + +func (notnl *_NotNl) kind() int { return _NOTNL } +func (notnl *_NotNl) print() { print("notnl") } + +// --- BRA parenthesized expression +type _Bra struct { + common; + n int; // subexpression number +} + +func (bra *_Bra) kind() int { return _BRA } +func (bra *_Bra) print() { print("bra", bra.n); } + +// --- EBRA end of parenthesized expression +type _Ebra struct { + common; + n int; // subexpression number +} + +func (ebra *_Ebra) kind() int { return _EBRA } +func (ebra *_Ebra) print() { print("ebra ", ebra.n); } + +// --- ALT alternation +type _Alt struct { + common; + left instr; // other branch +} + +func (alt *_Alt) kind() int { return _ALT } +func (alt *_Alt) print() { print("alt(", alt.left.index(), ")"); } + +// --- NOP no operation +type _Nop struct { + common +} + +func (nop *_Nop) kind() int { return _NOP } +func (nop *_Nop) print() { print("nop") } + +// report error and exit compiling/executing goroutine +func (re *Regexp) setError(err string) { + re.error = err; + re.ch <- re; + runtime.Goexit(); +} + +func (re *Regexp) add(i instr) instr { + n := len(re.inst); + i.setIndex(len(re.inst)); + if n >= cap(re.inst) { + ni := make([]instr, n, 2*n); + for i, j := range ni { + ni[i] = j + } + re.inst = ni; + } + re.inst = re.inst[0:n+1]; + re.inst[n] = i; + return i; +} + +type parser struct { + re *Regexp; + nlpar int; // number of unclosed lpars + pos int; + ch int; +} + +const endOfFile = -1 + +func (p *parser) c() int { + return p.ch; +} + +func (p *parser) nextc() int { + if p.pos >= len(p.re.expr) { + p.ch = endOfFile + } else { + c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:len(p.re.expr)]); + p.ch = c; + p.pos += w; + } + return p.ch; +} + +func newParser(re *Regexp) *parser { + p := new(parser); + p.re = re; + p.nextc(); // load p.ch + return p; +} + +func (p *parser) regexp() (start, end instr) + +var iNULL instr + +func special(c int) bool { + s := `\.+*?()|[]^$`; + for i := 0; i < len(s); i++ { + if c == int(s[i]) { + return true + } + } + return false +} + +func specialcclass(c int) bool { + s := `\-[]`; + for i := 0; i < len(s); i++ { + if c == int(s[i]) { + return true + } + } + return false +} + +func (p *parser) charClass() instr { + cc := newCharClass(); + if p.c() == '^' { + cc.negate = true; + p.nextc(); + } + left := -1; + for { + switch c := p.c(); c { + case ']', endOfFile: + if left >= 0 { + p.re.setError(ErrBadRange); + } + // Is it [^\n]? + if cc.negate && len(cc.ranges) == 2 && + cc.ranges[0] == '\n' && cc.ranges[1] == '\n' { + nl := new(_NotNl); + p.re.add(nl); + return nl; + } + p.re.add(cc); + return cc; + case '-': // do this before backslash processing + p.re.setError(ErrBadRange); + case '\\': + c = p.nextc(); + switch { + case c == endOfFile: + p.re.setError(ErrExtraneousBackslash); + case c == 'n': + c = '\n'; + case specialcclass(c): + // c is as delivered + default: + p.re.setError(ErrBadBackslash); + } + fallthrough; + default: + p.nextc(); + switch { + case left < 0: // first of pair + if p.c() == '-' { // range + p.nextc(); + left = c; + } else { // single char + cc.addRange(c, c); + } + case left <= c: // second of pair + cc.addRange(left, c); + left = -1; + default: + p.re.setError(ErrBadRange); + } + } + } + return iNULL +} + +func (p *parser) term() (start, end instr) { + switch c := p.c(); c { + case '|', endOfFile: + return iNULL, iNULL; + case '*', '+': + p.re.setError(ErrBareClosure); + case ')': + if p.nlpar == 0 { + p.re.setError(ErrUnmatchedRpar); + } + return iNULL, iNULL; + case ']': + p.re.setError(ErrUnmatchedRbkt); + case '^': + p.nextc(); + start = p.re.add(new(_Bot)); + return start, start; + case '$': + p.nextc(); + start = p.re.add(new(_Eot)); + return start, start; + case '.': + p.nextc(); + start = p.re.add(new(_Any)); + return start, start; + case '[': + p.nextc(); + start = p.charClass(); + if p.c() != ']' { + p.re.setError(ErrUnmatchedLbkt); + } + p.nextc(); + return start, start; + case '(': + p.nextc(); + p.nlpar++; + p.re.nbra++; // increment first so first subexpr is \1 + nbra := p.re.nbra; + start, end = p.regexp(); + if p.c() != ')' { + p.re.setError(ErrUnmatchedLpar); + } + p.nlpar--; + p.nextc(); + bra := new(_Bra); + p.re.add(bra); + ebra := new(_Ebra); + p.re.add(ebra); + bra.n = nbra; + ebra.n = nbra; + if start == iNULL { + if end == iNULL { + p.re.setError(ErrInternal) + } + start = ebra + } else { + end.setNext(ebra); + } + bra.setNext(start); + return bra, ebra; + case '\\': + c = p.nextc(); + switch { + case c == endOfFile: + p.re.setError(ErrExtraneousBackslash); + case c == 'n': + c = '\n'; + case special(c): + // c is as delivered + default: + p.re.setError(ErrBadBackslash); + } + fallthrough; + default: + p.nextc(); + start = newChar(c); + p.re.add(start); + return start, start + } + panic("unreachable"); +} + +func (p *parser) closure() (start, end instr) { + start, end = p.term(); + if start == iNULL { + return + } + switch p.c() { + case '*': + // (start,end)*: + alt := new(_Alt); + p.re.add(alt); + end.setNext(alt); // after end, do alt + alt.left = start; // alternate brach: return to start + start = alt; // alt becomes new (start, end) + end = alt; + case '+': + // (start,end)+: + alt := new(_Alt); + p.re.add(alt); + end.setNext(alt); // after end, do alt + alt.left = start; // alternate brach: return to start + end = alt; // start is unchanged; end is alt + case '?': + // (start,end)?: + alt := new(_Alt); + p.re.add(alt); + nop := new(_Nop); + p.re.add(nop); + alt.left = start; // alternate branch is start + alt.setNext(nop); // follow on to nop + end.setNext(nop); // after end, go to nop + start = alt; // start is now alt + end = nop; // end is nop pointed to by both branches + default: + return + } + switch p.nextc() { + case '*', '+', '?': + p.re.setError(ErrBadClosure); + } + return +} + +func (p *parser) concatenation() (start, end instr) { + start, end = iNULL, iNULL; + for { + nstart, nend := p.closure(); + switch { + case nstart == iNULL: // end of this concatenation + if start == iNULL { // this is the empty string + nop := p.re.add(new(_Nop)); + return nop, nop; + } + return; + case start == iNULL: // this is first element of concatenation + start, end = nstart, nend; + default: + end.setNext(nstart); + end = nend; + } + } + panic("unreachable"); +} + +func (p *parser) regexp() (start, end instr) { + start, end = p.concatenation(); + for { + switch p.c() { + default: + return; + case '|': + p.nextc(); + nstart, nend := p.concatenation(); + alt := new(_Alt); + p.re.add(alt); + alt.left = start; + alt.setNext(nstart); + nop := new(_Nop); + p.re.add(nop); + end.setNext(nop); + nend.setNext(nop); + start, end = alt, nop; + } + } + panic("unreachable"); +} + +func unNop(i instr) instr { + for i.kind() == _NOP { + i = i.next() + } + return i +} + +func (re *Regexp) eliminateNops() { + for i := 0; i < len(re.inst); i++ { + inst := re.inst[i]; + if inst.kind() == _END { + continue + } + inst.setNext(unNop(inst.next())); + if inst.kind() == _ALT { + alt := inst.(*_Alt); + alt.left = unNop(alt.left); + } + } +} + +func (re *Regexp) dump() { + for i := 0; i < len(re.inst); i++ { + inst := re.inst[i]; + print(inst.index(), ": "); + inst.print(); + if inst.kind() != _END { + print(" -> ", inst.next().index()) + } + print("\n"); + } +} + +func (re *Regexp) doParse() { + p := newParser(re); + start := new(_Start); + re.add(start); + s, e := p.regexp(); + start.setNext(s); + re.start = start; + e.setNext(re.add(new(_End))); + + if debug { + re.dump(); + println(); + } + + re.eliminateNops(); + + if debug { + re.dump(); + println(); + } +} + + +func compiler(str string, ch chan *Regexp) { + re := new(Regexp); + re.expr = str; + re.inst = make([]instr, 0, 20); + re.ch = ch; + re.doParse(); + ch <- re; +} + +// CompileRegexp parses a regular expression and returns, if successful, a Regexp +// object that can be used to match against text. +func CompileRegexp(str string) (regexp *Regexp, error string) { + // Compile in a separate goroutine and wait for the result. + ch := make(chan *Regexp); + go compiler(str, ch); + re := <-ch; + return re, re.error +} + +type state struct { + inst instr; // next instruction to execute + match []int; // pairs of bracketing submatches. 0th is start,end +} + +// Append new state to to-do list. Leftmost-longest wins so avoid +// adding a state that's already active. +func addState(s []state, inst instr, match []int) []state { + index := inst.index(); + l := len(s); + pos := match[0]; + // TODO: Once the state is a vector and we can do insert, have inputs always + // go in order correctly and this "earlier" test is never necessary, + for i := 0; i < l; i++ { + if s[i].inst.index() == index && // same instruction + s[i].match[0] < pos { // earlier match already going; lefmost wins + return s + } + } + if l == cap(s) { + s1 := make([]state, 2*l)[0:l]; + for i := 0; i < l; i++ { + s1[i] = s[i]; + } + s = s1; + } + s = s[0:l+1]; + s[l].inst = inst; + s[l].match = match; + return s; +} + +// Accepts either string or bytes - the logic is identical either way. +// If bytes == nil, scan str. +func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { + var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements + s[0] = make([]state, 10)[0:0]; + s[1] = make([]state, 10)[0:0]; + in, out := 0, 1; + var final state; + found := false; + end := len(str); + if bytes != nil { + end = len(bytes) + } + for pos <= end { + if !found { + // prime the pump if we haven't seen a match yet + match := make([]int, 2*(re.nbra+1)); + for i := 0; i < len(match); i++ { + match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac" + } + match[0] = pos; + s[out] = addState(s[out], re.start.next(), match); + } + in, out = out, in; // old out state is new in state + s[out] = s[out][0:0]; // clear out state + if len(s[in]) == 0 { + // machine has completed + break; + } + charwidth := 1; + c := endOfFile; + if pos < end { + if bytes == nil { + c, charwidth = utf8.DecodeRuneInString(str[pos:end]); + } else { + c, charwidth = utf8.DecodeRune(bytes[pos:end]); + } + } + for i := 0; i < len(s[in]); i++ { + st := s[in][i]; + switch s[in][i].inst.kind() { + case _BOT: + if pos == 0 { + s[in] = addState(s[in], st.inst.next(), st.match) + } + case _EOT: + if pos == end { + s[in] = addState(s[in], st.inst.next(), st.match) + } + case _CHAR: + if c == st.inst.(*_Char).char { + s[out] = addState(s[out], st.inst.next(), st.match) + } + case _CHARCLASS: + if st.inst.(*_CharClass).matches(c) { + s[out] = addState(s[out], st.inst.next(), st.match) + } + case _ANY: + if c != endOfFile { + s[out] = addState(s[out], st.inst.next(), st.match) + } + case _NOTNL: + if c != endOfFile && c != '\n' { + s[out] = addState(s[out], st.inst.next(), st.match) + } + case _BRA: + n := st.inst.(*_Bra).n; + st.match[2*n] = pos; + s[in] = addState(s[in], st.inst.next(), st.match); + case _EBRA: + n := st.inst.(*_Ebra).n; + st.match[2*n+1] = pos; + s[in] = addState(s[in], st.inst.next(), st.match); + case _ALT: + s[in] = addState(s[in], st.inst.(*_Alt).left, st.match); + // give other branch a copy of this match vector + s1 := make([]int, 2*(re.nbra+1)); + for i := 0; i < len(s1); i++ { + s1[i] = st.match[i] + } + s[in] = addState(s[in], st.inst.next(), s1); + case _END: + // choose leftmost longest + if !found || // first + st.match[0] < final.match[0] || // leftmost + (st.match[0] == final.match[0] && pos > final.match[1]) { // longest + final = st; + final.match[1] = pos; + } + found = true; + default: + st.inst.print(); + panic("unknown instruction in execute"); + } + } + pos += charwidth; + } + return final.match; +} + + +// ExecuteString matches the Regexp against the string s. +// The return value is an array of integers, in pairs, identifying the positions of +// substrings matched by the expression. +// s[a[0]:a[1]] is the substring matched by the entire expression. +// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression. +// A negative value means the subexpression did not match any element of the string. +// An empty array means "no match". +func (re *Regexp) ExecuteString(s string) (a []int) { + return re.doExecute(s, nil, 0) +} + + +// Execute matches the Regexp against the byte slice b. +// The return value is an array of integers, in pairs, identifying the positions of +// subslices matched by the expression. +// b[a[0]:a[1]] is the subslice matched by the entire expression. +// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression. +// A negative value means the subexpression did not match any element of the slice. +// An empty array means "no match". +func (re *Regexp) Execute(b []byte) (a []int) { + return re.doExecute("", b, 0) +} + + +// MatchString returns whether the Regexp matches the string s. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) MatchString(s string) bool { + return len(re.doExecute(s, nil, 0)) > 0 +} + + +// Match returns whether the Regexp matches the byte slice b. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) Match(b []byte) bool { + return len(re.doExecute("", b, 0)) > 0 +} + + +// MatchStrings matches the Regexp against the string s. +// The return value is an array of strings matched by the expression. +// a[0] is the substring matched by the entire expression. +// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression. +// An empty array means ``no match''. +func (re *Regexp) MatchStrings(s string) (a []string) { + r := re.doExecute(s, nil, 0); + if r == nil { + return nil + } + a = make([]string, len(r)/2); + for i := 0; i < len(r); i += 2 { + if r[i] != -1 { // -1 means no match for this subexpression + a[i/2] = s[r[i] : r[i+1]] + } + } + return +} + +// MatchSlices matches the Regexp against the byte slice b. +// The return value is an array of subslices matched by the expression. +// a[0] is the subslice matched by the entire expression. +// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression. +// An empty array means ``no match''. +func (re *Regexp) MatchSlices(b []byte) (a [][]byte) { + r := re.doExecute("", b, 0); + if r == nil { + return nil + } + a = make([][]byte, len(r)/2); + for i := 0; i < len(r); i += 2 { + if r[i] != -1 { // -1 means no match for this subexpression + a[i/2] = b[r[i] : r[i+1]] + } + } + return +} + +// MatchString checks whether a textual regular expression +// matches a string. More complicated queries need +// to use Compile and the full Regexp interface. +func MatchString(pattern string, s string) (matched bool, error string) { + re, err := CompileRegexp(pattern); + if err != "" { + return false, err + } + return re.MatchString(s), "" +} + +// Match checks whether a textual regular expression +// matches a byte slice. More complicated queries need +// to use Compile and the full Regexp interface. +func Match(pattern string, b []byte) (matched bool, error string) { + re, err := CompileRegexp(pattern); + if err != "" { + return false, err + } + return re.Match(b), "" +} diff --git a/src/pkg/testing/regexp_test.go b/src/pkg/testing/regexp_test.go new file mode 100644 index 00000000000..d72ca19d489 --- /dev/null +++ b/src/pkg/testing/regexp_test.go @@ -0,0 +1,281 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package testing + +import ( + "strings"; + "testing"; +) + +var good_re = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[]`, + `[abc]`, + `[^1234]`, + `[^\n]`, +} + +// TODO: nice to do this with a map +type stringError struct { + re string; + err string; +} +var bad_re = []stringError{ + stringError{ `*`, ErrBareClosure }, + stringError{ `(abc`, ErrUnmatchedLpar }, + stringError{ `abc)`, ErrUnmatchedRpar }, + stringError{ `x[a-z`, ErrUnmatchedLbkt }, + stringError{ `abc]`, ErrUnmatchedRbkt }, + stringError{ `[z-a]`, ErrBadRange }, + stringError{ `abc\`, ErrExtraneousBackslash }, + stringError{ `a**`, ErrBadClosure }, + stringError{ `a*+`, ErrBadClosure }, + stringError{ `a??`, ErrBadClosure }, + stringError{ `*`, ErrBareClosure }, + stringError{ `\x`, ErrBadBackslash }, +} + +type vec []int; + +type tester struct { + re string; + text string; + match vec; +} + +var matches = []tester { + tester{ ``, "", vec{0,0} }, + tester{ `a`, "a", vec{0,1} }, + tester{ `x`, "y", vec{} }, + tester{ `b`, "abc", vec{1,2} }, + tester{ `.`, "a", vec{0,1} }, + tester{ `.*`, "abcdef", vec{0,6} }, + tester{ `^abcd$`, "abcd", vec{0,4} }, + tester{ `^bcd'`, "abcdef", vec{} }, + tester{ `^abcd$`, "abcde", vec{} }, + tester{ `a+`, "baaab", vec{1,4} }, + tester{ `a*`, "baaab", vec{0,0} }, + tester{ `[a-z]+`, "abcd", vec{0,4} }, + tester{ `[^a-z]+`, "ab1234cd", vec{2,6} }, + tester{ `[a\-\]z]+`, "az]-bcz", vec{0,4} }, + tester{ `[^\n]+`, "abcd\n", vec{0,4} }, + tester{ `[日本語]+`, "日本語日本語", vec{0,18} }, + tester{ `()`, "", vec{0,0, 0,0} }, + tester{ `(a)`, "a", vec{0,1, 0,1} }, + tester{ `(.)(.)`, "日a", vec{0,4, 0,3, 3,4} }, + tester{ `(.*)`, "", vec{0,0, 0,0} }, + tester{ `(.*)`, "abcd", vec{0,4, 0,4} }, + tester{ `(..)(..)`, "abcd", vec{0,4, 0,2, 2,4} }, + tester{ `(([^xyz]*)(d))`, "abcd", vec{0,4, 0,4, 0,3, 3,4} }, + tester{ `((a|b|c)*(d))`, "abcd", vec{0,4, 0,4, 2,3, 3,4} }, + tester{ `(((a|b|c)*)(d))`, "abcd", vec{0,4, 0,4, 0,3, 2,3, 3,4} }, + tester{ `a*(|(b))c*`, "aacc", vec{0,4, 2,2, -1,-1} }, +} + +func compileTest(t *testing.T, expr string, error string) *Regexp { + re, err := CompileRegexp(expr); + if err != error { + t.Error("compiling `", expr, "`; unexpected error: ", err); + } + return re +} + +func printVec(t *testing.T, m []int) { + l := len(m); + if l == 0 { + t.Log("\t"); + } else { + for i := 0; i < l; i = i+2 { + t.Log("\t", m[i], ",", m[i+1]) + } + } +} + +func printStrings(t *testing.T, m []string) { + l := len(m); + if l == 0 { + t.Log("\t"); + } else { + for i := 0; i < l; i = i+2 { + t.Logf("\t%q", m[i]) + } + } +} + +func printBytes(t *testing.T, b [][]byte) { + l := len(b); + if l == 0 { + t.Log("\t"); + } else { + for i := 0; i < l; i = i+2 { + t.Logf("\t%q", b[i]) + } + } +} + +func equal(m1, m2 []int) bool { + l := len(m1); + if l != len(m2) { + return false + } + for i := 0; i < l; i++ { + if m1[i] != m2[i] { + return false + } + } + return true +} + +func equalStrings(m1, m2 []string) bool { + l := len(m1); + if l != len(m2) { + return false + } + for i := 0; i < l; i++ { + if m1[i] != m2[i] { + return false + } + } + return true +} + +func equalBytes(m1 [][]byte, m2 []string) bool { + l := len(m1); + if l != len(m2) { + return false + } + for i := 0; i < l; i++ { + if string(m1[i]) != m2[i] { + return false + } + } + return true +} + +func executeTest(t *testing.T, expr string, str string, match []int) { + re := compileTest(t, expr, ""); + if re == nil { + return + } + m := re.ExecuteString(str); + if !equal(m, match) { + t.Error("ExecuteString failure on `", expr, "` matching `", str, "`:"); + printVec(t, m); + t.Log("should be:"); + printVec(t, match); + } + // now try bytes + m = re.Execute(strings.Bytes(str)); + if !equal(m, match) { + t.Error("Execute failure on `", expr, "` matching `", str, "`:"); + printVec(t, m); + t.Log("should be:"); + printVec(t, match); + } +} + +func TestGoodCompile(t *testing.T) { + for i := 0; i < len(good_re); i++ { + compileTest(t, good_re[i], ""); + } +} + +func TestBadCompile(t *testing.T) { + for i := 0; i < len(bad_re); i++ { + compileTest(t, bad_re[i].re, bad_re[i].err) + } +} + +func TestExecute(t *testing.T) { + for i := 0; i < len(matches); i++ { + test := &matches[i]; + executeTest(t, test.re, test.text, test.match) + } +} + +func matchTest(t *testing.T, expr string, str string, match []int) { + re := compileTest(t, expr, ""); + if re == nil { + return + } + m := re.MatchString(str); + if m != (len(match) > 0) { + t.Error("MatchString failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0); + } + // now try bytes + m = re.Match(strings.Bytes(str)); + if m != (len(match) > 0) { + t.Error("Match failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0); + } +} + +func TestMatch(t *testing.T) { + for i := 0; i < len(matches); i++ { + test := &matches[i]; + matchTest(t, test.re, test.text, test.match) + } +} + +func matchStringsTest(t *testing.T, expr string, str string, match []int) { + re := compileTest(t, expr, ""); + if re == nil { + return + } + strs := make([]string, len(match)/2); + for i := 0; i < len(match); i++ { + strs[i/2] = str[match[i] : match[i+1]] + } + m := re.MatchStrings(str); + if !equalStrings(m, strs) { + t.Error("MatchStrings failure on `", expr, "` matching `", str, "`:"); + printStrings(t, m); + t.Log("should be:"); + printStrings(t, strs); + } + // now try bytes + s := re.MatchSlices(strings.Bytes(str)); + if !equalBytes(s, strs) { + t.Error("MatchSlices failure on `", expr, "` matching `", str, "`:"); + printBytes(t, s); + t.Log("should be:"); + printStrings(t, strs); + } +} + +func TestMatchStrings(t *testing.T) { + for i := 0; i < len(matches); i++ { + test := &matches[i]; + matchTest(t, test.re, test.text, test.match) + } +} + +func matchFunctionTest(t *testing.T, expr string, str string, match []int) { + m, err := MatchString(expr, str); + if err == "" { + return + } + if m != (len(match) > 0) { + t.Error("function Match failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0); + } +} + +func TestMatchFunction(t *testing.T) { + for i := 0; i < len(matches); i++ { + test := &matches[i]; + matchFunctionTest(t, test.re, test.text, test.match) + } +} diff --git a/src/pkg/testing/testing.go b/src/pkg/testing/testing.go index e4608ef7238..27612939389 100644 --- a/src/pkg/testing/testing.go +++ b/src/pkg/testing/testing.go @@ -15,8 +15,8 @@ import ( "flag"; "fmt"; "os"; - "regexp"; "runtime"; + "testing"; ) // Report as tests are run; default is silent for success. @@ -122,9 +122,9 @@ func Main(tests []Test) { if len(tests) == 0 { println("testing: warning: no tests to run"); } - re, err := regexp.Compile(*match); - if err != nil { - println("invalid regexp for -match:", err.String()); + re, err := CompileRegexp(*match); + if err != "" { + println("invalid regexp for -match:", err); os.Exit(1); } for i := 0; i < len(tests); i++ {