regexp: add support for matching text read from things that implement

ReadRune. (If you have a Reader but not a RuneReader, use bufio.) The matching code is a few percent slower but significantly cleaner. R=rsc CC=golang-dev https://golang.org/cl/4125046
2024-11-22 00:34:40 -07:00 · 2011-02-03 13:58:40 -08:00 · 2011-02-03 13:58:40 -08:00 · 7db904c1f6
commit 7db904c1f6
parent 63457d089e
2 changed files with 236 additions and 73 deletions
--- a/src/pkg/regexp/find_test.go
+++ b/src/pkg/regexp/find_test.go
@ -6,6 +6,7 @@ package regexp
 import (
 	"fmt"
 	"strings"
 	"testing"
 )
@ -191,6 +192,12 @@ func TestFindStringIndex(t *testing.T) {
 	}
 }
 func TestFindReaderIndex(t *testing.T) {
 	for _, test := range findTests {
 		testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
 	}
 }
 // Now come the simple All cases.
 func TestFindAll(t *testing.T) {
@ -387,6 +394,12 @@ func TestFindStringSubmatchIndex(t *testing.T) {
 	}
 }
 func TestFindReaderSubmatchIndex(t *testing.T) {
 	for _, test := range findTests {
 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
 	}
 }
 // Now come the monster AllSubmatch cases.
 func TestFindAllSubmatch(t *testing.T) {
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@ -54,6 +54,16 @@
 // text of the match/submatch.  If an index is negative, it means that
 // subexpression did not match any string in the input.
 //
 // There is also a subset of the methods that can be applied to text read
 // from a RuneReader:
 //
 //	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
 //
 // This set may grow.  Note that regular expression matches may need to
 // examine text beyond the text returned by a match, so the methods that
 // match text from a RuneReader may read arbitrarily far into the input
 // before returning.
 //
 // (There are a few other methods that do not match this pattern.)
 //
 package regexp
@ -231,13 +241,13 @@ func (p *parser) error(err Error) {
 	panic(err)
 }
-const endOfFile = -1
+const endOfText = -1
 func (p *parser) c() int { return p.ch }
 func (p *parser) nextc() int {
 	if p.pos >= len(p.re.expr) {
-		p.ch = endOfFile
+		p.ch = endOfText
 	} else {
 		c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
 		p.ch = c
@ -288,7 +298,7 @@ func (p *parser) checkBackslash() int {
 	if c == '\\' {
 		c = p.nextc()
 		switch {
-		case c == endOfFile:
+		case c == endOfText:
 			p.error(ErrExtraneousBackslash)
 		case ispunct(c):
 			// c is as delivered
@ -311,7 +321,7 @@ func (p *parser) charClass() *instr {
 	left := -1
 	for {
 		switch c := p.c(); c {
-		case ']', endOfFile:
+		case ']', endOfText:
 			if left >= 0 {
 				p.error(ErrBadRange)
 			}
@ -356,7 +366,7 @@ func (p *parser) charClass() *instr {
 func (p *parser) term() (start, end *instr) {
 	switch c := p.c(); c {
-	case '|', endOfFile:
+	case '|', endOfText:
 		return nil, nil
 	case '*', '+', '?':
 		p.error(ErrBareClosure)
@ -638,8 +648,11 @@ func (re *Regexp) NumSubexp() int { return re.nbra }
 // match vectors away as we execute.  Matches are ref counted and returned
 // to a free list when no longer active.  Increases a simple benchmark by 22X.
 type matchArena struct {
-	head *matchVec
+	head  *matchVec
-	len  int // length of match vector
+	len   int // length of match vector
 	pos   int
 	atBOT bool // whether we're at beginning of text
 	atEOT bool // whether we're at end of text
 }
 type matchVec struct {
@ -699,21 +712,21 @@ type state struct {
 // Append new state to to-do list.  Leftmost-longest wins so avoid
 // adding a state that's already active.  The matchVec will be inc-ref'ed
 // if it is assigned to a state.
-func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state {
+func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
 	switch inst.kind {
 	case iBOT:
-		if pos == 0 {
+		if a.atBOT {
-			s = a.addState(s, inst.next, prefixed, match, pos, end)
+			s = a.addState(s, inst.next, prefixed, match)
 		}
 		return s
 	case iEOT:
-		if pos == end {
+		if a.atEOT {
-			s = a.addState(s, inst.next, prefixed, match, pos, end)
+			s = a.addState(s, inst.next, prefixed, match)
 		}
 		return s
 	case iBra:
-		match.m[inst.braNum] = pos
+		match.m[inst.braNum] = a.pos
-		s = a.addState(s, inst.next, prefixed, match, pos, end)
+		s = a.addState(s, inst.next, prefixed, match)
 		return s
 	}
 	l := len(s)
@ -727,62 +740,157 @@ func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matc
 	s = append(s, state{inst, prefixed, match})
 	match.ref++
 	if inst.kind == iAlt {
-		s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end)
+		s = a.addState(s, inst.left, prefixed, a.copy(match))
 		// give other branch a copy of this match vector
-		s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end)
+		s = a.addState(s, inst.next, prefixed, a.copy(match))
 	}
 	return s
 }
-// Accepts either string or bytes - the logic is identical either way.
+// input abstracts different representations of the input text. It provides
-// If bytes == nil, scan str.
+// one-character lookahead.
-func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
+type input interface {
 	step(pos int) (rune int, width int) // advance one rune
 	canCheckPrefix() bool               // can we look ahead without losing info?
 	hasPrefix(re *Regexp) bool
 	index(re *Regexp, pos int) int
 }
 // inputString scans a string.
 type inputString struct {
 	str string
 }
 func newInputString(str string) *inputString {
 	return &inputString{str: str}
 }
 func (i *inputString) step(pos int) (int, int) {
 	if pos < len(i.str) {
 		return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
 	}
 	return endOfText, 0
 }
 func (i *inputString) canCheckPrefix() bool {
 	return true
 }
 func (i *inputString) hasPrefix(re *Regexp) bool {
 	return strings.HasPrefix(i.str, re.prefix)
 }
 func (i *inputString) index(re *Regexp, pos int) int {
 	return strings.Index(i.str[pos:], re.prefix)
 }
 // inputBytes scans a byte slice.
 type inputBytes struct {
 	str []byte
 }
 func newInputBytes(str []byte) *inputBytes {
 	return &inputBytes{str: str}
 }
 func (i *inputBytes) step(pos int) (int, int) {
 	if pos < len(i.str) {
 		return utf8.DecodeRune(i.str[pos:len(i.str)])
 	}
 	return endOfText, 0
 }
 func (i *inputBytes) canCheckPrefix() bool {
 	return true
 }
 func (i *inputBytes) hasPrefix(re *Regexp) bool {
 	return bytes.HasPrefix(i.str, re.prefixBytes)
 }
 func (i *inputBytes) index(re *Regexp, pos int) int {
 	return bytes.Index(i.str[pos:], re.prefixBytes)
 }
 // inputReader scans a RuneReader.
 type inputReader struct {
 	r     io.RuneReader
 	atEOT bool
 	pos   int
 }
 func newInputReader(r io.RuneReader) *inputReader {
 	return &inputReader{r: r}
 }
 func (i *inputReader) step(pos int) (int, int) {
 	if !i.atEOT && pos != i.pos {
 		return endOfText, 0
 	}
 	r, w, err := i.r.ReadRune()
 	if err != nil {
 		i.atEOT = true
 		return endOfText, 0
 	}
 	i.pos += w
 	return r, w
 }
 func (i *inputReader) canCheckPrefix() bool {
 	return false
 }
 func (i *inputReader) hasPrefix(re *Regexp) bool {
 	return false
 }
 func (i *inputReader) index(re *Regexp, pos int) int {
 	return -1
 }
 // Search match starting from pos bytes into the input.
 func (re *Regexp) doExecute(i input, pos int) []int {
 	var s [2][]state
 	s[0] = make([]state, 0, 10)
 	s[1] = make([]state, 0, 10)
 	in, out := 0, 1
 	var final state
 	found := false
 	end := len(str)
 	if bytestr != nil {
 		end = len(bytestr)
 	}
 	anchored := re.inst[0].next.kind == iBOT
 	if anchored && pos > 0 {
 		return nil
 	}
 	// fast check for initial plain substring
-	if re.prefix != "" {
+	if i.canCheckPrefix() && re.prefix != "" {
 		advance := 0
 		if anchored {
-			if bytestr == nil {
+			if !i.hasPrefix(re) {
-				if !strings.HasPrefix(str, re.prefix) {
+				return nil
 					return nil
 				}
 			} else {
 				if !bytes.HasPrefix(bytestr, re.prefixBytes) {
 					return nil
 				}
 			}
 		} else {
-			if bytestr == nil {
+			advance = i.index(re, pos)
-				advance = strings.Index(str[pos:], re.prefix)
+			if advance == -1 {
-			} else {
+				return nil
 				advance = bytes.Index(bytestr[pos:], re.prefixBytes)
 			}
 		}
 		if advance == -1 {
 			return nil
 		}
 		pos += advance
 	}
-	arena := &matchArena{nil, 2 * (re.nbra + 1)}
+	// We look one character ahead so we can match $, which checks whether
-	for startPos := pos; pos <= end; {
+	// we are at EOT.
 	nextChar, nextWidth := i.step(pos)
 	arena := &matchArena{
 		len:   2 * (re.nbra + 1),
 		pos:   pos,
 		atBOT: pos == 0,
 		atEOT: nextChar == endOfText,
 	}
 	for c, startPos := 0, pos; c != endOfText; {
 		if !found && (pos == startPos || !anchored) {
 			// prime the pump if we haven't seen a match yet
 			match := arena.noMatch()
 			match.m[0] = pos
-			s[out] = arena.addState(s[out], re.start.next, false, match, pos, end)
+			s[out] = arena.addState(s[out], re.start.next, false, match)
 			arena.free(match) // if addState saved it, ref was incremented
 		} else if len(s[out]) == 0 {
 			// machine has completed
@ -795,35 +903,32 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
 			arena.free(state.match)
 		}
 		s[out] = old[0:0] // truncate state vector
-		charwidth := 1
+		c = nextChar
-		c := endOfFile
+		thisPos := pos
-		if pos < end {
+		pos += nextWidth
-			if bytestr == nil {
+		nextChar, nextWidth = i.step(pos)
-				c, charwidth = utf8.DecodeRuneInString(str[pos:end])
+		arena.atEOT = nextChar == endOfText
-			} else {
+		arena.atBOT = false
-				c, charwidth = utf8.DecodeRune(bytestr[pos:end])
+		arena.pos = pos
 			}
 		}
 		pos += charwidth
 		for _, st := range s[in] {
 			switch st.inst.kind {
 			case iBOT:
 			case iEOT:
 			case iChar:
 				if c == st.inst.char {
-					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
 				}
 			case iCharClass:
 				if st.inst.cclass.matches(c) {
-					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
 				}
 			case iAny:
-				if c != endOfFile {
+				if c != endOfText {
-					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
 				}
 			case iNotNL:
-				if c != endOfFile && c != '\n' {
+				if c != endOfText && c != '\n' {
-					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
 				}
 			case iBra:
 			case iAlt:
@ -831,13 +936,13 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
 				// choose leftmost longest
 				if !found || // first
 					st.match.m[0] < final.match.m[0] || // leftmost
-					(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
+					(st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
 					if final.match != nil {
 						arena.free(final.match)
 					}
 					final = st
 					final.match.ref++
-					final.match.m[1] = pos - charwidth
+					final.match.m[1] = thisPos
 				}
 				found = true
 			default:
@ -874,14 +979,31 @@ func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
 	return string(c[:i]), true
 }
 // MatchReader returns whether the Regexp matches the text read by the
 // RuneReader.  The return value is a boolean: true for match, false for no
 // match.
 func (re *Regexp) MatchReader(r io.RuneReader) bool {
 	return len(re.doExecute(newInputReader(r), 0)) > 0
 }
 // MatchString returns whether the Regexp matches the string s.
 // The return value is a boolean: true for match, false for no match.
-func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
+func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }
 // Match returns whether the Regexp matches the byte slice b.
 // The return value is a boolean: true for match, false for no match.
-func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
+func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }
 // MatchReader checks whether a textual regular expression matches the text
 // read by the RuneReader.  More complicated queries need to use Compile and
 // the full Regexp interface.
 func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
 	re, err := Compile(pattern)
 	if err != nil {
 		return false, err
 	}
 	return re.MatchReader(r), nil
 }
 // MatchString checks whether a textual regular expression
 // matches a string.  More complicated queries need
@ -921,7 +1043,7 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str
 	searchPos := 0    // position where we next look for a match
 	buf := new(bytes.Buffer)
 	for searchPos <= len(src) {
-		a := re.doExecute(src, nil, searchPos)
+		a := re.doExecute(newInputString(src), searchPos)
 		if len(a) == 0 {
 			break // no more matches
 		}
@ -973,7 +1095,7 @@ func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 	searchPos := 0    // position where we next look for a match
 	buf := new(bytes.Buffer)
 	for searchPos <= len(src) {
-		a := re.doExecute("", src, searchPos)
+		a := re.doExecute(newInputBytes(src), searchPos)
 		if len(a) == 0 {
 			break // no more matches
 		}
@ -1038,7 +1160,13 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 	}
 	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
-		matches := re.doExecute(s, b, pos)
+		var in input
 		if b == nil {
 			in = newInputString(s)
 		} else {
 			in = newInputBytes(b)
 		}
 		matches := re.doExecute(in, pos)
 		if len(matches) == 0 {
 			break
 		}
@ -1052,6 +1180,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 				accept = false
 			}
 			var width int
 			// TODO: use step()
 			if b == nil {
 				_, width = utf8.DecodeRuneInString(s[pos:end])
 			} else {
@ -1077,7 +1206,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
 // A return value of nil indicates no match.
 func (re *Regexp) Find(b []byte) []byte {
-	a := re.doExecute("", b, 0)
+	a := re.doExecute(newInputBytes(b), 0)
 	if a == nil {
 		return nil
 	}
@ -1089,7 +1218,7 @@ func (re *Regexp) Find(b []byte) []byte {
 // b[loc[0]:loc[1]].
 // A return value of nil indicates no match.
 func (re *Regexp) FindIndex(b []byte) (loc []int) {
-	a := re.doExecute("", b, 0)
+	a := re.doExecute(newInputBytes(b), 0)
 	if a == nil {
 		return nil
 	}
@ -1102,7 +1231,7 @@ func (re *Regexp) FindIndex(b []byte) (loc []int) {
 // an empty string.  Use FindStringIndex or FindStringSubmatch if it is
 // necessary to distinguish these cases.
 func (re *Regexp) FindString(s string) string {
-	a := re.doExecute(s, nil, 0)
+	a := re.doExecute(newInputString(s), 0)
 	if a == nil {
 		return ""
 	}
@ -1114,7 +1243,19 @@ func (re *Regexp) FindString(s string) string {
 // itself is at s[loc[0]:loc[1]].
 // A return value of nil indicates no match.
 func (re *Regexp) FindStringIndex(s string) []int {
-	a := re.doExecute(s, nil, 0)
+	a := re.doExecute(newInputString(s), 0)
 	if a == nil {
 		return nil
 	}
 	return a[0:2]
 }
 // FindReaderIndex returns a two-element slice of integers defining the
 // location of the leftmost match of the regular expression in text read from
 // the RuneReader.  The match itself is at s[loc[0]:loc[1]].  A return
 // value of nil indicates no match.
 func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
 	a := re.doExecute(newInputReader(r), 0)
 	if a == nil {
 		return nil
 	}
@ -1127,7 +1268,7 @@ func (re *Regexp) FindStringIndex(s string) []int {
 // comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
-	a := re.doExecute("", b, 0)
+	a := re.doExecute(newInputBytes(b), 0)
 	if a == nil {
 		return nil
 	}
@ -1146,7 +1287,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 // in the package comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
-	return re.doExecute("", b, 0)
+	return re.doExecute(newInputBytes(b), 0)
 }
 // FindStringSubmatch returns a slice of strings holding the text of the
@ -1155,7 +1296,7 @@ func (re *Regexp) FindSubmatchIndex(b []byte) []int {
 // package comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindStringSubmatch(s string) []string {
-	a := re.doExecute(s, nil, 0)
+	a := re.doExecute(newInputString(s), 0)
 	if a == nil {
 		return nil
 	}
@ -1174,7 +1315,16 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
 // 'Index' descriptions in the package comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
-	return re.doExecute(s, nil, 0)
+	return re.doExecute(newInputString(s), 0)
 }
 // FindReaderSubmatchIndex returns a slice holding the index pairs
 // identifying the leftmost match of the regular expression of text read by
 // the RuneReader, and the matches, if any, of its subexpressions, as defined
 // by the 'Submatch' and 'Index' descriptions in the package comment.  A
 // return value of nil indicates no match.
 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
 	return re.doExecute(newInputReader(r), 0)
 }
 const startSize = 10 // The size at which to start a slice in the 'All' routines.