mirror of
https://github.com/golang/go
synced 2024-11-24 22:57:57 -07:00
exp/regexp: bug fixes and RE2 tests
Also add exp/regexp to build (forgot before). At this point I am very confident in exp/regexp's behavior. It should be usable as a drop-in replacement for regexp now. Later CLs could introduce a CompilePOSIX to get at traditional POSIX ``extended regular expressions'' as in egrep and also an re.MatchLongest method to change the matching mode to leftmost longest instead of leftmost first. On the other hand, I expect very few people to use either. R=r, r, gustavo CC=golang-dev https://golang.org/cl/4990041
This commit is contained in:
parent
a2c2c87439
commit
08ae1a5a23
@ -81,6 +81,7 @@ DIRS=\
|
|||||||
exp/gui\
|
exp/gui\
|
||||||
exp/gui/x11\
|
exp/gui/x11\
|
||||||
exp/norm\
|
exp/norm\
|
||||||
|
exp/regexp\
|
||||||
exp/regexp/syntax\
|
exp/regexp/syntax\
|
||||||
exp/template/html\
|
exp/template/html\
|
||||||
expvar\
|
expvar\
|
||||||
|
@ -90,23 +90,12 @@ func (m *machine) match(i input, pos int) bool {
|
|||||||
if rune != endOfText {
|
if rune != endOfText {
|
||||||
rune1, width1 = i.step(pos + width)
|
rune1, width1 = i.step(pos + width)
|
||||||
}
|
}
|
||||||
// TODO: Let caller specify the initial flag setting.
|
|
||||||
// For now assume pos == 0 is beginning of text and
|
|
||||||
// pos != 0 is not even beginning of line.
|
|
||||||
// TODO: Word boundary.
|
|
||||||
var flag syntax.EmptyOp
|
var flag syntax.EmptyOp
|
||||||
if pos == 0 {
|
if pos == 0 {
|
||||||
flag = syntax.EmptyBeginText | syntax.EmptyBeginLine
|
flag = syntax.EmptyOpContext(-1, rune)
|
||||||
|
} else {
|
||||||
|
flag = i.context(pos)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update flag using lookahead rune.
|
|
||||||
if rune1 == '\n' {
|
|
||||||
flag |= syntax.EmptyEndLine
|
|
||||||
}
|
|
||||||
if rune1 == endOfText {
|
|
||||||
flag |= syntax.EmptyEndText
|
|
||||||
}
|
|
||||||
|
|
||||||
for {
|
for {
|
||||||
if len(runq.dense) == 0 {
|
if len(runq.dense) == 0 {
|
||||||
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
|
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
|
||||||
@ -134,17 +123,7 @@ func (m *machine) match(i input, pos int) bool {
|
|||||||
}
|
}
|
||||||
m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
|
m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
|
||||||
}
|
}
|
||||||
// TODO: word boundary
|
flag = syntax.EmptyOpContext(rune, rune1)
|
||||||
flag = 0
|
|
||||||
if rune == '\n' {
|
|
||||||
flag |= syntax.EmptyBeginLine
|
|
||||||
}
|
|
||||||
if rune1 == '\n' {
|
|
||||||
flag |= syntax.EmptyEndLine
|
|
||||||
}
|
|
||||||
if rune1 == endOfText {
|
|
||||||
flag |= syntax.EmptyEndText
|
|
||||||
}
|
|
||||||
m.step(runq, nextq, pos, pos+width, rune, flag)
|
m.step(runq, nextq, pos, pos+width, rune, flag)
|
||||||
if width == 0 {
|
if width == 0 {
|
||||||
break
|
break
|
||||||
|
271
src/pkg/exp/regexp/exec_test.go
Normal file
271
src/pkg/exp/regexp/exec_test.go
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
// Copyright 2010 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package regexp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"compress/gzip"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRE2 tests this package's regexp API against test cases
|
||||||
|
// considered during RE2's exhaustive tests, which run all possible
|
||||||
|
// regexps over a given set of atoms and operators, up to a given
|
||||||
|
// complexity, over all possible strings over a given alphabet,
|
||||||
|
// up to a given size. Rather than try to link with RE2, we read a
|
||||||
|
// log file containing the test cases and the expected matches.
|
||||||
|
// The log file, re2.txt, is generated by running 'make exhaustive-log'
|
||||||
|
// in the open source RE2 distribution. http://code.google.com/p/re2/
|
||||||
|
//
|
||||||
|
// The test file format is a sequence of stanzas like:
|
||||||
|
//
|
||||||
|
// strings
|
||||||
|
// "abc"
|
||||||
|
// "123x"
|
||||||
|
// regexps
|
||||||
|
// "[a-z]+"
|
||||||
|
// 0-3;0-3
|
||||||
|
// -;-
|
||||||
|
// "([0-9])([0-9])([0-9])"
|
||||||
|
// -;-
|
||||||
|
// -;0-3 0-1 1-2 2-3
|
||||||
|
//
|
||||||
|
// The stanza begins by defining a set of strings, quoted
|
||||||
|
// using Go double-quote syntax, one per line. Then the
|
||||||
|
// regexps section gives a sequence of regexps to run on
|
||||||
|
// the strings. In the block that follows a regexp, each line
|
||||||
|
// gives the semicolon-separated match results of running
|
||||||
|
// the regexp on the corresponding string.
|
||||||
|
// Each match result is either a single -, meaning no match, or a
|
||||||
|
// space-separated sequence of pairs giving the match and
|
||||||
|
// submatch indices. An unmatched subexpression formats
|
||||||
|
// its pair as a single - (not illustrated above). For now
|
||||||
|
// each regexp run produces two match results, one for a
|
||||||
|
// ``full match'' that restricts the regexp to matching the entire
|
||||||
|
// string or nothing, and one for a ``partial match'' that gives
|
||||||
|
// the leftmost first match found in the string.
|
||||||
|
//
|
||||||
|
// Lines beginning with # are comments. Lines beginning with
|
||||||
|
// a capital letter are test names printed during RE2's test suite
|
||||||
|
// and are echoed into t but otherwise ignored.
|
||||||
|
//
|
||||||
|
// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
|
||||||
|
// so we store re2.txt.gz in the repository and decompress it on the fly.
|
||||||
|
//
|
||||||
|
func TestRE2(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Log("skipping TestRE2 during short test")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
f, err := os.Open("re2.txt.gz")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
gz, err := gzip.NewReader(f)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("decompress re2.txt.gz: %v", err)
|
||||||
|
}
|
||||||
|
defer gz.Close()
|
||||||
|
lineno := 0
|
||||||
|
r := bufio.NewReader(gz)
|
||||||
|
var (
|
||||||
|
str []string
|
||||||
|
input []string
|
||||||
|
inStrings bool
|
||||||
|
re *Regexp
|
||||||
|
refull *Regexp
|
||||||
|
nfail int
|
||||||
|
ncase int
|
||||||
|
)
|
||||||
|
for {
|
||||||
|
line, err := r.ReadString('\n')
|
||||||
|
if err != nil {
|
||||||
|
if err == os.EOF {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
t.Fatalf("re2.txt:%d: %v", lineno, err)
|
||||||
|
}
|
||||||
|
line = line[:len(line)-1] // chop \n
|
||||||
|
lineno++
|
||||||
|
switch {
|
||||||
|
case line == "":
|
||||||
|
t.Fatalf("re2.txt:%d: unexpected blank line", lineno)
|
||||||
|
case line[0] == '#':
|
||||||
|
continue
|
||||||
|
case 'A' <= line[0] && line[0] <= 'Z':
|
||||||
|
// Test name.
|
||||||
|
t.Logf("%s\n", line)
|
||||||
|
continue
|
||||||
|
case line == "strings":
|
||||||
|
str = str[:0]
|
||||||
|
inStrings = true
|
||||||
|
case line == "regexps":
|
||||||
|
inStrings = false
|
||||||
|
case line[0] == '"':
|
||||||
|
q, err := strconv.Unquote(line)
|
||||||
|
if err != nil {
|
||||||
|
// Fatal because we'll get out of sync.
|
||||||
|
t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err)
|
||||||
|
}
|
||||||
|
if inStrings {
|
||||||
|
str = append(str, q)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Is a regexp.
|
||||||
|
if len(input) != 0 {
|
||||||
|
t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q)
|
||||||
|
}
|
||||||
|
re, err = tryCompile(q)
|
||||||
|
if err != nil {
|
||||||
|
if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" {
|
||||||
|
// We don't and likely never will support \C; keep going.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err)
|
||||||
|
if nfail++; nfail >= 100 {
|
||||||
|
t.Fatalf("stopping after %d errors", nfail)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
full := `\A(?:` + q + `)\z`
|
||||||
|
refull, err = tryCompile(full)
|
||||||
|
if err != nil {
|
||||||
|
// Fatal because q worked, so this should always work.
|
||||||
|
t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err)
|
||||||
|
}
|
||||||
|
input = str
|
||||||
|
case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
|
||||||
|
// A sequence of match results.
|
||||||
|
ncase++
|
||||||
|
if re == nil {
|
||||||
|
// Failed to compile: skip results.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(input) == 0 {
|
||||||
|
t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno)
|
||||||
|
}
|
||||||
|
var text string
|
||||||
|
text, input = input[0], input[1:]
|
||||||
|
if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
|
||||||
|
// RE2's \B considers every byte position,
|
||||||
|
// so it sees 'not word boundary' in the
|
||||||
|
// middle of UTF-8 sequences. This package
|
||||||
|
// only considers the positions between runes,
|
||||||
|
// so it disagrees. Skip those cases.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
res := strings.Split(line, ";")
|
||||||
|
if len(res) != 2 {
|
||||||
|
t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res))
|
||||||
|
}
|
||||||
|
// res[0] is full match
|
||||||
|
// res[1] is partial match
|
||||||
|
// Run partial match first; don't bother with full if partial fails.
|
||||||
|
have := re.FindStringSubmatchIndex(text)
|
||||||
|
want := parseResult(t, lineno, res[1])
|
||||||
|
if !same(have, want) {
|
||||||
|
t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want)
|
||||||
|
if nfail++; nfail >= 100 {
|
||||||
|
t.Fatalf("stopping after %d errors", nfail)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
have = refull.FindStringSubmatchIndex(text)
|
||||||
|
want = parseResult(t, lineno, res[0])
|
||||||
|
if !same(have, want) {
|
||||||
|
t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want)
|
||||||
|
if nfail++; nfail >= 100 {
|
||||||
|
t.Fatalf("stopping after %d errors", nfail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(input) != 0 {
|
||||||
|
t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input))
|
||||||
|
}
|
||||||
|
t.Logf("%d cases tested", ncase)
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSingleBytes reports whether every rune of s is below utf8.RuneSelf,
// that is, whether s consists solely of single-byte characters.
func isSingleBytes(s string) bool {
	// A string decodes to runes below RuneSelf exactly when each of its
	// bytes is below RuneSelf, so scanning bytes gives the same answer.
	for i := 0; i < len(s); i++ {
		if s[i] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
func tryCompile(s string) (re *Regexp, err os.Error) {
|
||||||
|
// Protect against panic during Compile.
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
err = fmt.Errorf("panic: %v", r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return Compile(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseResult converts one match result from re2.txt into the index slice
// that the FindSubmatchIndex family would return for it.
//
// A result of "-" means no match and yields nil. Otherwise res is a
// space-separated list of lo-hi pairs, where a pair written as a single
// "-" stands for an unmatched subexpression and becomes -1, -1.
// A malformed pair is fatal to t; lineno is used only in that message.
func parseResult(t *testing.T, lineno int, res string) []int {
	if res == "-" {
		return nil
	}
	pairs := strings.Split(res, " ")
	out := make([]int, 0, 2*len(pairs))
	for _, pair := range pairs {
		if pair == "-" {
			// No submatch.
			out = append(out, -1, -1)
			continue
		}
		dash := strings.Index(pair, "-")
		if dash < 0 {
			t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
		}
		lo, errLo := strconv.Atoi(pair[:dash])
		hi, errHi := strconv.Atoi(pair[dash+1:])
		if errLo != nil || errHi != nil || lo > hi {
			t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
		}
		out = append(out, lo, hi)
	}
	return out
}
|
||||||
|
|
||||||
|
// same reports whether x and y have the same length and hold equal
// values at every index. A nil slice and an empty slice compare as same.
func same(x, y []int) bool {
	n := len(x)
	if n != len(y) {
		return false
	}
	for i := 0; i < n; i++ {
		if x[i] != y[i] {
			return false
		}
	}
	return true
}
|
@ -80,6 +80,23 @@ var findTests = []FindTest{
|
|||||||
{`data`, "daXY data", build(1, 5, 9)},
|
{`data`, "daXY data", build(1, 5, 9)},
|
||||||
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
|
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
|
||||||
{`zx+`, "zzx", build(1, 1, 3)},
|
{`zx+`, "zzx", build(1, 1, 3)},
|
||||||
|
{`ab$`, "abcab", build(1, 3, 5)},
|
||||||
|
{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
|
||||||
|
{`(?:.|(?:.a))`, "", nil},
|
||||||
|
{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
|
||||||
|
{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
|
||||||
|
{`(a){0}`, "", build(1, 0, 0, -1, -1)},
|
||||||
|
{`(?-s)(?:(?:^).)`, "\n", nil},
|
||||||
|
{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
|
||||||
|
{`(?:(?:^).)`, "\n", nil},
|
||||||
|
{`\b`, "x", build(2, 0, 0, 1, 1)},
|
||||||
|
{`\b`, "xx", build(2, 0, 0, 2, 2)},
|
||||||
|
{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
|
||||||
|
{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
|
||||||
|
{`\B`, "x", nil},
|
||||||
|
{`\B`, "xx", build(1, 1, 1)},
|
||||||
|
{`\B`, "x y", nil},
|
||||||
|
{`\B`, "xx yy", build(2, 1, 1, 4, 4)},
|
||||||
|
|
||||||
// can backslash-escape any punctuation
|
// can backslash-escape any punctuation
|
||||||
{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
|
{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
|
||||||
|
BIN
src/pkg/exp/regexp/re2.txt.gz
Normal file
BIN
src/pkg/exp/regexp/re2.txt.gz
Normal file
Binary file not shown.
@ -84,6 +84,7 @@ type Regexp struct {
|
|||||||
prefixComplete bool // prefix is the entire regexp
|
prefixComplete bool // prefix is the entire regexp
|
||||||
prefixRune int // first rune in prefix
|
prefixRune int // first rune in prefix
|
||||||
cond syntax.EmptyOp // empty-width conditions required at start of match
|
cond syntax.EmptyOp // empty-width conditions required at start of match
|
||||||
|
numSubexp int
|
||||||
|
|
||||||
// cache of machines for running regexp
|
// cache of machines for running regexp
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
@ -102,13 +103,16 @@ func Compile(expr string) (*Regexp, os.Error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
maxCap := re.MaxCap()
|
||||||
|
re = re.Simplify()
|
||||||
prog, err := syntax.Compile(re)
|
prog, err := syntax.Compile(re)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
regexp := &Regexp{
|
regexp := &Regexp{
|
||||||
expr: expr,
|
expr: expr,
|
||||||
prog: prog,
|
prog: prog,
|
||||||
|
numSubexp: maxCap,
|
||||||
}
|
}
|
||||||
regexp.prefix, regexp.prefixComplete = prog.Prefix()
|
regexp.prefix, regexp.prefixComplete = prog.Prefix()
|
||||||
if regexp.prefix != "" {
|
if regexp.prefix != "" {
|
||||||
@ -161,9 +165,7 @@ func MustCompile(str string) *Regexp {
|
|||||||
|
|
||||||
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
|
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
|
||||||
func (re *Regexp) NumSubexp() int {
|
func (re *Regexp) NumSubexp() int {
|
||||||
// NumCap/2 because captures count ( and ) separately.
|
return re.numSubexp
|
||||||
// -1 because NumCap counts $0 but NumSubexp does not.
|
|
||||||
return re.prog.NumCap/2 - 1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const endOfText = -1
|
const endOfText = -1
|
||||||
@ -175,6 +177,7 @@ type input interface {
|
|||||||
canCheckPrefix() bool // can we look ahead without losing info?
|
canCheckPrefix() bool // can we look ahead without losing info?
|
||||||
hasPrefix(re *Regexp) bool
|
hasPrefix(re *Regexp) bool
|
||||||
index(re *Regexp, pos int) int
|
index(re *Regexp, pos int) int
|
||||||
|
context(pos int) syntax.EmptyOp
|
||||||
}
|
}
|
||||||
|
|
||||||
// inputString scans a string.
|
// inputString scans a string.
|
||||||
@ -205,6 +208,17 @@ func (i *inputString) index(re *Regexp, pos int) int {
|
|||||||
return strings.Index(i.str[pos:], re.prefix)
|
return strings.Index(i.str[pos:], re.prefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *inputString) context(pos int) syntax.EmptyOp {
|
||||||
|
r1, r2 := -1, -1
|
||||||
|
if pos > 0 && pos <= len(i.str) {
|
||||||
|
r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
|
||||||
|
}
|
||||||
|
if pos < len(i.str) {
|
||||||
|
r2, _ = utf8.DecodeRuneInString(i.str[pos:])
|
||||||
|
}
|
||||||
|
return syntax.EmptyOpContext(r1, r2)
|
||||||
|
}
|
||||||
|
|
||||||
// inputBytes scans a byte slice.
|
// inputBytes scans a byte slice.
|
||||||
type inputBytes struct {
|
type inputBytes struct {
|
||||||
str []byte
|
str []byte
|
||||||
@ -233,6 +247,17 @@ func (i *inputBytes) index(re *Regexp, pos int) int {
|
|||||||
return bytes.Index(i.str[pos:], re.prefixBytes)
|
return bytes.Index(i.str[pos:], re.prefixBytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *inputBytes) context(pos int) syntax.EmptyOp {
|
||||||
|
r1, r2 := -1, -1
|
||||||
|
if pos > 0 && pos <= len(i.str) {
|
||||||
|
r1, _ = utf8.DecodeLastRune(i.str[:pos])
|
||||||
|
}
|
||||||
|
if pos < len(i.str) {
|
||||||
|
r2, _ = utf8.DecodeRune(i.str[pos:])
|
||||||
|
}
|
||||||
|
return syntax.EmptyOpContext(r1, r2)
|
||||||
|
}
|
||||||
|
|
||||||
// inputReader scans a RuneReader.
|
// inputReader scans a RuneReader.
|
||||||
type inputReader struct {
|
type inputReader struct {
|
||||||
r io.RuneReader
|
r io.RuneReader
|
||||||
@ -270,6 +295,10 @@ func (i *inputReader) index(re *Regexp, pos int) int {
|
|||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *inputReader) context(pos int) syntax.EmptyOp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
// LiteralPrefix returns a literal string that must begin any match
|
// LiteralPrefix returns a literal string that must begin any match
|
||||||
// of the regular expression re. It returns the boolean true if the
|
// of the regular expression re. It returns the boolean true if the
|
||||||
// literal string comprises the entire regular expression.
|
// literal string comprises the entire regular expression.
|
||||||
@ -458,6 +487,23 @@ func QuoteMeta(s string) string {
|
|||||||
return string(b[0:j])
|
return string(b[0:j])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The number of capture values in the program may correspond
|
||||||
|
// to fewer capturing expressions than are in the regexp.
|
||||||
|
// For example, "(a){0}" turns into an empty program, so the
|
||||||
|
// maximum capture in the program is 0 but we need to return
|
||||||
|
// an expression for \1. Pad appends -1s to the slice a as needed.
|
||||||
|
func (re *Regexp) pad(a []int) []int {
|
||||||
|
if a == nil {
|
||||||
|
// No match.
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := (1 + re.numSubexp) * 2
|
||||||
|
for len(a) < n {
|
||||||
|
a = append(a, -1)
|
||||||
|
}
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
|
||||||
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
|
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
|
||||||
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
|
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
|
||||||
var end int
|
var end int
|
||||||
@ -505,7 +551,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
|
|||||||
prevMatchEnd = matches[1]
|
prevMatchEnd = matches[1]
|
||||||
|
|
||||||
if accept {
|
if accept {
|
||||||
deliver(matches)
|
deliver(re.pad(matches))
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -580,9 +626,9 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
|
|||||||
if a == nil {
|
if a == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
ret := make([][]byte, len(a)/2)
|
ret := make([][]byte, 1+re.numSubexp)
|
||||||
for i := range ret {
|
for i := range ret {
|
||||||
if a[2*i] >= 0 {
|
if 2*i < len(a) && a[2*i] >= 0 {
|
||||||
ret[i] = b[a[2*i]:a[2*i+1]]
|
ret[i] = b[a[2*i]:a[2*i+1]]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -595,7 +641,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
|
|||||||
// in the package comment.
|
// in the package comment.
|
||||||
// A return value of nil indicates no match.
|
// A return value of nil indicates no match.
|
||||||
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
|
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
|
||||||
return re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
|
return re.pad(re.doExecute(newInputBytes(b), 0, re.prog.NumCap))
|
||||||
}
|
}
|
||||||
|
|
||||||
// FindStringSubmatch returns a slice of strings holding the text of the
|
// FindStringSubmatch returns a slice of strings holding the text of the
|
||||||
@ -608,9 +654,9 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
|
|||||||
if a == nil {
|
if a == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
ret := make([]string, len(a)/2)
|
ret := make([]string, 1+re.numSubexp)
|
||||||
for i := range ret {
|
for i := range ret {
|
||||||
if a[2*i] >= 0 {
|
if 2*i < len(a) && a[2*i] >= 0 {
|
||||||
ret[i] = s[a[2*i]:a[2*i+1]]
|
ret[i] = s[a[2*i]:a[2*i+1]]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -623,7 +669,7 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
|
|||||||
// 'Index' descriptions in the package comment.
|
// 'Index' descriptions in the package comment.
|
||||||
// A return value of nil indicates no match.
|
// A return value of nil indicates no match.
|
||||||
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
||||||
return re.doExecute(newInputString(s), 0, re.prog.NumCap)
|
return re.pad(re.doExecute(newInputString(s), 0, re.prog.NumCap))
|
||||||
}
|
}
|
||||||
|
|
||||||
// FindReaderSubmatchIndex returns a slice holding the index pairs
|
// FindReaderSubmatchIndex returns a slice holding the index pairs
|
||||||
@ -632,7 +678,7 @@ func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
|||||||
// by the 'Submatch' and 'Index' descriptions in the package comment. A
|
// by the 'Submatch' and 'Index' descriptions in the package comment. A
|
||||||
// return value of nil indicates no match.
|
// return value of nil indicates no match.
|
||||||
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
|
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
|
||||||
return re.doExecute(newInputReader(r), 0, re.prog.NumCap)
|
return re.pad(re.doExecute(newInputReader(r), 0, re.prog.NumCap))
|
||||||
}
|
}
|
||||||
|
|
||||||
const startSize = 10 // The size at which to start a slice in the 'All' routines.
|
const startSize = 10 // The size at which to start a slice in the 'All' routines.
|
||||||
|
@ -75,6 +75,7 @@ type compiler struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Compile compiles the regexp into a program to be executed.
|
// Compile compiles the regexp into a program to be executed.
|
||||||
|
// The regexp should have been simplified already (returned from re.Simplify).
|
||||||
func Compile(re *Regexp) (*Prog, os.Error) {
|
func Compile(re *Regexp) (*Prog, os.Error) {
|
||||||
var c compiler
|
var c compiler
|
||||||
c.init()
|
c.init()
|
||||||
@ -90,7 +91,7 @@ func (c *compiler) init() {
|
|||||||
c.inst(InstFail)
|
c.inst(InstFail)
|
||||||
}
|
}
|
||||||
|
|
||||||
var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune}
|
var anyRuneNotNL = []int{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
|
||||||
var anyRune = []int{0, unicode.MaxRune}
|
var anyRune = []int{0, unicode.MaxRune}
|
||||||
|
|
||||||
func (c *compiler) compile(re *Regexp) frag {
|
func (c *compiler) compile(re *Regexp) frag {
|
||||||
@ -105,7 +106,7 @@ func (c *compiler) compile(re *Regexp) frag {
|
|||||||
}
|
}
|
||||||
var f frag
|
var f frag
|
||||||
for j := range re.Rune {
|
for j := range re.Rune {
|
||||||
f1 := c.rune(re.Rune[j : j+1])
|
f1 := c.rune(re.Rune[j:j+1], re.Flags)
|
||||||
if j == 0 {
|
if j == 0 {
|
||||||
f = f1
|
f = f1
|
||||||
} else {
|
} else {
|
||||||
@ -114,11 +115,11 @@ func (c *compiler) compile(re *Regexp) frag {
|
|||||||
}
|
}
|
||||||
return f
|
return f
|
||||||
case OpCharClass:
|
case OpCharClass:
|
||||||
return c.rune(re.Rune)
|
return c.rune(re.Rune, re.Flags)
|
||||||
case OpAnyCharNotNL:
|
case OpAnyCharNotNL:
|
||||||
return c.rune(anyRuneNotNL)
|
return c.rune(anyRuneNotNL, 0)
|
||||||
case OpAnyChar:
|
case OpAnyChar:
|
||||||
return c.rune(anyRune)
|
return c.rune(anyRune, 0)
|
||||||
case OpBeginLine:
|
case OpBeginLine:
|
||||||
return c.empty(EmptyBeginLine)
|
return c.empty(EmptyBeginLine)
|
||||||
case OpEndLine:
|
case OpEndLine:
|
||||||
@ -261,9 +262,16 @@ func (c *compiler) empty(op EmptyOp) frag {
|
|||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *compiler) rune(rune []int) frag {
|
func (c *compiler) rune(rune []int, flags Flags) frag {
|
||||||
f := c.inst(InstRune)
|
f := c.inst(InstRune)
|
||||||
c.p.Inst[f.i].Rune = rune
|
i := &c.p.Inst[f.i]
|
||||||
|
i.Rune = rune
|
||||||
|
flags &= FoldCase // only relevant flag is FoldCase
|
||||||
|
if len(rune) != 1 || unicode.SimpleFold(rune[0]) == rune[0] {
|
||||||
|
// and sometimes not even that
|
||||||
|
flags &^= FoldCase
|
||||||
|
}
|
||||||
|
i.Arg = uint32(flags)
|
||||||
f.out = patchList(f.i << 1)
|
f.out = patchList(f.i << 1)
|
||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
@ -419,8 +419,7 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
|
|||||||
// used or marked for reuse, and the slice space has been reused
|
// used or marked for reuse, and the slice space has been reused
|
||||||
// for out (len(out) <= start).
|
// for out (len(out) <= start).
|
||||||
//
|
//
|
||||||
// Invariant: sub[start:i] consists of regexps that all begin
|
// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
|
||||||
// with str as modified by strflags.
|
|
||||||
var ifirst *Regexp
|
var ifirst *Regexp
|
||||||
if i < len(sub) {
|
if i < len(sub) {
|
||||||
ifirst = p.leadingRegexp(sub[i])
|
ifirst = p.leadingRegexp(sub[i])
|
||||||
@ -441,7 +440,6 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
|
|||||||
} else {
|
} else {
|
||||||
// Construct factored form: prefix(suffix1|suffix2|...)
|
// Construct factored form: prefix(suffix1|suffix2|...)
|
||||||
prefix := first
|
prefix := first
|
||||||
|
|
||||||
for j := start; j < i; j++ {
|
for j := start; j < i; j++ {
|
||||||
reuse := j != start // prefix came from sub[start]
|
reuse := j != start // prefix came from sub[start]
|
||||||
sub[j] = p.removeLeadingRegexp(sub[j], reuse)
|
sub[j] = p.removeLeadingRegexp(sub[j], reuse)
|
||||||
@ -605,8 +603,10 @@ func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
|
|||||||
}
|
}
|
||||||
return re
|
return re
|
||||||
}
|
}
|
||||||
re.Op = OpEmptyMatch
|
if reuse {
|
||||||
return re
|
p.reuse(re)
|
||||||
|
}
|
||||||
|
return p.newRegexp(OpEmptyMatch)
|
||||||
}
|
}
|
||||||
|
|
||||||
func literalRegexp(s string, flags Flags) *Regexp {
|
func literalRegexp(s string, flags Flags) *Regexp {
|
||||||
@ -1053,18 +1053,18 @@ func mergeCharClass(dst, src *Regexp) {
|
|||||||
case OpCharClass:
|
case OpCharClass:
|
||||||
// src is simpler, so either literal or char class
|
// src is simpler, so either literal or char class
|
||||||
if src.Op == OpLiteral {
|
if src.Op == OpLiteral {
|
||||||
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
|
dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
|
||||||
} else {
|
} else {
|
||||||
dst.Rune = appendClass(dst.Rune, src.Rune)
|
dst.Rune = appendClass(dst.Rune, src.Rune)
|
||||||
}
|
}
|
||||||
case OpLiteral:
|
case OpLiteral:
|
||||||
// both literal
|
// both literal
|
||||||
if src.Rune[0] == dst.Rune[0] {
|
if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
dst.Op = OpCharClass
|
dst.Op = OpCharClass
|
||||||
dst.Rune = append(dst.Rune, dst.Rune[0])
|
dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
|
||||||
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
|
dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1544,6 +1544,14 @@ func cleanClass(rp *[]int) []int {
|
|||||||
return r[:w]
|
return r[:w]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// appendLiteral returns the result of appending the literal x to the class r.
|
||||||
|
func appendLiteral(r []int, x int, flags Flags) []int {
|
||||||
|
if flags&FoldCase != 0 {
|
||||||
|
return appendFoldedRange(r, x, x)
|
||||||
|
}
|
||||||
|
return appendRange(r, x, x)
|
||||||
|
}
|
||||||
|
|
||||||
// appendRange returns the result of appending the range lo-hi to the class r.
|
// appendRange returns the result of appending the range lo-hi to the class r.
|
||||||
func appendRange(r []int, lo, hi int) []int {
|
func appendRange(r []int, lo, hi int) []int {
|
||||||
// Expand last range or next to last range if it overlaps or abuts.
|
// Expand last range or next to last range if it overlaps or abuts.
|
||||||
|
@ -162,6 +162,18 @@ var parseTests = []struct {
|
|||||||
// Factoring.
|
// Factoring.
|
||||||
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
|
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
|
||||||
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
|
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
|
||||||
|
|
||||||
|
// Bug fixes.
|
||||||
|
{`(?:.)`, `dot{}`},
|
||||||
|
{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
|
||||||
|
{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
|
||||||
|
{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
|
||||||
|
{`(?:A|a)`, `litfold{A}`},
|
||||||
|
{`A|(?:A|a)`, `litfold{A}`},
|
||||||
|
{`(?s).`, `dot{}`},
|
||||||
|
{`(?-s).`, `dnl{}`},
|
||||||
|
{`(?:(?:^).)`, `cat{bol{}dot{}}`},
|
||||||
|
{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
|
||||||
}
|
}
|
||||||
|
|
||||||
const testFlags = MatchNL | PerlX | UnicodeGroups
|
const testFlags = MatchNL | PerlX | UnicodeGroups
|
||||||
|
@ -3,6 +3,7 @@ package syntax
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Compiled program.
|
// Compiled program.
|
||||||
@ -41,6 +42,41 @@ const (
|
|||||||
EmptyNoWordBoundary
|
EmptyNoWordBoundary
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// EmptyOpContext returns the zero-width assertions
|
||||||
|
// satisfied at the position between the runes r1 and r2.
|
||||||
|
// Passing r1 == -1 indicates that the position is
|
||||||
|
// at the beginning of the text.
|
||||||
|
// Passing r2 == -1 indicates that the position is
|
||||||
|
// at the end of the text.
|
||||||
|
func EmptyOpContext(r1, r2 int) EmptyOp {
|
||||||
|
var op EmptyOp
|
||||||
|
if r1 < 0 {
|
||||||
|
op |= EmptyBeginText | EmptyBeginLine
|
||||||
|
}
|
||||||
|
if r1 == '\n' {
|
||||||
|
op |= EmptyBeginLine
|
||||||
|
}
|
||||||
|
if r2 < 0 {
|
||||||
|
op |= EmptyEndText
|
||||||
|
}
|
||||||
|
if r2 == '\n' {
|
||||||
|
op |= EmptyEndLine
|
||||||
|
}
|
||||||
|
if IsWordChar(r1) != IsWordChar(r2) {
|
||||||
|
op |= EmptyWordBoundary
|
||||||
|
} else {
|
||||||
|
op |= EmptyNoWordBoundary
|
||||||
|
}
|
||||||
|
return op
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r int) bool {
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9',
		r == '_':
		return true
	}
	return false
}
|
||||||
|
|
||||||
// An Inst is a single instruction in a regular expression program.
|
// An Inst is a single instruction in a regular expression program.
|
||||||
type Inst struct {
|
type Inst struct {
|
||||||
Op InstOp
|
Op InstOp
|
||||||
@ -79,7 +115,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
|
|||||||
|
|
||||||
// Have prefix; gather characters.
|
// Have prefix; gather characters.
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
for i.Op == InstRune && len(i.Rune) == 1 {
|
for i.Op == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
|
||||||
buf.WriteRune(i.Rune[0])
|
buf.WriteRune(i.Rune[0])
|
||||||
i = p.skipNop(i.Out)
|
i = p.skipNop(i.Out)
|
||||||
}
|
}
|
||||||
@ -116,9 +152,19 @@ func (i *Inst) MatchRune(r int) bool {
|
|||||||
rune := i.Rune
|
rune := i.Rune
|
||||||
|
|
||||||
// Special case: single-rune slice is from literal string, not char class.
|
// Special case: single-rune slice is from literal string, not char class.
|
||||||
// TODO: Case folding.
|
|
||||||
if len(rune) == 1 {
|
if len(rune) == 1 {
|
||||||
return r == rune[0]
|
r0 := rune[0]
|
||||||
|
if r == r0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if Flags(i.Arg)&FoldCase != 0 {
|
||||||
|
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
|
||||||
|
if r == r1 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// Peek at the first few pairs.
|
// Peek at the first few pairs.
|
||||||
@ -232,6 +278,10 @@ func dumpInst(b *bytes.Buffer, i *Inst) {
|
|||||||
// shouldn't happen
|
// shouldn't happen
|
||||||
bw(b, "rune <nil>")
|
bw(b, "rune <nil>")
|
||||||
}
|
}
|
||||||
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
|
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
|
||||||
|
if Flags(i.Arg)&FoldCase != 0 {
|
||||||
|
bw(b, "/i")
|
||||||
|
}
|
||||||
|
bw(b, " -> ", u32(i.Out))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -76,6 +76,16 @@ var compileTests = []struct {
|
|||||||
4 alt -> 3, 6
|
4 alt -> 3, 6
|
||||||
5* alt -> 1, 3
|
5* alt -> 1, 3
|
||||||
6 match
|
6 match
|
||||||
|
`},
|
||||||
|
{"A[Aa]", ` 0 fail
|
||||||
|
1* rune "A" -> 2
|
||||||
|
2 rune "A"/i -> 3
|
||||||
|
3 match
|
||||||
|
`},
|
||||||
|
{"(?:(?:^).)", ` 0 fail
|
||||||
|
1* empty 4 -> 2
|
||||||
|
2 rune "\x00\t\v\U0010ffff" -> 3
|
||||||
|
3 match
|
||||||
`},
|
`},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -282,3 +282,17 @@ func escape(b *bytes.Buffer, r int, force bool) {
|
|||||||
b.WriteString(`}`)
|
b.WriteString(`}`)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MaxCap walks the regexp to find the maximum capture index.
|
||||||
|
func (re *Regexp) MaxCap() int {
|
||||||
|
m := 0
|
||||||
|
if re.Op == OpCapture {
|
||||||
|
m = re.Cap
|
||||||
|
}
|
||||||
|
for _, sub := range re.Sub {
|
||||||
|
if n := sub.MaxCap(); m < n {
|
||||||
|
m = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user