1
0
mirror of https://github.com/golang/go synced 2024-10-05 06:11:21 -06:00
go/src/pkg/exp/regexp/exec_test.go

272 lines
7.1 KiB
Go
Raw Normal View History

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"bufio"
"compress/gzip"
"fmt"
"os"
"strconv"
"strings"
"testing"
"utf8"
)
// TestRE2 tests this package's regexp API against test cases
// considered during RE2's exhaustive tests, which run all possible
// regexps over a given set of atoms and operators, up to a given
// complexity, over all possible strings over a given alphabet,
// up to a given size. Rather than try to link with RE2, we read a
// log file containing the test cases and the expected matches.
// The log file, re2.txt, is generated by running 'make exhaustive-log'
// in the open source RE2 distribution. http://code.google.com/p/re2/
//
// The test file format is a sequence of stanzas like:
//
// strings
// "abc"
// "123x"
// regexps
// "[a-z]+"
// 0-3;0-3
// -;-
// "([0-9])([0-9])([0-9])"
// -;-
// -;0-3 0-1 1-2 2-3
//
// The stanza begins by defining a set of strings, quoted
// using Go double-quote syntax, one per line. Then the
// regexps section gives a sequence of regexps to run on
// the strings. In the block that follows a regexp, each line
// gives the semicolon-separated match results of running
// the regexp on the corresponding string.
// Each match result is either a single -, meaning no match, or a
// space-separated sequence of pairs giving the match and
// submatch indices. An unmatched subexpression formats
// its pair as a single - (not illustrated above). For now
// each regexp run produces two match results, one for a
// ``full match'' that restricts the regexp to matching the entire
// string or nothing, and one for a ``partial match'' that gives
// the leftmost first match found in the string.
//
// Lines beginning with # are comments. Lines beginning with
// a capital letter are test names printed during RE2's test suite
// and are echoed into t but otherwise ignored.
//
// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
// so we store re2.txt.gz in the repository and decompress it on the fly.
//
func TestRE2(t *testing.T) {
if testing.Short() {
t.Log("skipping TestRE2 during short test")
return
}
f, err := os.Open("re2.txt.gz")
if err != nil {
t.Fatal(err)
}
defer f.Close()
gz, err := gzip.NewReader(f)
if err != nil {
t.Fatalf("decompress re2.txt.gz: %v", err)
}
defer gz.Close()
lineno := 0
r := bufio.NewReader(gz)
var (
str []string
input []string
inStrings bool
re *Regexp
refull *Regexp
nfail int
ncase int
)
for {
line, err := r.ReadString('\n')
if err != nil {
if err == os.EOF {
break
}
t.Fatalf("re2.txt:%d: %v", lineno, err)
}
line = line[:len(line)-1] // chop \n
lineno++
switch {
case line == "":
t.Fatalf("re2.txt:%d: unexpected blank line", lineno)
case line[0] == '#':
continue
case 'A' <= line[0] && line[0] <= 'Z':
// Test name.
t.Logf("%s\n", line)
continue
case line == "strings":
str = str[:0]
inStrings = true
case line == "regexps":
inStrings = false
case line[0] == '"':
q, err := strconv.Unquote(line)
if err != nil {
// Fatal because we'll get out of sync.
t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err)
}
if inStrings {
str = append(str, q)
continue
}
// Is a regexp.
if len(input) != 0 {
t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q)
}
re, err = tryCompile(q)
if err != nil {
if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" {
// We don't and likely never will support \C; keep going.
continue
}
t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
full := `\A(?:` + q + `)\z`
refull, err = tryCompile(full)
if err != nil {
// Fatal because q worked, so this should always work.
t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err)
}
input = str
case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
// A sequence of match results.
ncase++
if re == nil {
// Failed to compile: skip results.
continue
}
if len(input) == 0 {
t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno)
}
var text string
text, input = input[0], input[1:]
if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
// RE2's \B considers every byte position,
// so it sees 'not word boundary' in the
// middle of UTF-8 sequences. This package
// only considers the positions between runes,
// so it disagrees. Skip those cases.
continue
}
res := strings.Split(line, ";")
if len(res) != 2 {
t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res))
}
// res[0] is full match
// res[1] is partial match
// Run partial match first; don't bother with full if partial fails.
have := re.FindStringSubmatchIndex(text)
want := parseResult(t, lineno, res[1])
if !same(have, want) {
t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
have = refull.FindStringSubmatchIndex(text)
want = parseResult(t, lineno, res[0])
if !same(have, want) {
t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
}
default:
t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line)
}
}
if len(input) != 0 {
t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input))
}
t.Logf("%d cases tested", ncase)
}
func isSingleBytes(s string) bool {
for _, c := range s {
if c >= utf8.RuneSelf {
return false
}
}
return true
}
func tryCompile(s string) (re *Regexp, err os.Error) {
// Protect against panic during Compile.
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic: %v", r)
}
}()
return Compile(s)
}
func parseResult(t *testing.T, lineno int, res string) []int {
// A single - indicates no match.
if res == "-" {
return nil
}
// Otherwise, a space-separated list of pairs.
n := 1
for j := 0; j < len(res); j++ {
if res[j] == ' ' {
n++
}
}
out := make([]int, 2*n)
i := 0
n = 0
for j := 0; j <= len(res); j++ {
if j == len(res) || res[j] == ' ' {
// Process a single pair. - means no submatch.
pair := res[i:j]
if pair == "-" {
out[n] = -1
out[n+1] = -1
} else {
k := strings.Index(pair, "-")
if k < 0 {
t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
}
lo, err1 := strconv.Atoi(pair[:k])
hi, err2 := strconv.Atoi(pair[k+1:])
if err1 != nil || err2 != nil || lo > hi {
t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
}
out[n] = lo
out[n+1] = hi
}
n += 2
i = j + 1
}
}
return out
}
func same(x, y []int) bool {
if len(x) != len(y) {
return false
}
for i, xi := range x {
if xi != y[i] {
return false
}
}
return true
}