// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp import ( "bufio" "compress/gzip" "fmt" "os" "strconv" "strings" "testing" "utf8" ) // TestRE2 tests this package's regexp API against test cases // considered during RE2's exhaustive tests, which run all possible // regexps over a given set of atoms and operators, up to a given // complexity, over all possible strings over a given alphabet, // up to a given size. Rather than try to link with RE2, we read a // log file containing the test cases and the expected matches. // The log file, re2.txt, is generated by running 'make exhaustive-log' // in the open source RE2 distribution. http://code.google.com/p/re2/ // // The test file format is a sequence of stanzas like: // // strings // "abc" // "123x" // regexps // "[a-z]+" // 0-3;0-3 // -;- // "([0-9])([0-9])([0-9])" // -;- // -;0-3 0-1 1-2 2-3 // // The stanza begins by defining a set of strings, quoted // using Go double-quote syntax, one per line. Then the // regexps section gives a sequence of regexps to run on // the strings. In the block that follows a regexp, each line // gives the semicolon-separated match results of running // the regexp on the corresponding string. // Each match result is either a single -, meaning no match, or a // space-separated sequence of pairs giving the match and // submatch indices. An unmatched subexpression formats // its pair as a single - (not illustrated above). For now // each regexp run produces two match results, one for a // ``full match'' that restricts the regexp to matching the entire // string or nothing, and one for a ``partial match'' that gives // the leftmost first match found in the string. // // Lines beginning with # are comments. Lines beginning with // a capital letter are test names printed during RE2's test suite // and are echoed into t but otherwise ignored. // // At time of writing, re2.txt is 32 MB but compresses to 760 kB, // so we store re2.txt.gz in the repository and decompress it on the fly. // func TestRE2(t *testing.T) { if testing.Short() { t.Log("skipping TestRE2 during short test") return } f, err := os.Open("re2.txt.gz") if err != nil { t.Fatal(err) } defer f.Close() gz, err := gzip.NewReader(f) if err != nil { t.Fatalf("decompress re2.txt.gz: %v", err) } defer gz.Close() lineno := 0 r := bufio.NewReader(gz) var ( str []string input []string inStrings bool re *Regexp refull *Regexp nfail int ncase int ) for { line, err := r.ReadString('\n') if err != nil { if err == os.EOF { break } t.Fatalf("re2.txt:%d: %v", lineno, err) } line = line[:len(line)-1] // chop \n lineno++ switch { case line == "": t.Fatalf("re2.txt:%d: unexpected blank line", lineno) case line[0] == '#': continue case 'A' <= line[0] && line[0] <= 'Z': // Test name. t.Logf("%s\n", line) continue case line == "strings": str = str[:0] inStrings = true case line == "regexps": inStrings = false case line[0] == '"': q, err := strconv.Unquote(line) if err != nil { // Fatal because we'll get out of sync. t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err) } if inStrings { str = append(str, q) continue } // Is a regexp. if len(input) != 0 { t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q) } re, err = tryCompile(q) if err != nil { if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" { // We don't and likely never will support \C; keep going. continue } t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } continue } full := `\A(?:` + q + `)\z` refull, err = tryCompile(full) if err != nil { // Fatal because q worked, so this should always work. t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err) } input = str case line[0] == '-' || '0' <= line[0] && line[0] <= '9': // A sequence of match results. ncase++ if re == nil { // Failed to compile: skip results. continue } if len(input) == 0 { t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno) } var text string text, input = input[0], input[1:] if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { // RE2's \B considers every byte position, // so it sees 'not word boundary' in the // middle of UTF-8 sequences. This package // only considers the positions between runes, // so it disagrees. Skip those cases. continue } res := strings.Split(line, ";") if len(res) != 2 { t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res)) } // res[0] is full match // res[1] is partial match // Run partial match first; don't bother with full if partial fails. have := re.FindStringSubmatchIndex(text) want := parseResult(t, lineno, res[1]) if !same(have, want) { t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } continue } have = refull.FindStringSubmatchIndex(text) want = parseResult(t, lineno, res[0]) if !same(have, want) { t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } } default: t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line) } } if len(input) != 0 { t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input)) } t.Logf("%d cases tested", ncase) } func isSingleBytes(s string) bool { for _, c := range s { if c >= utf8.RuneSelf { return false } } return true } func tryCompile(s string) (re *Regexp, err os.Error) { // Protect against panic during Compile. defer func() { if r := recover(); r != nil { err = fmt.Errorf("panic: %v", r) } }() return Compile(s) } func parseResult(t *testing.T, lineno int, res string) []int { // A single - indicates no match. if res == "-" { return nil } // Otherwise, a space-separated list of pairs. n := 1 for j := 0; j < len(res); j++ { if res[j] == ' ' { n++ } } out := make([]int, 2*n) i := 0 n = 0 for j := 0; j <= len(res); j++ { if j == len(res) || res[j] == ' ' { // Process a single pair. - means no submatch. pair := res[i:j] if pair == "-" { out[n] = -1 out[n+1] = -1 } else { k := strings.Index(pair, "-") if k < 0 { t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) } lo, err1 := strconv.Atoi(pair[:k]) hi, err2 := strconv.Atoi(pair[k+1:]) if err1 != nil || err2 != nil || lo > hi { t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) } out[n] = lo out[n+1] = hi } n += 2 i = j + 1 } } return out } func same(x, y []int) bool { if len(x) != len(y) { return false } for i, xi := range x { if xi != y[i] { return false } } return true }