exp/regexp: bug fixes and RE2 tests

Also add exp/regexp to build (forgot before). At this point I am very confident in exp/regexp's behavior. It should be usable as a drop-in replacement for regexp now. Later CLs could introduce a CompilePOSIX to get at traditional POSIX ``extended regular expressions'' as in egrep and also an re.MatchLongest method to change the matching mode to leftmost longest instead of leftmost first. On the other hand, I expect very few people to use either. R=r, r, gustavo CC=golang-dev https://golang.org/cl/4990041
2024-11-24 22:57:57 -07:00 · 2011-09-07 15:48:06 -04:00 · 2011-09-07 15:48:06 -04:00 · 08ae1a5a23
commit 08ae1a5a23
parent a2c2c87439
12 changed files with 474 additions and 58 deletions
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@ -81,6 +81,7 @@ DIRS=\
 	exp/gui\
 	exp/gui/x11\
 	exp/norm\
 	exp/regexp\
 	exp/regexp/syntax\
 	exp/template/html\
 	expvar\
--- a/src/pkg/exp/regexp/exec.go
+++ b/src/pkg/exp/regexp/exec.go
@ -90,23 +90,12 @@ func (m *machine) match(i input, pos int) bool {
 	if rune != endOfText {
 		rune1, width1 = i.step(pos + width)
 	}
 	// TODO: Let caller specify the initial flag setting.
 	// For now assume pos == 0 is beginning of text and
 	// pos != 0 is not even beginning of line.
 	// TODO: Word boundary.
 	var flag syntax.EmptyOp
 	if pos == 0 {
-		flag = syntax.EmptyBeginText | syntax.EmptyBeginLine
+		flag = syntax.EmptyOpContext(-1, rune)
 	} else {
 		flag = i.context(pos)
 	}
 	// Update flag using lookahead rune.
 	if rune1 == '\n' {
 		flag |= syntax.EmptyEndLine
 	}
 	if rune1 == endOfText {
 		flag |= syntax.EmptyEndText
 	}
 	for {
 		if len(runq.dense) == 0 {
 			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
@ -134,17 +123,7 @@ func (m *machine) match(i input, pos int) bool {
 			}
 			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
 		}
-		// TODO: word boundary
+		flag = syntax.EmptyOpContext(rune, rune1)
 		flag = 0
 		if rune == '\n' {
 			flag |= syntax.EmptyBeginLine
 		}
 		if rune1 == '\n' {
 			flag |= syntax.EmptyEndLine
 		}
 		if rune1 == endOfText {
 			flag |= syntax.EmptyEndText
 		}
 		m.step(runq, nextq, pos, pos+width, rune, flag)
 		if width == 0 {
 			break
--- a/src/pkg/exp/regexp/exec_test.go
+++ b/src/pkg/exp/regexp/exec_test.go
@ -0,0 +1,271 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package regexp
 import (
 	"bufio"
 	"compress/gzip"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 	"testing"
 	"utf8"
 )
 // TestRE2 tests this package's regexp API against test cases
 // considered during RE2's exhaustive tests, which run all possible
 // regexps over a given set of atoms and operators, up to a given
 // complexity, over all possible strings over a given alphabet,
 // up to a given size.  Rather than try to link with RE2, we read a
 // log file containing the test cases and the expected matches.
 // The log file, re2.txt, is generated by running 'make exhaustive-log'
 // in the open source RE2 distribution.  http://code.google.com/p/re2/
 //
 // The test file format is a sequence of stanzas like:
 //
 //	strings
 //	"abc"
 //	"123x"
 //	regexps
 //	"[a-z]+"
 //	0-3;0-3
 //	-;-
 //	"([0-9])([0-9])([0-9])"
 //	-;-
 //	-;0-3 0-1 1-2 2-3
 //
 // The stanza begins by defining a set of strings, quoted
 // using Go double-quote syntax, one per line.  Then the
 // regexps section gives a sequence of regexps to run on
 // the strings.  In the block that follows a regexp, each line
 // gives the semicolon-separated match results of running
 // the regexp on the corresponding string.
 // Each match result is either a single -, meaning no match, or a
 // space-separated sequence of pairs giving the match and
 // submatch indices.  An unmatched subexpression formats
 // its pair as a single - (not illustrated above).  For now
 // each regexp run produces two match results, one for a
 // ``full match'' that restricts the regexp to matching the entire
 // string or nothing, and one for a ``partial match'' that gives
 // the leftmost first match found in the string.
 //
 // Lines beginning with # are comments.  Lines beginning with
 // a capital letter are test names printed during RE2's test suite
 // and are echoed into t but otherwise ignored.
 //
 // At time of writing, re2.txt is 32 MB but compresses to 760 kB,
 // so we store re2.txt.gz in the repository and decompress it on the fly.
 //
 func TestRE2(t *testing.T) {
 	// The full log is large, so it is skipped when -short is given.
 	if testing.Short() {
 		t.Log("skipping TestRE2 during short test")
 		return
 	}
 	// Open the compressed log and decompress it on the fly.
 	f, err := os.Open("re2.txt.gz")
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	gz, err := gzip.NewReader(f)
 	if err != nil {
 		t.Fatalf("decompress re2.txt.gz: %v", err)
 	}
 	defer gz.Close()
 	lineno := 0
 	r := bufio.NewReader(gz)
 	// Parser state:
 	//   str       - strings collected from the current "strings" section
 	//   input     - strings not yet consumed by result lines of the current regexp
 	//   inStrings - true between a "strings" line and the next "regexps" line
 	//   re        - the most recently compiled regexp (nil if compilation failed)
 	//   refull    - re wrapped as \A(?:...)\z, used for the full-match result
 	//   nfail     - number of failures reported so far; the test stops at 100
 	//   ncase     - number of result lines seen
 	var (
 		str       []string
 		input     []string
 		inStrings bool
 		re        *Regexp
 		refull    *Regexp
 		nfail     int
 		ncase     int
 	)
 	for {
 		line, err := r.ReadString('\n')
 		if err != nil {
 			if err == os.EOF {
 				break
 			}
 			t.Fatalf("re2.txt:%d: %v", lineno, err)
 		}
 		line = line[:len(line)-1] // chop \n
 		lineno++
 		// Dispatch on the kind of line; the order of the cases matters
 		// because several of them test only the first byte.
 		switch {
 		case line == "":
 			t.Fatalf("re2.txt:%d: unexpected blank line", lineno)
 		case line[0] == '#':
 			continue
 		case 'A' <= line[0] && line[0] <= 'Z':
 			// Test name.
 			t.Logf("%s\n", line)
 			continue
 		case line == "strings":
 			// Start a new stanza: drop the old strings but keep the backing array.
 			str = str[:0]
 			inStrings = true
 		case line == "regexps":
 			inStrings = false
 		case line[0] == '"':
 			// A quoted line is a test string while inStrings is set,
 			// and a regexp otherwise.
 			q, err := strconv.Unquote(line)
 			if err != nil {
 				// Fatal because we'll get out of sync.
 				t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err)
 			}
 			if inStrings {
 				str = append(str, q)
 				continue
 			}
 			// Is a regexp.
 			// Every string of the previous regexp must have been consumed.
 			if len(input) != 0 {
 				t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q)
 			}
 			re, err = tryCompile(q)
 			if err != nil {
 				if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" {
 					// We don't and likely never will support \C; keep going.
 					continue
 				}
 				t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err)
 				if nfail++; nfail >= 100 {
 					t.Fatalf("stopping after %d errors", nfail)
 				}
 				continue
 			}
 			// Build the anchored variant used to check the full-match result.
 			full := `\A(?:` + q + `)\z`
 			refull, err = tryCompile(full)
 			if err != nil {
 				// Fatal because q worked, so this should always work.
 				t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err)
 			}
 			// Each result line that follows consumes one string from input.
 			input = str
 		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
 			// A sequence of match results.
 			ncase++
 			if re == nil {
 				// Failed to compile: skip results.
 				continue
 			}
 			if len(input) == 0 {
 				t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno)
 			}
 			// Pop the string this result line corresponds to.
 			var text string
 			text, input = input[0], input[1:]
 			if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
 				// RE2's \B considers every byte position,
 				// so it sees 'not word boundary' in the
 				// middle of UTF-8 sequences.  This package
 				// only considers the positions between runes,
 				// so it disagrees.  Skip those cases.
 				continue
 			}
 			res := strings.Split(line, ";")
 			if len(res) != 2 {
 				t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res))
 			}
 			// res[0] is full match
 			// res[1] is partial match
 			// Run partial match first; don't bother with full if partial fails.
 			have := re.FindStringSubmatchIndex(text)
 			want := parseResult(t, lineno, res[1])
 			if !same(have, want) {
 				t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want)
 				if nfail++; nfail >= 100 {
 					t.Fatalf("stopping after %d errors", nfail)
 				}
 				continue
 			}
 			have = refull.FindStringSubmatchIndex(text)
 			want = parseResult(t, lineno, res[0])
 			if !same(have, want) {
 				t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want)
 				if nfail++; nfail >= 100 {
 					t.Fatalf("stopping after %d errors", nfail)
 				}
 			}
 		default:
 			t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line)
 		}
 	}
 	// At EOF the last regexp must have consumed all of its strings.
 	if len(input) != 0 {
 		t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input))
 	}
 	t.Logf("%d cases tested", ncase)
 }
 // isSingleBytes reports whether every rune in s is encoded as a single byte,
 // that is, whether s contains no byte at or above utf8.RuneSelf.
 func isSingleBytes(s string) bool {
 	for k := 0; k < len(s); k++ {
 		// Any multi-byte or invalid encoding contains a byte >= RuneSelf.
 		if s[k] >= utf8.RuneSelf {
 			return false
 		}
 	}
 	return true
 }
 // tryCompile compiles s with Compile, converting a panic raised during
 // compilation into a returned error instead of aborting the test.
 func tryCompile(s string) (re *Regexp, err os.Error) {
 	defer func() {
 		caught := recover()
 		if caught == nil {
 			return
 		}
 		// Compile panicked: re keeps its zero value and err describes the panic.
 		err = fmt.Errorf("panic: %v", caught)
 	}()
 	re, err = Compile(s)
 	return re, err
 }
 // parseResult converts one match result from re2.txt into the index slice
 // format returned by the FindSubmatchIndex family.  A result of "-" yields
 // nil; otherwise each space-separated "lo-hi" pair contributes lo, hi and
 // each "-" pair contributes -1, -1.  Malformed pairs abort the test.
 func parseResult(t *testing.T, lineno int, res string) []int {
 	if res == "-" {
 		// No match at all.
 		return nil
 	}
 	pairs := strings.Split(res, " ")
 	out := make([]int, 0, 2*len(pairs))
 	for _, pair := range pairs {
 		if pair == "-" {
 			// Subexpression did not participate in the match.
 			out = append(out, -1, -1)
 			continue
 		}
 		dash := strings.Index(pair, "-")
 		if dash < 0 {
 			t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
 		}
 		lo, loErr := strconv.Atoi(pair[:dash])
 		hi, hiErr := strconv.Atoi(pair[dash+1:])
 		if loErr != nil || hiErr != nil || lo > hi {
 			t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
 		}
 		out = append(out, lo, hi)
 	}
 	return out
 }
 // same reports whether x and y have equal length and equal elements.
 // A nil slice and an empty slice compare as the same.
 func same(x, y []int) bool {
 	n := len(x)
 	if n != len(y) {
 		return false
 	}
 	for k := 0; k < n; k++ {
 		if x[k] != y[k] {
 			return false
 		}
 	}
 	return true
 }
--- a/src/pkg/exp/regexp/find_test.go
+++ b/src/pkg/exp/regexp/find_test.go
@ -80,6 +80,23 @@ var findTests = []FindTest{
 	{`data`, "daXY data", build(1, 5, 9)},
 	{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
 	{`zx+`, "zzx", build(1, 1, 3)},
 	{`ab$`, "abcab", build(1, 3, 5)},
 	{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
 	{`(?:.|(?:.a))`, "", nil},
 	{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
 	{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
 	{`(a){0}`, "", build(1, 0, 0, -1, -1)},
 	{`(?-s)(?:(?:^).)`, "\n", nil},
 	{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
 	{`(?:(?:^).)`, "\n", nil},
 	{`\b`, "x", build(2, 0, 0, 1, 1)},
 	{`\b`, "xx", build(2, 0, 0, 2, 2)},
 	{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
 	{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
 	{`\B`, "x", nil},
 	{`\B`, "xx", build(1, 1, 1)},
 	{`\B`, "x y", nil},
 	{`\B`, "xx yy", build(2, 1, 1, 4, 4)},
 	// can backslash-escape any punctuation
 	{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
--- a/src/pkg/exp/regexp/re2.txt.gz
+++ b/src/pkg/exp/regexp/re2.txt.gz
--- a/src/pkg/exp/regexp/regexp.go
+++ b/src/pkg/exp/regexp/regexp.go
@ -84,6 +84,7 @@ type Regexp struct {
 	prefixComplete bool           // prefix is the entire regexp
 	prefixRune     int            // first rune in prefix
 	cond           syntax.EmptyOp // empty-width conditions required at start of match
 	numSubexp      int
 	// cache of machines for running regexp
 	mu      sync.Mutex
@ -102,13 +103,16 @@ func Compile(expr string) (*Regexp, os.Error) {
 	if err != nil {
 		return nil, err
 	}
 	maxCap := re.MaxCap()
 	re = re.Simplify()
 	prog, err := syntax.Compile(re)
 	if err != nil {
 		return nil, err
 	}
 	regexp := &Regexp{
-		expr: expr,
+		expr:      expr,
-		prog: prog,
+		prog:      prog,
 		numSubexp: maxCap,
 	}
 	regexp.prefix, regexp.prefixComplete = prog.Prefix()
 	if regexp.prefix != "" {
@ -161,9 +165,7 @@ func MustCompile(str string) *Regexp {
 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
 func (re *Regexp) NumSubexp() int {
-	// NumCap/2 because captures count ( and ) separately.
+	return re.numSubexp
 	// -1 because NumCap counts $0 but NumSubexp does not.
 	return re.prog.NumCap/2 - 1
 }
 const endOfText = -1
@ -175,6 +177,7 @@ type input interface {
 	canCheckPrefix() bool               // can we look ahead without losing info?
 	hasPrefix(re *Regexp) bool
 	index(re *Regexp, pos int) int
 	context(pos int) syntax.EmptyOp
 }
 // inputString scans a string.
@ -205,6 +208,17 @@ func (i *inputString) index(re *Regexp, pos int) int {
 	return strings.Index(i.str[pos:], re.prefix)
 }
 func (i *inputString) context(pos int) syntax.EmptyOp {
 	// before and after are the runes on either side of pos;
 	// -1 stands for "no rune on that side".
 	before, after := -1, -1
 	if pos < len(i.str) {
 		after, _ = utf8.DecodeRuneInString(i.str[pos:])
 	}
 	if 0 < pos && pos <= len(i.str) {
 		before, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 	}
 	return syntax.EmptyOpContext(before, after)
 }
 // inputBytes scans a byte slice.
 type inputBytes struct {
 	str []byte
@ -233,6 +247,17 @@ func (i *inputBytes) index(re *Regexp, pos int) int {
 	return bytes.Index(i.str[pos:], re.prefixBytes)
 }
 func (i *inputBytes) context(pos int) syntax.EmptyOp {
 	// before and after are the runes on either side of pos;
 	// -1 stands for "no rune on that side".
 	before, after := -1, -1
 	if pos < len(i.str) {
 		after, _ = utf8.DecodeRune(i.str[pos:])
 	}
 	if 0 < pos && pos <= len(i.str) {
 		before, _ = utf8.DecodeLastRune(i.str[:pos])
 	}
 	return syntax.EmptyOpContext(before, after)
 }
 // inputReader scans a RuneReader.
 type inputReader struct {
 	r     io.RuneReader
@ -270,6 +295,10 @@ func (i *inputReader) index(re *Regexp, pos int) int {
 	return -1
 }
 func (i *inputReader) context(pos int) syntax.EmptyOp {
 	// Always reports an empty set of assertions, regardless of pos;
 	// unlike the string and byte inputs, nothing around pos is inspected here.
 	return 0
 }
 // LiteralPrefix returns a literal string that must begin any match
 // of the regular expression re.  It returns the boolean true if the
 // literal string comprises the entire regular expression.
@ -458,6 +487,23 @@ func QuoteMeta(s string) string {
 	return string(b[0:j])
 }
 // The number of capture values in the program may correspond
 // to fewer capturing expressions than are in the regexp.
 // For example, "(a){0}" turns into an empty program, so the
 // maximum capture in the program is 0 but we need to return
 // an expression for \1.  Pad appends -1s to the slice a as needed.
 func (re *Regexp) pad(a []int) []int {
 	if a != nil {
 		// One index pair for the whole match plus one per subexpression.
 		for want := 2 * (re.numSubexp + 1); len(a) < want; {
 			a = append(a, -1)
 		}
 	}
 	// A nil slice (no match) is handed back unchanged.
 	return a
 }
 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 	var end int
@ -505,7 +551,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 		prevMatchEnd = matches[1]
 		if accept {
-			deliver(matches)
+			deliver(re.pad(matches))
 			i++
 		}
 	}
@ -580,9 +626,9 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 	if a == nil {
 		return nil
 	}
-	ret := make([][]byte, len(a)/2)
+	ret := make([][]byte, 1+re.numSubexp)
 	for i := range ret {
-		if a[2*i] >= 0 {
+		if 2*i < len(a) && a[2*i] >= 0 {
 			ret[i] = b[a[2*i]:a[2*i+1]]
 		}
 	}
@ -595,7 +641,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 // in the package comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
-	return re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
+	return re.pad(re.doExecute(newInputBytes(b), 0, re.prog.NumCap))
 }
 // FindStringSubmatch returns a slice of strings holding the text of the
@ -608,9 +654,9 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
 	if a == nil {
 		return nil
 	}
-	ret := make([]string, len(a)/2)
+	ret := make([]string, 1+re.numSubexp)
 	for i := range ret {
-		if a[2*i] >= 0 {
+		if 2*i < len(a) && a[2*i] >= 0 {
 			ret[i] = s[a[2*i]:a[2*i+1]]
 		}
 	}
@ -623,7 +669,7 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
 // 'Index' descriptions in the package comment.
 // A return value of nil indicates no match.
 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
-	return re.doExecute(newInputString(s), 0, re.prog.NumCap)
+	return re.pad(re.doExecute(newInputString(s), 0, re.prog.NumCap))
 }
 // FindReaderSubmatchIndex returns a slice holding the index pairs
@ -632,7 +678,7 @@ func (re *Regexp) FindStringSubmatchIndex(s string) []int {
 // by the 'Submatch' and 'Index' descriptions in the package comment.  A
 // return value of nil indicates no match.
 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
-	return re.doExecute(newInputReader(r), 0, re.prog.NumCap)
+	return re.pad(re.doExecute(newInputReader(r), 0, re.prog.NumCap))
 }
 const startSize = 10 // The size at which to start a slice in the 'All' routines.
--- a/src/pkg/exp/regexp/syntax/compile.go
+++ b/src/pkg/exp/regexp/syntax/compile.go
@ -75,6 +75,7 @@ type compiler struct {
 }
 // Compile compiles the regexp into a program to be executed.
 // The regexp should have been simplified already (returned from re.Simplify).
 func Compile(re *Regexp) (*Prog, os.Error) {
 	var c compiler
 	c.init()
@ -90,7 +91,7 @@ func (c *compiler) init() {
 	c.inst(InstFail)
 }
-var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune}
+var anyRuneNotNL = []int{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
 var anyRune = []int{0, unicode.MaxRune}
 func (c *compiler) compile(re *Regexp) frag {
@ -105,7 +106,7 @@ func (c *compiler) compile(re *Regexp) frag {
 		}
 		var f frag
 		for j := range re.Rune {
-			f1 := c.rune(re.Rune[j : j+1])
+			f1 := c.rune(re.Rune[j:j+1], re.Flags)
 			if j == 0 {
 				f = f1
 			} else {
@ -114,11 +115,11 @@ func (c *compiler) compile(re *Regexp) frag {
 		}
 		return f
 	case OpCharClass:
-		return c.rune(re.Rune)
+		return c.rune(re.Rune, re.Flags)
 	case OpAnyCharNotNL:
-		return c.rune(anyRuneNotNL)
+		return c.rune(anyRuneNotNL, 0)
 	case OpAnyChar:
-		return c.rune(anyRune)
+		return c.rune(anyRune, 0)
 	case OpBeginLine:
 		return c.empty(EmptyBeginLine)
 	case OpEndLine:
@ -261,9 +262,16 @@ func (c *compiler) empty(op EmptyOp) frag {
 	return f
 }
-func (c *compiler) rune(rune []int) frag {
+func (c *compiler) rune(rune []int, flags Flags) frag {
 	f := c.inst(InstRune)
-	c.p.Inst[f.i].Rune = rune
+	i := &c.p.Inst[f.i]
 	i.Rune = rune
 	flags &= FoldCase // only relevant flag is FoldCase
 	if len(rune) != 1 || unicode.SimpleFold(rune[0]) == rune[0] {
 		// and sometimes not even that
 		flags &^= FoldCase
 	}
 	i.Arg = uint32(flags)
 	f.out = patchList(f.i << 1)
 	return f
 }
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@ -419,8 +419,7 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
 		// used or marked for reuse, and the slice space has been reused
 		// for out (len(out) <= start).
 		//
-		// Invariant: sub[start:i] consists of regexps that all begin
+		// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
 		// with str as modified by strflags.
 		var ifirst *Regexp
 		if i < len(sub) {
 			ifirst = p.leadingRegexp(sub[i])
@ -441,7 +440,6 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
 		} else {
 			// Construct factored form: prefix(suffix1|suffix2|...)
 			prefix := first
 			for j := start; j < i; j++ {
 				reuse := j != start // prefix came from sub[start] 
 				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
@ -605,8 +603,10 @@ func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
 		}
 		return re
 	}
-	re.Op = OpEmptyMatch
+	if reuse {
-	return re
+		p.reuse(re)
 	}
 	return p.newRegexp(OpEmptyMatch)
 }
 func literalRegexp(s string, flags Flags) *Regexp {
@ -1053,18 +1053,18 @@ func mergeCharClass(dst, src *Regexp) {
 	case OpCharClass:
 		// src is simpler, so either literal or char class
 		if src.Op == OpLiteral {
-			dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
+			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
 		} else {
 			dst.Rune = appendClass(dst.Rune, src.Rune)
 		}
 	case OpLiteral:
 		// both literal
-		if src.Rune[0] == dst.Rune[0] {
+		if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
 			break
 		}
 		dst.Op = OpCharClass
-		dst.Rune = append(dst.Rune, dst.Rune[0])
+		dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
-		dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
+		dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
 	}
 }
@ -1544,6 +1544,14 @@ func cleanClass(rp *[]int) []int {
 	return r[:w]
 }
 // appendLiteral returns the result of appending the literal x to the class r.
 func appendLiteral(r []int, x int, flags Flags) []int {
 	if flags&FoldCase == 0 {
 		// FoldCase is clear: add only the single-rune range x-x.
 		return appendRange(r, x, x)
 	}
 	// FoldCase is set: let appendFoldedRange add x-x.
 	return appendFoldedRange(r, x, x)
 }
 // appendRange returns the result of appending the range lo-hi to the class r.
 func appendRange(r []int, lo, hi int) []int {
 	// Expand last range or next to last range if it overlaps or abuts.
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@ -162,6 +162,18 @@ var parseTests = []struct {
 	// Factoring.
 	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
 	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
 	// Bug fixes.
 	{`(?:.)`, `dot{}`},
 	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
 	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
 	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
 	{`(?:A|a)`, `litfold{A}`},
 	{`A|(?:A|a)`, `litfold{A}`},
 	{`(?s).`, `dot{}`},
 	{`(?-s).`, `dnl{}`},
 	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
 	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
 }
 const testFlags = MatchNL | PerlX | UnicodeGroups
--- a/src/pkg/exp/regexp/syntax/prog.go
+++ b/src/pkg/exp/regexp/syntax/prog.go
@ -3,6 +3,7 @@ package syntax
 import (
 	"bytes"
 	"strconv"
 	"unicode"
 )
 // Compiled program.
@ -41,6 +42,41 @@ const (
 	EmptyNoWordBoundary
 )
 // EmptyOpContext returns the zero-width assertions
 // satisfied at the position between the runes r1 and r2.
 // Passing r1 == -1 indicates that the position is
 // at the beginning of the text.
 // Passing r2 == -1 indicates that the position is
 // at the end of the text.
 func EmptyOpContext(r1, r2 int) EmptyOp {
 	var op EmptyOp
 	// Left side: start of text also counts as start of line.
 	switch {
 	case r1 < 0:
 		op |= EmptyBeginText | EmptyBeginLine
 	case r1 == '\n':
 		op |= EmptyBeginLine
 	}
 	// Right side: end of text or an upcoming newline.
 	switch {
 	case r2 < 0:
 		op |= EmptyEndText
 	case r2 == '\n':
 		op |= EmptyEndLine
 	}
 	// Exactly one of the two word-boundary bits is always set.
 	if IsWordChar(r1) == IsWordChar(r2) {
 		op |= EmptyNoWordBoundary
 	} else {
 		op |= EmptyWordBoundary
 	}
 	return op
 }
 // IsWordChar reports whether r is considered a ``word character''
 // during the evaluation of the \b and \B zero-width assertions.
 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
 func IsWordChar(r int) bool {
 	switch {
 	case 'A' <= r && r <= 'Z', 'a' <= r && r <= 'z', '0' <= r && r <= '9':
 		// ASCII letter or digit.
 		return true
 	}
 	return r == '_'
 }
 // An Inst is a single instruction in a regular expression program.
 type Inst struct {
 	Op   InstOp
@ -79,7 +115,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
 	// Have prefix; gather characters.
 	var buf bytes.Buffer
-	for i.Op == InstRune && len(i.Rune) == 1 {
+	for i.Op == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
 		buf.WriteRune(i.Rune[0])
 		i = p.skipNop(i.Out)
 	}
@ -116,9 +152,19 @@ func (i *Inst) MatchRune(r int) bool {
 	rune := i.Rune
 	// Special case: single-rune slice is from literal string, not char class.
 	// TODO: Case folding.
 	if len(rune) == 1 {
-		return r == rune[0]
+		r0 := rune[0]
 		if r == r0 {
 			return true
 		}
 		if Flags(i.Arg)&FoldCase != 0 {
 			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
 				if r == r1 {
 					return true
 				}
 			}
 		}
 		return false
 	}
 	// Peek at the first few pairs.
@ -232,6 +278,10 @@ func dumpInst(b *bytes.Buffer, i *Inst) {
 			// shouldn't happen
 			bw(b, "rune <nil>")
 		}
-		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
+		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
 		if Flags(i.Arg)&FoldCase != 0 {
 			bw(b, "/i")
 		}
 		bw(b, " -> ", u32(i.Out))
 	}
 }
--- a/src/pkg/exp/regexp/syntax/prog_test.go
+++ b/src/pkg/exp/regexp/syntax/prog_test.go
@ -76,6 +76,16 @@ var compileTests = []struct {
  4	alt -> 3, 6
  5*	alt -> 1, 3
  6	match
 `},
 	{"A[Aa]", `  0	fail
  1*	rune "A" -> 2
  2	rune "A"/i -> 3
  3	match
 `},
 	{"(?:(?:^).)", `  0	fail
  1*	empty 4 -> 2
  2	rune "\x00\t\v\U0010ffff" -> 3
  3	match
 `},
 }
--- a/src/pkg/exp/regexp/syntax/regexp.go
+++ b/src/pkg/exp/regexp/syntax/regexp.go
@ -282,3 +282,17 @@ func escape(b *bytes.Buffer, r int, force bool) {
 		b.WriteString(`}`)
 	}
 }
 // MaxCap walks the regexp to find the maximum capture index.
 func (re *Regexp) MaxCap() int {
 	// Seed with this node's own capture index when it is a capture group.
 	best := 0
 	if re.Op == OpCapture {
 		best = re.Cap
 	}
 	// Fold in the largest index found beneath each sub-expression.
 	for k := range re.Sub {
 		below := re.Sub[k].MaxCap()
 		if below > best {
 			best = below
 		}
 	}
 	return best
 }