exp/regexp/syntax: finish Regexp manipulation

Except for the inevitable bug fixes, the Regexp code is done. R=sam.thorogood, r CC=golang-dev https://golang.org/cl/4635082
2024-11-24 23:57:57 -07:00 · 2011-06-30 10:26:22 -04:00 · 2011-06-30 10:26:22 -04:00 · b4cae4aee2
commit b4cae4aee2
parent a809abafa5
5 changed files with 713 additions and 39 deletions
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@ -106,8 +106,6 @@ func (p *parser) reuse(re *Regexp) {
 // push pushes the regexp re onto the parse stack and returns the regexp.
 func (p *parser) push(re *Regexp) *Regexp {
 	// TODO: compute simple
 	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
 		// Single rune.
 		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
@ -250,7 +248,7 @@ func (p *parser) concat() *Regexp {
 		return p.push(p.newRegexp(OpEmptyMatch))
 	}
-	return p.collapse(subs, OpConcat)
+	return p.push(p.collapse(subs, OpConcat))
 }
 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
@ -276,7 +274,7 @@ func (p *parser) alternate() *Regexp {
 		return p.push(p.newRegexp(OpNoMatch))
 	}
-	return p.collapse(subs, OpAlternate)
+	return p.push(p.collapse(subs, OpAlternate))
 }
 // cleanAlt cleans re for eventual inclusion in an alternation.
@ -302,13 +300,13 @@ func cleanAlt(re *Regexp) {
 	}
 }
-// collapse pushes the result of applying op to sub
+// collapse returns the result of applying op to sub.
-// onto the stack.  If sub contains op nodes, they all
+// If sub contains op nodes, they all get hoisted up
-// get flattened into a single node.
+// so that there is never a concat of a concat or an
-// sub points into p.stack so it cannot be kept.
+// alternate of an alternate.
 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
 	if len(subs) == 1 {
-		return p.push(subs[0])
+		return subs[0]
 	}
 	re := p.newRegexp(op)
 	re.Sub = re.Sub0[:0]
@ -320,7 +318,295 @@ func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
 			re.Sub = append(re.Sub, sub)
 		}
 	}
-	return p.push(re)
+	if op == OpAlternate {
 		re.Sub = p.factor(re.Sub, re.Flags)
 		if len(re.Sub) == 1 {
 			old := re
 			re = re.Sub[0]
 			p.reuse(old)
 		}
 	}
 	return re
 }
 // factor factors common prefixes from the alternation list sub.
 // It returns a replacement list that reuses the same storage and
 // frees (passes to p.reuse) any removed *Regexps.
 //
 // For example,
 //     ABC|ABD|AEF|BCX|BCY
 // simplifies by literal prefix extraction to
 //     A(B(C|D)|EF)|BC(X|Y)
 // which simplifies by character class introduction to
 //     A(B[CD]|EF)|BC[XY]
 //
 // The work is done in four passes over the list, each of which
 // rewrites sub in place (out always aliases the front of sub):
 // literal prefixes, leading sub-expressions, single-character
 // alternatives, and finally adjacent empty matches.
 // A list with fewer than two entries is returned unchanged.
 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
 	if len(sub) < 2 {
 		return sub
 	}
 	// Round 1: Factor out common literal prefixes.
 	var str []int
 	var strflags Flags
 	start := 0
 	out := sub[:0]
 	// The loop deliberately runs one step past the end (i == len(sub))
 	// so that the final run is flushed; on that step istr is nil and
 	// iflags is zero.
 	for i := 0; i <= len(sub); i++ {
 		// Invariant: the Regexps that were in sub[0:start] have been
 		// used or marked for reuse, and the slice space has been reused
 		// for out (len(out) <= start).
 		//
 		// Invariant: sub[start:i] consists of regexps that all begin
 		// with str as modified by strflags.
 		var istr []int
 		var iflags Flags
 		if i < len(sub) {
 			istr, iflags = p.leadingString(sub[i])
 			if iflags == strflags {
 				same := 0
 				for same < len(str) && same < len(istr) && str[same] == istr[same] {
 					same++
 				}
 				if same > 0 {
 					// Matches at least one rune in current range.
 					// Keep going around, narrowing the shared prefix
 					// to the part sub[i] also has.
 					str = str[:same]
 					continue
 				}
 			}
 		}
 		// Found end of a run with common leading literal string:
 		// sub[start:i] all begin with str[0:len(str)], but sub[i]
 		// does not even begin with str[0].
 		//
 		// Factor out common string and append factored expression to out.
 		if i == start {
 			// Nothing to do - run of length 0.
 		} else if i == start+1 {
 			// Just one: don't bother factoring.
 			out = append(out, sub[start])
 		} else {
 			// Construct factored form: prefix(suffix1|suffix2|...)
 			prefix := p.newRegexp(OpLiteral)
 			prefix.Flags = strflags
 			// Copy str: it aliases the Rune storage of a regexp that
 			// removeLeadingString is about to modify.
 			prefix.Rune = append(prefix.Rune[:0], str...)
 			for j := start; j < i; j++ {
 				sub[j] = p.removeLeadingString(sub[j], len(str))
 			}
 			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 			re := p.newRegexp(OpConcat)
 			re.Sub = append(re.Sub[:0], prefix, suffix)
 			out = append(out, re)
 		}
 		// Prepare for next iteration.
 		start = i
 		str = istr
 		strflags = iflags
 	}
 	sub = out
 	// Round 2: Factor out common complex prefixes,
 	// just the first piece of each concatenation,
 	// whatever it is.  This is good enough a lot of the time.
 	start = 0
 	out = sub[:0]
 	var first *Regexp
 	for i := 0; i <= len(sub); i++ {
 		// Invariant: the Regexps that were in sub[0:start] have been
 		// used or marked for reuse, and the slice space has been reused
 		// for out (len(out) <= start).
 		//
 		// Invariant: sub[start:i] consists of regexps that all begin
 		// with a leading regexp Equal to first.
 		var ifirst *Regexp
 		if i < len(sub) {
 			ifirst = p.leadingRegexp(sub[i])
 			// A nil first (empty match, or nothing seen yet) never
 			// extends a run.
 			if first != nil && first.Equal(ifirst) {
 				continue
 			}
 		}
 		// Found end of a run with common leading regexp:
 		// sub[start:i] all begin with first but sub[i] does not.
 		//
 		// Factor out common regexp and append factored expression to out.
 		if i == start {
 			// Nothing to do - run of length 0.
 		} else if i == start+1 {
 			// Just one: don't bother factoring.
 			out = append(out, sub[start])
 		} else {
 			// Construct factored form: prefix(suffix1|suffix2|...)
 			prefix := first
 			for j := start; j < i; j++ {
 				reuse := j != start // prefix came from sub[start], so that copy must stay live
 				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
 			}
 			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 			re := p.newRegexp(OpConcat)
 			re.Sub = append(re.Sub[:0], prefix, suffix)
 			out = append(out, re)
 		}
 		// Prepare for next iteration.
 		start = i
 		first = ifirst
 	}
 	sub = out
 	// Round 3: Collapse runs of single literals into character classes.
 	start = 0
 	out = sub[:0]
 	for i := 0; i <= len(sub); i++ {
 		// Invariant: the Regexps that were in sub[0:start] have been
 		// used or marked for reuse, and the slice space has been reused
 		// for out (len(out) <= start).
 		//
 		// Invariant: sub[start:i] consists of regexps that are either
 		// literal runes or character classes.
 		if i < len(sub) && isCharClass(sub[i]) {
 			continue
 		}
 		// sub[i] is not a char or char class;
 		// emit char class for sub[start:i]...
 		if i == start {
 			// Nothing to do - run of length 0.
 		} else if i == start+1 {
 			out = append(out, sub[start])
 		} else {
 			// Make new char class.
 			// Start with most complex regexp in sub[start]:
 			// the one with the largest Op, ties broken by the longer
 			// Rune list, so that every merge below has dst.Op >= src.Op.
 			max := start
 			for j := start + 1; j < i; j++ {
 				if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
 					max = j
 				}
 			}
 			sub[start], sub[max] = sub[max], sub[start]
 			for j := start + 1; j < i; j++ {
 				mergeCharClass(sub[start], sub[j])
 				p.reuse(sub[j])
 			}
 			cleanAlt(sub[start])
 			out = append(out, sub[start])
 		}
 		// ... and then emit sub[i].
 		if i < len(sub) {
 			out = append(out, sub[i])
 		}
 		// sub[i] has been emitted on its own, so the next run
 		// starts after it.
 		start = i + 1
 	}
 	sub = out
 	// Round 4: Collapse runs of empty matches into a single empty match.
 	// Only the last element of each run of OpEmptyMatch entries is kept.
 	// NOTE(review): start is reset here but not read again in this round.
 	start = 0
 	out = sub[:0]
 	for i := range sub {
 		if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
 			continue
 		}
 		out = append(out, sub[i])
 	}
 	sub = out
 	return sub
 }
 // leadingString reports the literal rune string that re starts with,
 // together with the FoldCase bit of that literal's flags.
 // The returned slice aliases storage owned by re or by its first child.
 // If re does not start with a literal, it returns nil and zero flags.
 func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
 	head := re
 	if head.Op == OpConcat && len(head.Sub) > 0 {
 		// Only the first piece of a concatenation is considered.
 		head = head.Sub[0]
 	}
 	if head.Op == OpLiteral {
 		return head.Rune, head.Flags & FoldCase
 	}
 	return nil, 0
 }
 // removeLeadingString strips the first n runes from the literal
 // that re begins with and returns the regexp to use in place of re.
 // A literal left with no runes becomes an empty match; a concatenation
 // whose first piece becomes an empty match drops that piece and may
 // shrink to its single remaining child.
 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
 	if re.Op == OpLiteral {
 		// Shift the surviving runes to the front of the same storage.
 		kept := copy(re.Rune, re.Rune[n:])
 		re.Rune = re.Rune[:kept]
 		if kept == 0 {
 			re.Op = OpEmptyMatch
 		}
 		return re
 	}
 	if re.Op != OpConcat || len(re.Sub) == 0 {
 		return re
 	}
 	// Removing a leading string in a concatenation
 	// might simplify the concatenation.
 	head := p.removeLeadingString(re.Sub[0], n)
 	re.Sub[0] = head
 	if head.Op != OpEmptyMatch {
 		return re
 	}
 	p.reuse(head)
 	switch len(re.Sub) {
 	case 0, 1:
 		// Impossible but handle.
 		re.Op = OpEmptyMatch
 		re.Sub = nil
 	case 2:
 		// Only one real piece is left: return it directly.
 		// Fetch it before handing the concat node to p.reuse.
 		tail := re.Sub[1]
 		p.reuse(re)
 		return tail
 	default:
 		remaining := copy(re.Sub, re.Sub[1:])
 		re.Sub = re.Sub[:remaining]
 	}
 	return re
 }
 // leadingRegexp reports the regexp that re starts with: the first
 // child of a non-empty concatenation, or re itself otherwise.
 // It returns nil when that leading piece is an empty match.
 // The result refers to storage in re or its children.
 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
 	switch {
 	case re.Op == OpEmptyMatch:
 		return nil
 	case re.Op == OpConcat && len(re.Sub) > 0:
 		if head := re.Sub[0]; head.Op != OpEmptyMatch {
 			return head
 		}
 		return nil
 	}
 	return re
 }
 // removeLeadingRegexp drops the leading regexp from re and returns
 // the regexp to use in place of re.
 // When reuse is true, the dropped first child of a concatenation is
 // handed to p.reuse; a concatenation node that shrinks to a single
 // child is itself handed to p.reuse.
 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
 	if re.Op != OpConcat || len(re.Sub) == 0 {
 		// re is its own leading regexp, so nothing but the
 		// empty match remains.
 		re.Op = OpEmptyMatch
 		return re
 	}
 	if reuse {
 		p.reuse(re.Sub[0])
 	}
 	remaining := copy(re.Sub, re.Sub[1:])
 	re.Sub = re.Sub[:remaining]
 	if remaining == 0 {
 		re.Op = OpEmptyMatch
 		re.Sub = nil
 		return re
 	}
 	if remaining == 1 {
 		// Fetch the survivor before releasing the concat node.
 		only := re.Sub[0]
 		p.reuse(re)
 		return only
 	}
 	return re
 }
 func literalRegexp(s string, flags Flags) *Regexp {
@ -752,6 +1038,36 @@ func (p *parser) parseVerticalBar() os.Error {
 	return nil
 }
 // mergeCharClass updates dst in place so that it matches dst|src.
 // The caller must ensure that dst.Op >= src.Op,
 // to reduce the amount of copying.
 func mergeCharClass(dst, src *Regexp) {
 	switch dst.Op {
 	case OpAnyChar:
 		// dst already matches everything src could add.
 	case OpAnyCharNotNL:
 		// The only thing src can contribute is \n.
 		if matchRune(src, '\n') {
 			dst.Op = OpAnyChar
 		}
 	case OpCharClass:
 		// src is simpler, so either literal or char class
 		if src.Op != OpLiteral {
 			dst.Rune = appendClass(dst.Rune, src.Rune)
 			break
 		}
 		dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
 	case OpLiteral:
 		// both literal
 		if src.Rune[0] != dst.Rune[0] {
 			// Turn dst into a class: duplicate its rune to form the
 			// range [r, r], then add src's rune as a second range.
 			dst.Op = OpCharClass
 			dst.Rune = append(dst.Rune, dst.Rune[0])
 			dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
 		}
 	}
 }
 // If the top of the stack is an element followed by an opVerticalBar,
 // swapVerticalBar swaps the two and returns true.
 // Otherwise it returns false.
@ -767,30 +1083,7 @@ func (p *parser) swapVerticalBar() bool {
 			re1, re3 = re3, re1
 			p.stack[n-3] = re3
 		}
-		switch re3.Op {
+		mergeCharClass(re3, re1)
 		case OpAnyChar:
 			// re1 doesn't add anything.
 		case OpAnyCharNotNL:
 			// re1 might add \n
 			if matchRune(re1, '\n') {
 				re3.Op = OpAnyChar
 			}
 		case OpCharClass:
 			// re1 is simpler, so either literal or char class
 			if re1.Op == OpLiteral {
 				re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
 			} else {
 				re3.Rune = appendClass(re3.Rune, re1.Rune)
 			}
 		case OpLiteral:
 			// both literal
 			if re1.Rune[0] == re3.Rune[0] {
 				break
 			}
 			re3.Op = OpCharClass
 			re3.Rune = append(re3.Rune, re3.Rune[0])
 			re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
 		}
 		p.reuse(re1)
 		p.stack = p.stack[:n-1]
 		return true
@ -1432,10 +1725,11 @@ func negateClass(r []int) []int {
 		}
 		nextLo = hi + 1
 	}
 	r = r[:w]
 	if nextLo <= unicode.MaxRune {
 		// It's possible for the negation to have one more
 		// range - this one - than the original class, so use append.
-		r = append(r[:w], nextLo, unicode.MaxRune)
+		r = append(r, nextLo, unicode.MaxRune)
 	}
 	return r
 }
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@ -39,8 +39,7 @@ var parseTests = []struct {
 	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
 	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
 	{``, `emp{}`},
-	//	{ `|`, `emp{}` },  // alt{emp{}emp{}} but got factored
+	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
 	{`|`, `alt{emp{}emp{}}`},
 	{`|x|`, `alt{emp{}lit{x}emp{}}`},
 	{`.`, `dot{}`},
 	{`^`, `bol{}`},
@ -64,6 +63,9 @@ var parseTests = []struct {
 	{`\-`, `lit{-}`},
 	{`-`, `lit{-}`},
 	{`\_`, `lit{_}`},
 	{`abc`, `str{abc}`},
 	{`abc|def`, `alt{str{abc}str{def}}`},
 	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
 	// Posix and Perl extensions
 	{`[[:lower:]]`, `cc{0x61-0x7a}`},
@ -156,6 +158,10 @@ var parseTests = []struct {
 	// Strings
 	{`abcde`, `str{abcde}`},
 	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
 	// Factoring.
 	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
 	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
 }
 const testFlags = MatchNL | PerlX | UnicodeGroups
--- a/src/pkg/exp/regexp/syntax/regexp.go
+++ b/src/pkg/exp/regexp/syntax/regexp.go
@ -60,6 +60,59 @@ const (
 const opPseudo Op = 128 // where pseudo-ops start
 // Equal reports whether x and y have identical structure.
 // Two nil regexps are equal; a nil and a non-nil regexp are not.
 // Ops not listed in the switch compare equal on Op alone.
 func (x *Regexp) Equal(y *Regexp) bool {
 	if x == nil || y == nil {
 		return x == y
 	}
 	if x.Op != y.Op {
 		return false
 	}
 	switch x.Op {
 	case OpEndText:
 		// The parse flags remember whether this is \z or \Z.
 		return x.Flags&WasDollar == y.Flags&WasDollar
 	case OpLiteral, OpCharClass:
 		if len(x.Rune) != len(y.Rune) {
 			return false
 		}
 		for i := range x.Rune {
 			if x.Rune[i] != y.Rune[i] {
 				return false
 			}
 		}
 	case OpAlternate, OpConcat:
 		if len(x.Sub) != len(y.Sub) {
 			return false
 		}
 		for i := range x.Sub {
 			if !x.Sub[i].Equal(y.Sub[i]) {
 				return false
 			}
 		}
 	case OpStar, OpPlus, OpQuest:
 		return x.Flags&NonGreedy == y.Flags&NonGreedy && x.Sub[0].Equal(y.Sub[0])
 	case OpRepeat:
 		return x.Flags&NonGreedy == y.Flags&NonGreedy && x.Min == y.Min && x.Max == y.Max && x.Sub[0].Equal(y.Sub[0])
 	case OpCapture:
 		return x.Cap == y.Cap && x.Name == y.Name && x.Sub[0].Equal(y.Sub[0])
 	}
 	return true
 }
 // writeRegexp writes the Perl syntax for the regular expression re to b.
 func writeRegexp(b *bytes.Buffer, re *Regexp) {
 	switch re.Op {
@ -70,16 +123,24 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
 	case OpEmptyMatch:
 		b.WriteString(`(?:)`)
 	case OpLiteral:
 		if re.Flags&FoldCase != 0 {
 			b.WriteString(`(?i:`)
 		}
 		for _, r := range re.Rune {
 			escape(b, r, false)
 		}
 		if re.Flags&FoldCase != 0 {
 			b.WriteString(`)`)
 		}
 	case OpCharClass:
 		if len(re.Rune)%2 != 0 {
 			b.WriteString(`[invalid char class]`)
 			break
 		}
 		b.WriteRune('[')
-		if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
+		if len(re.Rune) == 0 {
 			b.WriteString(`^\x00-\x{10FFFF}`)
 		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
 			// Contains 0 and MaxRune.  Probably a negated class.
 			// Print the gaps.
 			b.WriteRune('^')
@ -126,7 +187,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
 		} else {
 			b.WriteRune('(')
 		}
-		writeRegexp(b, re.Sub[0])
+		if re.Sub[0].Op != OpEmptyMatch {
 			writeRegexp(b, re.Sub[0])
 		}
 		b.WriteRune(')')
 	case OpStar, OpPlus, OpQuest, OpRepeat:
 		if sub := re.Sub[0]; sub.Op > OpCapture {
@ -205,6 +268,15 @@ func escape(b *bytes.Buffer, r int, force bool) {
 	case '\v':
 		b.WriteString(`\v`)
 	default:
 		if r < 0x100 {
 			b.WriteString(`\x`)
 			s := strconv.Itob(r, 16)
 			if len(s) == 1 {
 				b.WriteRune('0')
 			}
 			b.WriteString(s)
 			break
 		}
 		b.WriteString(`\x{`)
 		b.WriteString(strconv.Itob(r, 16))
 		b.WriteString(`}`)
--- a/src/pkg/exp/regexp/syntax/simplify.go
+++ b/src/pkg/exp/regexp/syntax/simplify.go
@ -0,0 +1,151 @@
 // Copyright 2011 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package syntax
 // Simplify returns a regexp equivalent to re but without counted repetitions
 // and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
 // The resulting regexp will execute correctly but its string representation
 // will not produce the same parse tree, because capturing parentheses
 // may have been duplicated or removed.  For example, the simplified form
 // for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
 // The returned regexp may share structure with or be the original.
 // A nil receiver yields nil.
 func (re *Regexp) Simplify() *Regexp {
 	if re == nil {
 		return nil
 	}
 	switch re.Op {
 	case OpCapture, OpConcat, OpAlternate:
 		// Simplify children, building new Regexp if children change.
 		// nre stays identical to re until the first child that
 		// simplifies to something different; only then is a copy made.
 		nre := re
 		for i, sub := range re.Sub {
 			nsub := sub.Simplify()
 			if nre == re && nsub != sub {
 				// Start a copy, seeded with the unchanged children
 				// seen so far (re.Sub[:i]).
 				nre = new(Regexp)
 				*nre = *re
 				nre.Rune = nil
 				nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
 			}
 			if nre != re {
 				nre.Sub = append(nre.Sub, nsub)
 			}
 		}
 		return nre
 	case OpStar, OpPlus, OpQuest:
 		// Passing re lets simplify1 hand back re itself when nothing changed.
 		sub := re.Sub[0].Simplify()
 		return simplify1(re.Op, re.Flags, sub, re)
 	case OpRepeat:
 		// Special special case: x{0} matches the empty string
 		// and doesn't even need to consider x.
 		if re.Min == 0 && re.Max == 0 {
 			return &Regexp{Op: OpEmptyMatch}
 		}
 		// The fun begins.
 		sub := re.Sub[0].Simplify()
 		// x{n,} means at least n matches of x.
 		// A Max of -1 encodes the missing upper bound.
 		if re.Max == -1 {
 			// Special case: x{0,} is x*.
 			if re.Min == 0 {
 				return simplify1(OpStar, re.Flags, sub, nil)
 			}
 			// Special case: x{1,} is x+.
 			if re.Min == 1 {
 				return simplify1(OpPlus, re.Flags, sub, nil)
 			}
 			// General case: x{4,} is xxxx+,
 			// that is, Min-1 copies of x followed by x+.
 			nre := &Regexp{Op: OpConcat}
 			nre.Sub = nre.Sub0[:0]
 			for i := 0; i < re.Min-1; i++ {
 				nre.Sub = append(nre.Sub, sub)
 			}
 			nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
 			return nre
 		}
 		// Special case x{0} handled above.
 		// Special case: x{1} is just x.
 		if re.Min == 1 && re.Max == 1 {
 			return sub
 		}
 		// General case: x{n,m} means n copies of x and m-n copies of x?
 		// The machine will do less work if we nest the final m-n copies,
 		// so that x{2,5} = xx(x(x(x)?)?)?
 		// Build leading prefix: xx.
 		var prefix *Regexp
 		if re.Min > 0 {
 			prefix = &Regexp{Op: OpConcat}
 			prefix.Sub = prefix.Sub0[:0]
 			for i := 0; i < re.Min; i++ {
 				prefix.Sub = append(prefix.Sub, sub)
 			}
 		}
 		// Build and attach suffix: (x(x(x)?)?)?
 		if re.Max > re.Min {
 			// Start with the innermost x? and wrap it Max-Min-1 more times.
 			suffix := simplify1(OpQuest, re.Flags, sub, nil)
 			for i := re.Min + 1; i < re.Max; i++ {
 				nre2 := &Regexp{Op: OpConcat}
 				nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
 				suffix = simplify1(OpQuest, re.Flags, nre2, nil)
 			}
 			if prefix == nil {
 				return suffix
 			}
 			prefix.Sub = append(prefix.Sub, suffix)
 		}
 		if prefix != nil {
 			return prefix
 		}
 		// Some degenerate case like min > max or min < max < 0.
 		// Handle as impossible match.
 		// This point is reached only when no prefix was built
 		// (re.Min <= 0) and re.Max <= re.Min.
 		return &Regexp{Op: OpNoMatch}
 	}
 	// Every other Op is returned unchanged.
 	return re
 }
 // simplify1 is the Simplify step for the unary operators OpStar,
 // OpPlus, and OpQuest.  Given an already-simple sub, it returns the
 // simple regexp equivalent to
 //
 //	Regexp{Op: op, Flags: flags, Sub: {sub}}
 //
 // without allocating that structure unless it has to.  When the
 // answer is equivalent to the non-nil re passed in, re itself is
 // returned.
 //
 // It lives outside Simplify because the expansions of other operators
 // build these unary expressions too, and routing them through
 // simplify1 keeps what they generate simple.
 func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
 	greedy := flags & NonGreedy
 	switch {
 	case sub.Op == OpEmptyMatch:
 		// Special case: repeat the empty string as much as
 		// you want, but it's still the empty string.
 		return sub
 	case sub.Op == op && sub.Flags&NonGreedy == greedy:
 		// The operators are idempotent if the flags match.
 		return sub
 	case re != nil && re.Op == op && re.Flags&NonGreedy == greedy && re.Sub[0] == sub:
 		// Nothing changed relative to the caller's original node.
 		return re
 	}
 	fresh := &Regexp{Op: op, Flags: flags}
 	fresh.Sub = append(fresh.Sub0[:0], sub)
 	return fresh
 }
--- a/src/pkg/exp/regexp/syntax/simplify_test.go
+++ b/src/pkg/exp/regexp/syntax/simplify_test.go
@ -0,0 +1,151 @@
 // Copyright 2011 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package syntax
 import "testing"
 // simplifyTests is the table driving TestSimplify.  Each entry pairs
 // an input pattern (Regexp) with the string form expected after the
 // pattern is parsed and simplified (Simple).
 var simplifyTests = []struct {
 	Regexp string
 	Simple string
 }{
 	// Already-simple constructs
 	{`a`, `a`},
 	{`ab`, `ab`},
 	{`a|b`, `[a-b]`},
 	{`ab|cd`, `ab|cd`},
 	{`(ab)*`, `(ab)*`},
 	{`(ab)+`, `(ab)+`},
 	{`(ab)?`, `(ab)?`},
 	{`.`, `.`},
 	{`^`, `^`},
 	{`$`, `$`},
 	{`[ac]`, `[ac]`},
 	{`[^ac]`, `[^ac]`},
 	// Posix character classes
 	{`[[:alnum:]]`, `[0-9A-Za-z]`},
 	{`[[:alpha:]]`, `[A-Za-z]`},
 	{`[[:blank:]]`, `[\t ]`},
 	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
 	{`[[:digit:]]`, `[0-9]`},
 	{`[[:graph:]]`, `[!-~]`},
 	{`[[:lower:]]`, `[a-z]`},
 	{`[[:print:]]`, `[ -~]`},
 	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
 	{`[[:space:]]`, `[\t-\r ]`},
 	{`[[:upper:]]`, `[A-Z]`},
 	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
 	// Perl character classes
 	{`\d`, `[0-9]`},
 	{`\s`, `[\t-\n\f-\r ]`},
 	{`\w`, `[0-9A-Z_a-z]`},
 	{`\D`, `[^0-9]`},
 	{`\S`, `[^\t-\n\f-\r ]`},
 	{`\W`, `[^0-9A-Z_a-z]`},
 	{`[\d]`, `[0-9]`},
 	{`[\s]`, `[\t-\n\f-\r ]`},
 	{`[\w]`, `[0-9A-Z_a-z]`},
 	{`[\D]`, `[^0-9]`},
 	{`[\S]`, `[^\t-\n\f-\r ]`},
 	{`[\W]`, `[^0-9A-Z_a-z]`},
 	// Posix repetitions
 	{`a{1}`, `a`},
 	{`a{2}`, `aa`},
 	{`a{5}`, `aaaaa`},
 	{`a{0,1}`, `a?`},
 	// The next three are illegible because Simplify inserts (?:)
 	// parens instead of () parens to avoid creating extra
 	// captured subexpressions.  The comments show a version with fewer parens.
 	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
 	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
 	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
 	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
 	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
 	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
 	{`a{0,}`, `a*`},
 	{`a{1,}`, `a+`},
 	{`a{2,}`, `aa+`},
 	{`a{5,}`, `aaaaa+`},
 	// Test that operators simplify their arguments.
 	{`(?:a{1,}){1,}`, `a+`},
 	{`(a{1,}b{1,})`, `(a+b+)`},
 	{`a{1,}|b{1,}`, `a+|b+`},
 	{`(?:a{1,})*`, `(?:a+)*`},
 	{`(?:a{1,})+`, `a+`},
 	{`(?:a{1,})?`, `(?:a+)?`},
 	{``, `(?:)`},
 	{`a{0}`, `(?:)`},
 	// Character class simplification
 	{`[ab]`, `[a-b]`},
 	{`[a-za-za-z]`, `[a-z]`},
 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
 	{`[ABCDEFGH]`, `[A-H]`},
 	{`[AB-CD-EF-GH]`, `[A-H]`},
 	{`[W-ZP-XE-R]`, `[E-Z]`},
 	{`[a-ee-gg-m]`, `[a-m]`},
 	{`[a-ea-ha-m]`, `[a-m]`},
 	{`[a-ma-ha-e]`, `[a-m]`},
 	{`[a-zA-Z0-9 -~]`, `[ -~]`},
 	// Empty character classes
 	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
 	// Full character classes
 	{`[[:cntrl:][:^cntrl:]]`, `.`},
 	// Unicode case folding.
 	{`(?i)A`, `(?i:A)`},
 	{`(?i)a`, `(?i:a)`},
 	{`(?i)[A]`, `(?i:A)`},
 	{`(?i)[a]`, `(?i:A)`},
 	{`(?i)K`, `(?i:K)`},
 	{`(?i)k`, `(?i:k)`},
 	{`(?i)\x{212a}`, "(?i:\u212A)"},
 	{`(?i)[K]`, "[Kk\u212A]"},
 	{`(?i)[k]`, "[Kk\u212A]"},
 	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
 	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
 	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
 	{`(?i)[\x00-\x{10FFFF}]`, `.`},
 	// Empty string as a regular expression.
 	// The empty string must be preserved inside parens in order
 	// to make submatches work right, so these tests are less
 	// interesting than they might otherwise be.  String inserts
 	// explicit (?:) in place of non-parenthesized empty strings,
 	// to make them easier to spot for other parsers.
 	{`(a|b|)`, `([a-b]|(?:))`},
 	{`(|)`, `()`},
 	{`a()`, `a()`},
 	{`(()|())`, `(()|())`},
 	{`(a|)`, `(a|(?:))`},
 	{`ab()cd()`, `ab()cd()`},
 	{`()`, `()`},
 	{`()*`, `()*`},
 	{`()+`, `()+`},
 	{`()?`, `()?`},
 	{`(){0}`, `(?:)`},
 	{`(){1}`, `()`},
 	{`(){1,}`, `()+`},
 	{`(){0,2}`, `(?:()()?)?`},
 }
 // TestSimplify parses every pattern in simplifyTests, simplifies it,
 // and checks the string form of the result against the expected text.
 // A pattern that fails to parse is reported and skipped.
 func TestSimplify(t *testing.T) {
 	for _, tc := range simplifyTests {
 		parsed, err := Parse(tc.Regexp, MatchNL|Perl&^OneLine)
 		if err != nil {
 			t.Errorf("Parse(%#q) = error %v", tc.Regexp, err)
 			continue
 		}
 		if got := parsed.Simplify().String(); got != tc.Simple {
 			t.Errorf("Simplify(%#q) = %#q, want %#q", tc.Regexp, got, tc.Simple)
 		}
 	}
 }