mirror of
https://github.com/golang/go
synced 2024-11-24 23:57:57 -07:00
exp/regexp/syntax: finish Regexp manipulation
Except for the inevitable bug fixes, the Regexp code is done. R=sam.thorogood, r CC=golang-dev https://golang.org/cl/4635082
This commit is contained in:
parent
a809abafa5
commit
b4cae4aee2
@ -106,8 +106,6 @@ func (p *parser) reuse(re *Regexp) {
|
|||||||
|
|
||||||
// push pushes the regexp re onto the parse stack and returns the regexp.
|
// push pushes the regexp re onto the parse stack and returns the regexp.
|
||||||
func (p *parser) push(re *Regexp) *Regexp {
|
func (p *parser) push(re *Regexp) *Regexp {
|
||||||
// TODO: compute simple
|
|
||||||
|
|
||||||
if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
|
if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
|
||||||
// Single rune.
|
// Single rune.
|
||||||
if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
|
if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
|
||||||
@ -250,7 +248,7 @@ func (p *parser) concat() *Regexp {
|
|||||||
return p.push(p.newRegexp(OpEmptyMatch))
|
return p.push(p.newRegexp(OpEmptyMatch))
|
||||||
}
|
}
|
||||||
|
|
||||||
return p.collapse(subs, OpConcat)
|
return p.push(p.collapse(subs, OpConcat))
|
||||||
}
|
}
|
||||||
|
|
||||||
// alternate replaces the top of the stack (above the topmost '(') with its alternation.
|
// alternate replaces the top of the stack (above the topmost '(') with its alternation.
|
||||||
@ -276,7 +274,7 @@ func (p *parser) alternate() *Regexp {
|
|||||||
return p.push(p.newRegexp(OpNoMatch))
|
return p.push(p.newRegexp(OpNoMatch))
|
||||||
}
|
}
|
||||||
|
|
||||||
return p.collapse(subs, OpAlternate)
|
return p.push(p.collapse(subs, OpAlternate))
|
||||||
}
|
}
|
||||||
|
|
||||||
// cleanAlt cleans re for eventual inclusion in an alternation.
|
// cleanAlt cleans re for eventual inclusion in an alternation.
|
||||||
@ -302,13 +300,13 @@ func cleanAlt(re *Regexp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// collapse pushes the result of applying op to sub
|
// collapse returns the result of applying op to sub.
|
||||||
// onto the stack. If sub contains op nodes, they all
|
// If sub contains op nodes, they all get hoisted up
|
||||||
// get flattened into a single node.
|
// so that there is never a concat of a concat or an
|
||||||
// sub points into p.stack so it cannot be kept.
|
// alternate of an alternate.
|
||||||
func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
|
func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
|
||||||
if len(subs) == 1 {
|
if len(subs) == 1 {
|
||||||
return p.push(subs[0])
|
return subs[0]
|
||||||
}
|
}
|
||||||
re := p.newRegexp(op)
|
re := p.newRegexp(op)
|
||||||
re.Sub = re.Sub0[:0]
|
re.Sub = re.Sub0[:0]
|
||||||
@ -320,7 +318,295 @@ func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
|
|||||||
re.Sub = append(re.Sub, sub)
|
re.Sub = append(re.Sub, sub)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return p.push(re)
|
if op == OpAlternate {
|
||||||
|
re.Sub = p.factor(re.Sub, re.Flags)
|
||||||
|
if len(re.Sub) == 1 {
|
||||||
|
old := re
|
||||||
|
re = re.Sub[0]
|
||||||
|
p.reuse(old)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
// factor factors common prefixes from the alternation list sub.
|
||||||
|
// It returns a replacement list that reuses the same storage and
|
||||||
|
// frees (passes to p.reuse) any removed *Regexps.
|
||||||
|
//
|
||||||
|
// For example,
|
||||||
|
// ABC|ABD|AEF|BCX|BCY
|
||||||
|
// simplifies by literal prefix extraction to
|
||||||
|
// A(B(C|D)|EF)|BC(X|Y)
|
||||||
|
// which simplifies by character class introduction to
|
||||||
|
// A(B[CD]|EF)|BC[XY]
|
||||||
|
//
|
||||||
|
func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
|
||||||
|
if len(sub) < 2 {
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
|
||||||
|
// Round 1: Factor out common literal prefixes.
|
||||||
|
var str []int
|
||||||
|
var strflags Flags
|
||||||
|
start := 0
|
||||||
|
out := sub[:0]
|
||||||
|
for i := 0; i <= len(sub); i++ {
|
||||||
|
// Invariant: the Regexps that were in sub[0:start] have been
|
||||||
|
// used or marked for reuse, and the slice space has been reused
|
||||||
|
// for out (len(out) <= start).
|
||||||
|
//
|
||||||
|
// Invariant: sub[start:i] consists of regexps that all begin
|
||||||
|
// with str as modified by strflags.
|
||||||
|
var istr []int
|
||||||
|
var iflags Flags
|
||||||
|
if i < len(sub) {
|
||||||
|
istr, iflags = p.leadingString(sub[i])
|
||||||
|
if iflags == strflags {
|
||||||
|
same := 0
|
||||||
|
for same < len(str) && same < len(istr) && str[same] == istr[same] {
|
||||||
|
same++
|
||||||
|
}
|
||||||
|
if same > 0 {
|
||||||
|
// Matches at least one rune in current range.
|
||||||
|
// Keep going around.
|
||||||
|
str = str[:same]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found end of a run with common leading literal string:
|
||||||
|
// sub[start:i] all begin with str[0:len(str)], but sub[i]
|
||||||
|
// does not even begin with str[0].
|
||||||
|
//
|
||||||
|
// Factor out common string and append factored expression to out.
|
||||||
|
if i == start {
|
||||||
|
// Nothing to do - run of length 0.
|
||||||
|
} else if i == start+1 {
|
||||||
|
// Just one: don't bother factoring.
|
||||||
|
out = append(out, sub[start])
|
||||||
|
} else {
|
||||||
|
// Construct factored form: prefix(suffix1|suffix2|...)
|
||||||
|
prefix := p.newRegexp(OpLiteral)
|
||||||
|
prefix.Flags = strflags
|
||||||
|
prefix.Rune = append(prefix.Rune[:0], str...)
|
||||||
|
|
||||||
|
for j := start; j < i; j++ {
|
||||||
|
sub[j] = p.removeLeadingString(sub[j], len(str))
|
||||||
|
}
|
||||||
|
suffix := p.collapse(sub[start:i], OpAlternate) // recurse
|
||||||
|
|
||||||
|
re := p.newRegexp(OpConcat)
|
||||||
|
re.Sub = append(re.Sub[:0], prefix, suffix)
|
||||||
|
out = append(out, re)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare for next iteration.
|
||||||
|
start = i
|
||||||
|
str = istr
|
||||||
|
strflags = iflags
|
||||||
|
}
|
||||||
|
sub = out
|
||||||
|
|
||||||
|
// Round 2: Factor out common complex prefixes,
|
||||||
|
// just the first piece of each concatenation,
|
||||||
|
// whatever it is. This is good enough a lot of the time.
|
||||||
|
start = 0
|
||||||
|
out = sub[:0]
|
||||||
|
var first *Regexp
|
||||||
|
for i := 0; i <= len(sub); i++ {
|
||||||
|
// Invariant: the Regexps that were in sub[0:start] have been
|
||||||
|
// used or marked for reuse, and the slice space has been reused
|
||||||
|
// for out (len(out) <= start).
|
||||||
|
//
|
||||||
|
// Invariant: sub[start:i] consists of regexps that all begin
|
||||||
|
// with str as modified by strflags.
|
||||||
|
var ifirst *Regexp
|
||||||
|
if i < len(sub) {
|
||||||
|
ifirst = p.leadingRegexp(sub[i])
|
||||||
|
if first != nil && first.Equal(ifirst) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found end of a run with common leading regexp:
|
||||||
|
// sub[start:i] all begin with first but sub[i] does not.
|
||||||
|
//
|
||||||
|
// Factor out common regexp and append factored expression to out.
|
||||||
|
if i == start {
|
||||||
|
// Nothing to do - run of length 0.
|
||||||
|
} else if i == start+1 {
|
||||||
|
// Just one: don't bother factoring.
|
||||||
|
out = append(out, sub[start])
|
||||||
|
} else {
|
||||||
|
// Construct factored form: prefix(suffix1|suffix2|...)
|
||||||
|
prefix := first
|
||||||
|
|
||||||
|
for j := start; j < i; j++ {
|
||||||
|
reuse := j != start // prefix came from sub[start]
|
||||||
|
sub[j] = p.removeLeadingRegexp(sub[j], reuse)
|
||||||
|
}
|
||||||
|
suffix := p.collapse(sub[start:i], OpAlternate) // recurse
|
||||||
|
|
||||||
|
re := p.newRegexp(OpConcat)
|
||||||
|
re.Sub = append(re.Sub[:0], prefix, suffix)
|
||||||
|
out = append(out, re)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare for next iteration.
|
||||||
|
start = i
|
||||||
|
first = ifirst
|
||||||
|
}
|
||||||
|
sub = out
|
||||||
|
|
||||||
|
// Round 3: Collapse runs of single literals into character classes.
|
||||||
|
start = 0
|
||||||
|
out = sub[:0]
|
||||||
|
for i := 0; i <= len(sub); i++ {
|
||||||
|
// Invariant: the Regexps that were in sub[0:start] have been
|
||||||
|
// used or marked for reuse, and the slice space has been reused
|
||||||
|
// for out (len(out) <= start).
|
||||||
|
//
|
||||||
|
// Invariant: sub[start:i] consists of regexps that are either
|
||||||
|
// literal runes or character classes.
|
||||||
|
if i < len(sub) && isCharClass(sub[i]) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// sub[i] is not a char or char class;
|
||||||
|
// emit char class for sub[start:i]...
|
||||||
|
if i == start {
|
||||||
|
// Nothing to do - run of length 0.
|
||||||
|
} else if i == start+1 {
|
||||||
|
out = append(out, sub[start])
|
||||||
|
} else {
|
||||||
|
// Make new char class.
|
||||||
|
// Start with most complex regexp in sub[start].
|
||||||
|
max := start
|
||||||
|
for j := start + 1; j < i; j++ {
|
||||||
|
if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
|
||||||
|
max = j
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sub[start], sub[max] = sub[max], sub[start]
|
||||||
|
|
||||||
|
for j := start + 1; j < i; j++ {
|
||||||
|
mergeCharClass(sub[start], sub[j])
|
||||||
|
p.reuse(sub[j])
|
||||||
|
}
|
||||||
|
cleanAlt(sub[start])
|
||||||
|
out = append(out, sub[start])
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... and then emit sub[i].
|
||||||
|
if i < len(sub) {
|
||||||
|
out = append(out, sub[i])
|
||||||
|
}
|
||||||
|
start = i + 1
|
||||||
|
}
|
||||||
|
sub = out
|
||||||
|
|
||||||
|
// Round 4: Collapse runs of empty matches into a single empty match.
|
||||||
|
start = 0
|
||||||
|
out = sub[:0]
|
||||||
|
for i := range sub {
|
||||||
|
if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, sub[i])
|
||||||
|
}
|
||||||
|
sub = out
|
||||||
|
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
|
||||||
|
// leadingString returns the leading literal string that re begins with.
|
||||||
|
// The string refers to storage in re or its children.
|
||||||
|
func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
|
||||||
|
if re.Op == OpConcat && len(re.Sub) > 0 {
|
||||||
|
re = re.Sub[0]
|
||||||
|
}
|
||||||
|
if re.Op != OpLiteral {
|
||||||
|
return nil, 0
|
||||||
|
}
|
||||||
|
return re.Rune, re.Flags & FoldCase
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeLeadingString removes the first n leading runes
|
||||||
|
// from the beginning of re. It returns the replacement for re.
|
||||||
|
func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
|
||||||
|
if re.Op == OpConcat && len(re.Sub) > 0 {
|
||||||
|
// Removing a leading string in a concatenation
|
||||||
|
// might simplify the concatenation.
|
||||||
|
sub := re.Sub[0]
|
||||||
|
sub = p.removeLeadingString(sub, n)
|
||||||
|
re.Sub[0] = sub
|
||||||
|
if sub.Op == OpEmptyMatch {
|
||||||
|
p.reuse(sub)
|
||||||
|
switch len(re.Sub) {
|
||||||
|
case 0, 1:
|
||||||
|
// Impossible but handle.
|
||||||
|
re.Op = OpEmptyMatch
|
||||||
|
re.Sub = nil
|
||||||
|
case 2:
|
||||||
|
old := re
|
||||||
|
re = re.Sub[1]
|
||||||
|
p.reuse(old)
|
||||||
|
default:
|
||||||
|
copy(re.Sub, re.Sub[1:])
|
||||||
|
re.Sub = re.Sub[:len(re.Sub)-1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
if re.Op == OpLiteral {
|
||||||
|
re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
|
||||||
|
if len(re.Rune) == 0 {
|
||||||
|
re.Op = OpEmptyMatch
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
// leadingRegexp returns the leading regexp that re begins with.
|
||||||
|
// The regexp refers to storage in re or its children.
|
||||||
|
func (p *parser) leadingRegexp(re *Regexp) *Regexp {
|
||||||
|
if re.Op == OpEmptyMatch {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if re.Op == OpConcat && len(re.Sub) > 0 {
|
||||||
|
sub := re.Sub[0]
|
||||||
|
if sub.Op == OpEmptyMatch {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeLeadingRegexp removes the leading regexp in re.
|
||||||
|
// It returns the replacement for re.
|
||||||
|
// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
|
||||||
|
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
|
||||||
|
if re.Op == OpConcat && len(re.Sub) > 0 {
|
||||||
|
if reuse {
|
||||||
|
p.reuse(re.Sub[0])
|
||||||
|
}
|
||||||
|
re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
|
||||||
|
switch len(re.Sub) {
|
||||||
|
case 0:
|
||||||
|
re.Op = OpEmptyMatch
|
||||||
|
re.Sub = nil
|
||||||
|
case 1:
|
||||||
|
old := re
|
||||||
|
re = re.Sub[0]
|
||||||
|
p.reuse(old)
|
||||||
|
}
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
re.Op = OpEmptyMatch
|
||||||
|
return re
|
||||||
}
|
}
|
||||||
|
|
||||||
func literalRegexp(s string, flags Flags) *Regexp {
|
func literalRegexp(s string, flags Flags) *Regexp {
|
||||||
@ -752,6 +1038,36 @@ func (p *parser) parseVerticalBar() os.Error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mergeCharClass makes dst = dst|src.
|
||||||
|
// The caller must ensure that dst.Op >= src.Op,
|
||||||
|
// to reduce the amount of copying.
|
||||||
|
func mergeCharClass(dst, src *Regexp) {
|
||||||
|
switch dst.Op {
|
||||||
|
case OpAnyChar:
|
||||||
|
// src doesn't add anything.
|
||||||
|
case OpAnyCharNotNL:
|
||||||
|
// src might add \n
|
||||||
|
if matchRune(src, '\n') {
|
||||||
|
dst.Op = OpAnyChar
|
||||||
|
}
|
||||||
|
case OpCharClass:
|
||||||
|
// src is simpler, so either literal or char class
|
||||||
|
if src.Op == OpLiteral {
|
||||||
|
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
|
||||||
|
} else {
|
||||||
|
dst.Rune = appendClass(dst.Rune, src.Rune)
|
||||||
|
}
|
||||||
|
case OpLiteral:
|
||||||
|
// both literal
|
||||||
|
if src.Rune[0] == dst.Rune[0] {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst.Op = OpCharClass
|
||||||
|
dst.Rune = append(dst.Rune, dst.Rune[0])
|
||||||
|
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If the top of the stack is an element followed by an opVerticalBar
|
// If the top of the stack is an element followed by an opVerticalBar
|
||||||
// swapVerticalBar swaps the two and returns true.
|
// swapVerticalBar swaps the two and returns true.
|
||||||
// Otherwise it returns false.
|
// Otherwise it returns false.
|
||||||
@ -767,30 +1083,7 @@ func (p *parser) swapVerticalBar() bool {
|
|||||||
re1, re3 = re3, re1
|
re1, re3 = re3, re1
|
||||||
p.stack[n-3] = re3
|
p.stack[n-3] = re3
|
||||||
}
|
}
|
||||||
switch re3.Op {
|
mergeCharClass(re3, re1)
|
||||||
case OpAnyChar:
|
|
||||||
// re1 doesn't add anything.
|
|
||||||
case OpAnyCharNotNL:
|
|
||||||
// re1 might add \n
|
|
||||||
if matchRune(re1, '\n') {
|
|
||||||
re3.Op = OpAnyChar
|
|
||||||
}
|
|
||||||
case OpCharClass:
|
|
||||||
// re1 is simpler, so either literal or char class
|
|
||||||
if re1.Op == OpLiteral {
|
|
||||||
re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
|
|
||||||
} else {
|
|
||||||
re3.Rune = appendClass(re3.Rune, re1.Rune)
|
|
||||||
}
|
|
||||||
case OpLiteral:
|
|
||||||
// both literal
|
|
||||||
if re1.Rune[0] == re3.Rune[0] {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
re3.Op = OpCharClass
|
|
||||||
re3.Rune = append(re3.Rune, re3.Rune[0])
|
|
||||||
re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
|
|
||||||
}
|
|
||||||
p.reuse(re1)
|
p.reuse(re1)
|
||||||
p.stack = p.stack[:n-1]
|
p.stack = p.stack[:n-1]
|
||||||
return true
|
return true
|
||||||
@ -1432,10 +1725,11 @@ func negateClass(r []int) []int {
|
|||||||
}
|
}
|
||||||
nextLo = hi + 1
|
nextLo = hi + 1
|
||||||
}
|
}
|
||||||
|
r = r[:w]
|
||||||
if nextLo <= unicode.MaxRune {
|
if nextLo <= unicode.MaxRune {
|
||||||
// It's possible for the negation to have one more
|
// It's possible for the negation to have one more
|
||||||
// range - this one - than the original class, so use append.
|
// range - this one - than the original class, so use append.
|
||||||
r = append(r[:w], nextLo, unicode.MaxRune)
|
r = append(r, nextLo, unicode.MaxRune)
|
||||||
}
|
}
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
@ -39,8 +39,7 @@ var parseTests = []struct {
|
|||||||
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
|
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
|
||||||
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
|
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
|
||||||
{``, `emp{}`},
|
{``, `emp{}`},
|
||||||
// { `|`, `emp{}` }, // alt{emp{}emp{}} but got factored
|
{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
|
||||||
{`|`, `alt{emp{}emp{}}`},
|
|
||||||
{`|x|`, `alt{emp{}lit{x}emp{}}`},
|
{`|x|`, `alt{emp{}lit{x}emp{}}`},
|
||||||
{`.`, `dot{}`},
|
{`.`, `dot{}`},
|
||||||
{`^`, `bol{}`},
|
{`^`, `bol{}`},
|
||||||
@ -64,6 +63,9 @@ var parseTests = []struct {
|
|||||||
{`\-`, `lit{-}`},
|
{`\-`, `lit{-}`},
|
||||||
{`-`, `lit{-}`},
|
{`-`, `lit{-}`},
|
||||||
{`\_`, `lit{_}`},
|
{`\_`, `lit{_}`},
|
||||||
|
{`abc`, `str{abc}`},
|
||||||
|
{`abc|def`, `alt{str{abc}str{def}}`},
|
||||||
|
{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
|
||||||
|
|
||||||
// Posix and Perl extensions
|
// Posix and Perl extensions
|
||||||
{`[[:lower:]]`, `cc{0x61-0x7a}`},
|
{`[[:lower:]]`, `cc{0x61-0x7a}`},
|
||||||
@ -156,6 +158,10 @@ var parseTests = []struct {
|
|||||||
// Strings
|
// Strings
|
||||||
{`abcde`, `str{abcde}`},
|
{`abcde`, `str{abcde}`},
|
||||||
{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
|
{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
|
||||||
|
|
||||||
|
// Factoring.
|
||||||
|
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
|
||||||
|
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
|
||||||
}
|
}
|
||||||
|
|
||||||
const testFlags = MatchNL | PerlX | UnicodeGroups
|
const testFlags = MatchNL | PerlX | UnicodeGroups
|
||||||
|
@ -60,6 +60,59 @@ const (
|
|||||||
|
|
||||||
const opPseudo Op = 128 // where pseudo-ops start
|
const opPseudo Op = 128 // where pseudo-ops start
|
||||||
|
|
||||||
|
// Equal returns true if x and y have identical structure.
|
||||||
|
func (x *Regexp) Equal(y *Regexp) bool {
|
||||||
|
if x == nil || y == nil {
|
||||||
|
return x == y
|
||||||
|
}
|
||||||
|
if x.Op != y.Op {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
switch x.Op {
|
||||||
|
case OpEndText:
|
||||||
|
// The parse flags remember whether this is \z or \Z.
|
||||||
|
if x.Flags&WasDollar != y.Flags&WasDollar {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
case OpLiteral, OpCharClass:
|
||||||
|
if len(x.Rune) != len(y.Rune) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i, r := range x.Rune {
|
||||||
|
if r != y.Rune[i] {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case OpAlternate, OpConcat:
|
||||||
|
if len(x.Sub) != len(y.Sub) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i, sub := range x.Sub {
|
||||||
|
if !sub.Equal(y.Sub[i]) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case OpStar, OpPlus, OpQuest:
|
||||||
|
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
case OpRepeat:
|
||||||
|
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
case OpCapture:
|
||||||
|
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
||||||
func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
||||||
switch re.Op {
|
switch re.Op {
|
||||||
@ -70,16 +123,24 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
|||||||
case OpEmptyMatch:
|
case OpEmptyMatch:
|
||||||
b.WriteString(`(?:)`)
|
b.WriteString(`(?:)`)
|
||||||
case OpLiteral:
|
case OpLiteral:
|
||||||
|
if re.Flags&FoldCase != 0 {
|
||||||
|
b.WriteString(`(?i:`)
|
||||||
|
}
|
||||||
for _, r := range re.Rune {
|
for _, r := range re.Rune {
|
||||||
escape(b, r, false)
|
escape(b, r, false)
|
||||||
}
|
}
|
||||||
|
if re.Flags&FoldCase != 0 {
|
||||||
|
b.WriteString(`)`)
|
||||||
|
}
|
||||||
case OpCharClass:
|
case OpCharClass:
|
||||||
if len(re.Rune)%2 != 0 {
|
if len(re.Rune)%2 != 0 {
|
||||||
b.WriteString(`[invalid char class]`)
|
b.WriteString(`[invalid char class]`)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
b.WriteRune('[')
|
b.WriteRune('[')
|
||||||
if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
|
if len(re.Rune) == 0 {
|
||||||
|
b.WriteString(`^\x00-\x{10FFFF}`)
|
||||||
|
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
|
||||||
// Contains 0 and MaxRune. Probably a negated class.
|
// Contains 0 and MaxRune. Probably a negated class.
|
||||||
// Print the gaps.
|
// Print the gaps.
|
||||||
b.WriteRune('^')
|
b.WriteRune('^')
|
||||||
@ -126,7 +187,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
|||||||
} else {
|
} else {
|
||||||
b.WriteRune('(')
|
b.WriteRune('(')
|
||||||
}
|
}
|
||||||
writeRegexp(b, re.Sub[0])
|
if re.Sub[0].Op != OpEmptyMatch {
|
||||||
|
writeRegexp(b, re.Sub[0])
|
||||||
|
}
|
||||||
b.WriteRune(')')
|
b.WriteRune(')')
|
||||||
case OpStar, OpPlus, OpQuest, OpRepeat:
|
case OpStar, OpPlus, OpQuest, OpRepeat:
|
||||||
if sub := re.Sub[0]; sub.Op > OpCapture {
|
if sub := re.Sub[0]; sub.Op > OpCapture {
|
||||||
@ -205,6 +268,15 @@ func escape(b *bytes.Buffer, r int, force bool) {
|
|||||||
case '\v':
|
case '\v':
|
||||||
b.WriteString(`\v`)
|
b.WriteString(`\v`)
|
||||||
default:
|
default:
|
||||||
|
if r < 0x100 {
|
||||||
|
b.WriteString(`\x`)
|
||||||
|
s := strconv.Itob(r, 16)
|
||||||
|
if len(s) == 1 {
|
||||||
|
b.WriteRune('0')
|
||||||
|
}
|
||||||
|
b.WriteString(s)
|
||||||
|
break
|
||||||
|
}
|
||||||
b.WriteString(`\x{`)
|
b.WriteString(`\x{`)
|
||||||
b.WriteString(strconv.Itob(r, 16))
|
b.WriteString(strconv.Itob(r, 16))
|
||||||
b.WriteString(`}`)
|
b.WriteString(`}`)
|
||||||
|
151
src/pkg/exp/regexp/syntax/simplify.go
Normal file
151
src/pkg/exp/regexp/syntax/simplify.go
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package syntax
|
||||||
|
|
||||||
|
// Simplify returns a regexp equivalent to re but without counted repetitions
|
||||||
|
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
|
||||||
|
// The resulting regexp will execute correctly but its string representation
|
||||||
|
// will not produce the same parse tree, because capturing parentheses
|
||||||
|
// may have been duplicated or removed. For example, the simplified form
|
||||||
|
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
|
||||||
|
// The returned regexp may share structure with or be the original.
|
||||||
|
func (re *Regexp) Simplify() *Regexp {
|
||||||
|
if re == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch re.Op {
|
||||||
|
case OpCapture, OpConcat, OpAlternate:
|
||||||
|
// Simplify children, building new Regexp if children change.
|
||||||
|
nre := re
|
||||||
|
for i, sub := range re.Sub {
|
||||||
|
nsub := sub.Simplify()
|
||||||
|
if nre == re && nsub != sub {
|
||||||
|
// Start a copy.
|
||||||
|
nre = new(Regexp)
|
||||||
|
*nre = *re
|
||||||
|
nre.Rune = nil
|
||||||
|
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
|
||||||
|
}
|
||||||
|
if nre != re {
|
||||||
|
nre.Sub = append(nre.Sub, nsub)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nre
|
||||||
|
|
||||||
|
case OpStar, OpPlus, OpQuest:
|
||||||
|
sub := re.Sub[0].Simplify()
|
||||||
|
return simplify1(re.Op, re.Flags, sub, re)
|
||||||
|
|
||||||
|
case OpRepeat:
|
||||||
|
// Special special case: x{0} matches the empty string
|
||||||
|
// and doesn't even need to consider x.
|
||||||
|
if re.Min == 0 && re.Max == 0 {
|
||||||
|
return &Regexp{Op: OpEmptyMatch}
|
||||||
|
}
|
||||||
|
|
||||||
|
// The fun begins.
|
||||||
|
sub := re.Sub[0].Simplify()
|
||||||
|
|
||||||
|
// x{n,} means at least n matches of x.
|
||||||
|
if re.Max == -1 {
|
||||||
|
// Special case: x{0,} is x*.
|
||||||
|
if re.Min == 0 {
|
||||||
|
return simplify1(OpStar, re.Flags, sub, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Special case: x{1,} is x+.
|
||||||
|
if re.Min == 1 {
|
||||||
|
return simplify1(OpPlus, re.Flags, sub, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// General case: x{4,} is xxxx+.
|
||||||
|
nre := &Regexp{Op: OpConcat}
|
||||||
|
nre.Sub = nre.Sub0[:0]
|
||||||
|
for i := 0; i < re.Min-1; i++ {
|
||||||
|
nre.Sub = append(nre.Sub, sub)
|
||||||
|
}
|
||||||
|
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
|
||||||
|
return nre
|
||||||
|
}
|
||||||
|
|
||||||
|
// Special case x{0} handled above.
|
||||||
|
|
||||||
|
// Special case: x{1} is just x.
|
||||||
|
if re.Min == 1 && re.Max == 1 {
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
|
||||||
|
// General case: x{n,m} means n copies of x and m copies of x?
|
||||||
|
// The machine will do less work if we nest the final m copies,
|
||||||
|
// so that x{2,5} = xx(x(x(x)?)?)?
|
||||||
|
|
||||||
|
// Build leading prefix: xx.
|
||||||
|
var prefix *Regexp
|
||||||
|
if re.Min > 0 {
|
||||||
|
prefix = &Regexp{Op: OpConcat}
|
||||||
|
prefix.Sub = prefix.Sub0[:0]
|
||||||
|
for i := 0; i < re.Min; i++ {
|
||||||
|
prefix.Sub = append(prefix.Sub, sub)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build and attach suffix: (x(x(x)?)?)?
|
||||||
|
if re.Max > re.Min {
|
||||||
|
suffix := simplify1(OpQuest, re.Flags, sub, nil)
|
||||||
|
for i := re.Min + 1; i < re.Max; i++ {
|
||||||
|
nre2 := &Regexp{Op: OpConcat}
|
||||||
|
nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
|
||||||
|
suffix = simplify1(OpQuest, re.Flags, nre2, nil)
|
||||||
|
}
|
||||||
|
if prefix == nil {
|
||||||
|
return suffix
|
||||||
|
}
|
||||||
|
prefix.Sub = append(prefix.Sub, suffix)
|
||||||
|
}
|
||||||
|
if prefix != nil {
|
||||||
|
return prefix
|
||||||
|
}
|
||||||
|
|
||||||
|
// Some degenerate case like min > max or min < max < 0.
|
||||||
|
// Handle as impossible match.
|
||||||
|
return &Regexp{Op: OpNoMatch}
|
||||||
|
}
|
||||||
|
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
// simplify1 implements Simplify for the unary OpStar,
|
||||||
|
// OpPlus, and OpQuest operators. It returns the simple regexp
|
||||||
|
// equivalent to
|
||||||
|
//
|
||||||
|
// Regexp{Op: op, Flags: flags, Sub: {sub}}
|
||||||
|
//
|
||||||
|
// under the assumption that sub is already simple, and
|
||||||
|
// without first allocating that structure. If the regexp
|
||||||
|
// to be returned turns out to be equivalent to re, simplify1
|
||||||
|
// returns re instead.
|
||||||
|
//
|
||||||
|
// simplify1 is factored out of Simplify because the implementation
|
||||||
|
// for other operators generates these unary expressions.
|
||||||
|
// Letting them call simplify1 makes sure the expressions they
|
||||||
|
// generate are simple.
|
||||||
|
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
|
||||||
|
// Special case: repeat the empty string as much as
|
||||||
|
// you want, but it's still the empty string.
|
||||||
|
if sub.Op == OpEmptyMatch {
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
// The operators are idempotent if the flags match.
|
||||||
|
if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
|
||||||
|
return sub
|
||||||
|
}
|
||||||
|
if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
|
||||||
|
return re
|
||||||
|
}
|
||||||
|
|
||||||
|
re = &Regexp{Op: op, Flags: flags}
|
||||||
|
re.Sub = append(re.Sub0[:0], sub)
|
||||||
|
return re
|
||||||
|
}
|
151
src/pkg/exp/regexp/syntax/simplify_test.go
Normal file
151
src/pkg/exp/regexp/syntax/simplify_test.go
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package syntax
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
var simplifyTests = []struct {
|
||||||
|
Regexp string
|
||||||
|
Simple string
|
||||||
|
}{
|
||||||
|
// Already-simple constructs
|
||||||
|
{`a`, `a`},
|
||||||
|
{`ab`, `ab`},
|
||||||
|
{`a|b`, `[a-b]`},
|
||||||
|
{`ab|cd`, `ab|cd`},
|
||||||
|
{`(ab)*`, `(ab)*`},
|
||||||
|
{`(ab)+`, `(ab)+`},
|
||||||
|
{`(ab)?`, `(ab)?`},
|
||||||
|
{`.`, `.`},
|
||||||
|
{`^`, `^`},
|
||||||
|
{`$`, `$`},
|
||||||
|
{`[ac]`, `[ac]`},
|
||||||
|
{`[^ac]`, `[^ac]`},
|
||||||
|
|
||||||
|
// Posix character classes
|
||||||
|
{`[[:alnum:]]`, `[0-9A-Za-z]`},
|
||||||
|
{`[[:alpha:]]`, `[A-Za-z]`},
|
||||||
|
{`[[:blank:]]`, `[\t ]`},
|
||||||
|
{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
|
||||||
|
{`[[:digit:]]`, `[0-9]`},
|
||||||
|
{`[[:graph:]]`, `[!-~]`},
|
||||||
|
{`[[:lower:]]`, `[a-z]`},
|
||||||
|
{`[[:print:]]`, `[ -~]`},
|
||||||
|
{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
|
||||||
|
{`[[:space:]]`, `[\t-\r ]`},
|
||||||
|
{`[[:upper:]]`, `[A-Z]`},
|
||||||
|
{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
|
||||||
|
|
||||||
|
// Perl character classes
|
||||||
|
{`\d`, `[0-9]`},
|
||||||
|
{`\s`, `[\t-\n\f-\r ]`},
|
||||||
|
{`\w`, `[0-9A-Z_a-z]`},
|
||||||
|
{`\D`, `[^0-9]`},
|
||||||
|
{`\S`, `[^\t-\n\f-\r ]`},
|
||||||
|
{`\W`, `[^0-9A-Z_a-z]`},
|
||||||
|
{`[\d]`, `[0-9]`},
|
||||||
|
{`[\s]`, `[\t-\n\f-\r ]`},
|
||||||
|
{`[\w]`, `[0-9A-Z_a-z]`},
|
||||||
|
{`[\D]`, `[^0-9]`},
|
||||||
|
{`[\S]`, `[^\t-\n\f-\r ]`},
|
||||||
|
{`[\W]`, `[^0-9A-Z_a-z]`},
|
||||||
|
|
||||||
|
// Posix repetitions
|
||||||
|
{`a{1}`, `a`},
|
||||||
|
{`a{2}`, `aa`},
|
||||||
|
{`a{5}`, `aaaaa`},
|
||||||
|
{`a{0,1}`, `a?`},
|
||||||
|
// The next three are illegible because Simplify inserts (?:)
|
||||||
|
// parens instead of () parens to avoid creating extra
|
||||||
|
// captured subexpressions. The comments show a version with fewer parens.
|
||||||
|
{`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
|
||||||
|
{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
|
||||||
|
{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
|
||||||
|
{`a{0,2}`, `(?:aa?)?`}, // (aa?)?
|
||||||
|
{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
|
||||||
|
{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
|
||||||
|
{`a{0,}`, `a*`},
|
||||||
|
{`a{1,}`, `a+`},
|
||||||
|
{`a{2,}`, `aa+`},
|
||||||
|
{`a{5,}`, `aaaaa+`},
|
||||||
|
|
||||||
|
// Test that operators simplify their arguments.
|
||||||
|
{`(?:a{1,}){1,}`, `a+`},
|
||||||
|
{`(a{1,}b{1,})`, `(a+b+)`},
|
||||||
|
{`a{1,}|b{1,}`, `a+|b+`},
|
||||||
|
{`(?:a{1,})*`, `(?:a+)*`},
|
||||||
|
{`(?:a{1,})+`, `a+`},
|
||||||
|
{`(?:a{1,})?`, `(?:a+)?`},
|
||||||
|
{``, `(?:)`},
|
||||||
|
{`a{0}`, `(?:)`},
|
||||||
|
|
||||||
|
// Character class simplification
|
||||||
|
{`[ab]`, `[a-b]`},
|
||||||
|
{`[a-za-za-z]`, `[a-z]`},
|
||||||
|
{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
|
||||||
|
{`[ABCDEFGH]`, `[A-H]`},
|
||||||
|
{`[AB-CD-EF-GH]`, `[A-H]`},
|
||||||
|
{`[W-ZP-XE-R]`, `[E-Z]`},
|
||||||
|
{`[a-ee-gg-m]`, `[a-m]`},
|
||||||
|
{`[a-ea-ha-m]`, `[a-m]`},
|
||||||
|
{`[a-ma-ha-e]`, `[a-m]`},
|
||||||
|
{`[a-zA-Z0-9 -~]`, `[ -~]`},
|
||||||
|
|
||||||
|
// Empty character classes
|
||||||
|
{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
|
||||||
|
|
||||||
|
// Full character classes
|
||||||
|
{`[[:cntrl:][:^cntrl:]]`, `.`},
|
||||||
|
|
||||||
|
// Unicode case folding.
|
||||||
|
{`(?i)A`, `(?i:A)`},
|
||||||
|
{`(?i)a`, `(?i:a)`},
|
||||||
|
{`(?i)[A]`, `(?i:A)`},
|
||||||
|
{`(?i)[a]`, `(?i:A)`},
|
||||||
|
{`(?i)K`, `(?i:K)`},
|
||||||
|
{`(?i)k`, `(?i:k)`},
|
||||||
|
{`(?i)\x{212a}`, "(?i:\u212A)"},
|
||||||
|
{`(?i)[K]`, "[Kk\u212A]"},
|
||||||
|
{`(?i)[k]`, "[Kk\u212A]"},
|
||||||
|
{`(?i)[\x{212a}]`, "[Kk\u212A]"},
|
||||||
|
{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
|
||||||
|
{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
|
||||||
|
{`(?i)[\x00-\x{10FFFF}]`, `.`},
|
||||||
|
|
||||||
|
// Empty string as a regular expression.
|
||||||
|
// The empty string must be preserved inside parens in order
|
||||||
|
// to make submatches work right, so these tests are less
|
||||||
|
// interesting than they might otherwise be. String inserts
|
||||||
|
// explicit (?:) in place of non-parenthesized empty strings,
|
||||||
|
// to make them easier to spot for other parsers.
|
||||||
|
{`(a|b|)`, `([a-b]|(?:))`},
|
||||||
|
{`(|)`, `()`},
|
||||||
|
{`a()`, `a()`},
|
||||||
|
{`(()|())`, `(()|())`},
|
||||||
|
{`(a|)`, `(a|(?:))`},
|
||||||
|
{`ab()cd()`, `ab()cd()`},
|
||||||
|
{`()`, `()`},
|
||||||
|
{`()*`, `()*`},
|
||||||
|
{`()+`, `()+`},
|
||||||
|
{`()?`, `()?`},
|
||||||
|
{`(){0}`, `(?:)`},
|
||||||
|
{`(){1}`, `()`},
|
||||||
|
{`(){1,}`, `()+`},
|
||||||
|
{`(){0,2}`, `(?:()()?)?`},
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSimplify(t *testing.T) {
|
||||||
|
for _, tt := range simplifyTests {
|
||||||
|
re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s := re.Simplify().String()
|
||||||
|
if s != tt.Simple {
|
||||||
|
t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user