1
0
mirror of https://github.com/golang/go synced 2024-11-17 09:04:44 -07:00

regexp/syntax: use more compact Regexp.String output

Compact the Regexp.String output. It was only ever intended for debugging,
but there are at least some uses in the wild where regexps are built up
using regexp/syntax and then formatted using the String method.
Compact the output to help that use case. Specifically:

 - Compact 2-element character class ranges: [a-b] -> [ab].
 - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E).

Fixes #57950.

Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/507015
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>
This commit is contained in:
Russ Cox 2023-06-28 17:45:26 -04:00 committed by Gopher Robot
parent 5a3048bf0e
commit 98c9f271d6
4 changed files with 248 additions and 37 deletions

View File

@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune {
return r[:w]
}
// inCharClass reports whether the rune r belongs to class, which is a
// flat list of inclusive [lo, hi] pairs: class[2*k] is the low end and
// class[2*k+1] the high end of the k'th range.
// The binary search is only meaningful when the ranges are sorted and
// non-overlapping, i.e. when the class has been cleaned by cleanClass.
func inCharClass(r rune, class []rune) bool {
	pairs := len(class) / 2

	// Find the first range whose upper bound is not below r.
	// Every earlier range ends before r, so it cannot contain r.
	idx := sort.Search(pairs, func(k int) bool {
		return r <= class[2*k+1]
	})
	if idx == pairs {
		// r lies above every range.
		return false
	}

	// r is in the class exactly when the candidate range brackets it.
	return class[2*idx] <= r && r <= class[2*idx+1]
}
// appendLiteral returns the result of appending the literal x to the class r.
func appendLiteral(r []rune, x rune, flags Flags) []rune {
if flags&FoldCase != 0 {

View File

@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) {
}
}
}
// stringTests pairs a regular expression source (re) with the text that
// Regexp.String is expected to produce for it (out) once the expression
// has been parsed with the Perl flags; see TestString.
var stringTests = []struct {
	re, out string
}{
	// Case-insensitive groups with literals before and/or after them.
	{re: `x(?i:ab*c|d?e)1`, out: `x(?i:AB*C|D?E)1`},
	{re: `x(?i:ab*cd?e)1`, out: `x(?i:AB*CD?E)1`},
	{re: `0(?i:ab*c|d?e)1`, out: `(?i:0(?:AB*C|D?E)1)`},
	{re: `0(?i:ab*cd?e)1`, out: `(?i:0AB*CD?E1)`},
	{re: `x(?i:ab*c|d?e)`, out: `x(?i:AB*C|D?E)`},
	{re: `x(?i:ab*cd?e)`, out: `x(?i:AB*CD?E)`},
	{re: `0(?i:ab*c|d?e)`, out: `(?i:0(?:AB*C|D?E))`},
	{re: `0(?i:ab*cd?e)`, out: `(?i:0AB*CD?E)`},
	{re: `(?i:ab*c|d?e)1`, out: `(?i:(?:AB*C|D?E)1)`},
	{re: `(?i:ab*cd?e)1`, out: `(?i:AB*CD?E1)`},
	{re: `(?i:ab)[123](?i:cd)`, out: `(?i:AB[1-3]CD)`},
	{re: `(?i:ab*c|d?e)`, out: `(?i:AB*C|D?E)`},

	// Two-case character classes written out by hand.
	{re: `[Aa][Bb]`, out: `(?i:AB)`},
	{re: `[Aa][Bb]*[Cc]`, out: `(?i:AB*C)`},
	{re: `A(?:[Bb][Cc]|[Dd])[Zz]`, out: `A(?i:(?:BC|D)Z)`},
	{re: `[Aa](?:[Bb][Cc]|[Dd])Z`, out: `(?i:A(?:BC|D))Z`},
}
// TestString parses every pattern in stringTests with the Perl flags and
// checks that formatting the result with String yields the expected text.
// A pattern that fails to parse is reported and skipped so that the
// remaining table entries still run.
func TestString(t *testing.T) {
	for i := range stringTests {
		tc := &stringTests[i]

		parsed, err := Parse(tc.re, Perl)
		if err != nil {
			t.Errorf("Parse(%#q): %v", tc.re, err)
			continue
		}

		if got := parsed.String(); got != tc.out {
			t.Errorf("Parse(%#q).String() = %#q, want %#q", tc.re, got, tc.out)
		}
	}
}

View File

@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool {
return true
}
// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
// flagI, flagM and flagS request an opening "(?...:" group with that flag set,
// flagOff requests the closing paren after the regexp, and flagPrec requests a
// plain "(?:" ... ")" wrapper. A flag bit shifted left by negShift stands for
// the negated form of that flag.
type printFlags uint8
const (
// The constants below take consecutive bits starting at 1<<0, so their
// order is significant: flagPrec ends up as 1<<4.
flagI printFlags = 1 << iota // (?i:
flagM // (?m:
flagS // (?s:
flagOff // )
flagPrec // (?: )
// negShift moves flagI, flagM and flagS past flagPrec into the top three
// bits of the uint8 (1<<5, 1<<6, 1<<7), so a negated flag never collides
// with a positive one.
negShift = 5 // flagI<<negShift is (?-i:
)
// addSpan records that the flags f must be printed around the run of
// subexpressions beginning at start and ending at last: f is stored as the
// entry for start, and flagOff (the closing paren) is added to the entry
// for last. The map behind flags is created on first use.
func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
	m := *flags
	if m == nil {
		m = make(map[*Regexp]printFlags)
		*flags = m
	}

	m[start] = f
	// OR rather than assign: when start == last the entry already holds f
	// and must keep it alongside the closing marker.
	m[last] |= flagOff
}
// calcFlags calculates the flags to print around each subexpression in re,
// storing that information in (*flags)[sub] for each affected subexpression.
// The first time an entry needs to be written to *flags, the map is allocated
// (by addSpan).
//
// calcFlags returns two bit sets describing re as a whole: must holds the
// flags that have to be active around re, and cant holds the flags that
// can't be active around re. A flag in neither set does not matter to re.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
	switch re.Op {
	default:
		return 0, 0

	case OpLiteral:
		// If the literal is fold-sensitive, return (flagI, 0) or (0, flagI)
		// according to whether (?i) is active.
		// If the literal is not fold-sensitive, return 0, 0.
		for _, r := range re.Rune {
			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
				if re.Flags&FoldCase != 0 {
					return flagI, 0
				}
				return 0, flagI
			}
		}
		return 0, 0

	case OpCharClass:
		// A class stores its ranges as lo, hi pairs. An odd-length list is
		// malformed; writeRegexp prints it as a fixed placeholder, so it needs
		// no flags. Bail out here rather than index past the end of re.Rune.
		if len(re.Rune)%2 != 0 {
			return 0, 0
		}

		// If the class is fold-sensitive, meaning some rune in it has a case
		// variant outside the class, return 0, flagI: (?i) has been compiled out.
		// If the class is not fold-sensitive, return 0, 0.
		for i := 0; i < len(re.Rune); i += 2 {
			// Only runes in [minFold, maxFold] are examined.
			lo := max(minFold, re.Rune[i])
			hi := min(maxFold, re.Rune[i+1])
			for r := lo; r <= hi; r++ {
				// Walk the fold orbit of r until it cycles back to r.
				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
					// Check the current range first (cheap), then the whole class.
					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
						return 0, flagI
					}
				}
			}
		}
		return 0, 0

	case OpAnyCharNotNL: // (?-s).
		return 0, flagS

	case OpAnyChar: // (?s).
		return flagS, 0

	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
		return flagM, 0

	case OpEndText:
		if re.Flags&WasDollar != 0 { // (?-m)$
			return 0, flagM
		}
		return 0, 0

	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
		// Single-operand operators have exactly the needs of their operand.
		return calcFlags(re.Sub[0], flags)

	case OpConcat, OpAlternate:
		// Gather the must and cant for each subexpression.
		// When we find a conflicting subexpression, insert the necessary
		// flags around the previously identified span and start over.
		//
		// must and cant (the named results, zero on entry) accumulate the
		// needs of the current span; allCant accumulates cant over every
		// subexpression, including those in spans already closed.
		var allCant printFlags
		start := 0   // index of the first subexpression in the current span
		last := 0    // index of the latest subexpression that required a flag
		did := false // whether any conflict has been seen
		for i, sub := range re.Sub {
			subMust, subCant := calcFlags(sub, flags)
			if must&subCant != 0 || subMust&cant != 0 {
				// sub cannot share flags with the current span: close it.
				if must != 0 {
					addSpan(re.Sub[start], re.Sub[last], must, flags)
				}
				must = 0
				cant = 0
				start = i
				did = true
			}
			must |= subMust
			cant |= subCant
			allCant |= subCant
			if subMust != 0 {
				last = i
			}
			if must == 0 && start == i {
				// Nothing in the span needs a flag yet, so do not let the
				// span begin at a subexpression that has no requirements.
				start++
			}
		}
		if !did {
			// No conflicts: pass the accumulated must and cant upward.
			return must, cant
		}
		if must != 0 {
			// Conflicts found; need to finish final span.
			addSpan(re.Sub[start], re.Sub[last], must, flags)
		}
		// Every required flag is now printed inside re, so nothing must be
		// active around it; only the prohibitions propagate upward.
		return 0, allCant
	}
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp) {
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
f |= flags[re]
if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
// flagPrec is redundant with other flags being added and terminated
f &^= flagPrec
}
if f&^(flagOff|flagPrec) != 0 {
b.WriteString(`(?`)
if f&flagI != 0 {
b.WriteString(`i`)
}
if f&flagM != 0 {
b.WriteString(`m`)
}
if f&flagS != 0 {
b.WriteString(`s`)
}
if f&((flagM|flagS)<<negShift) != 0 {
b.WriteString(`-`)
if f&(flagM<<negShift) != 0 {
b.WriteString(`m`)
}
if f&(flagS<<negShift) != 0 {
b.WriteString(`s`)
}
}
b.WriteString(`:`)
}
if f&flagOff != 0 {
defer b.WriteString(`)`)
}
if f&flagPrec != 0 {
b.WriteString(`(?:`)
defer b.WriteString(`)`)
}
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
if re.Flags&FoldCase != 0 {
b.WriteString(`(?i:`)
}
for _, r := range re.Rune {
escape(b, r, false)
}
if re.Flags&FoldCase != 0 {
b.WriteString(`)`)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL:
b.WriteString(`(?-s:.)`)
case OpAnyChar:
b.WriteString(`(?s:.)`)
case OpAnyCharNotNL, OpAnyChar:
b.WriteString(`.`)
case OpBeginLine:
b.WriteString(`(?m:^)`)
b.WriteString(`^`)
case OpEndLine:
b.WriteString(`(?m:$)`)
b.WriteString(`$`)
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`(?-m:$)`)
b.WriteString(`$`)
} else {
b.WriteString(`\z`)
}
@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
b.WriteRune('(')
}
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0])
writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
p := printFlags(0)
sub := re.Sub[0]
if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
switch re.Op {
case OpStar:
b.WriteRune('*')
@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
}
case OpConcat:
for _, sub := range re.Sub {
p := printFlags(0)
if sub.Op == OpAlternate {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
p = flagPrec
}
writeRegexp(b, sub, p, flags)
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub)
writeRegexp(b, sub, 0, flags)
}
}
}
func (re *Regexp) String() string {
var b strings.Builder
writeRegexp(&b, re)
var flags map[*Regexp]printFlags
must, cant := calcFlags(re, &flags)
must |= (cant &^ flagI) << negShift
if must != 0 {
must |= flagOff
}
writeRegexp(&b, re, must, flags)
return b.String()
}

View File

@ -13,7 +13,7 @@ var simplifyTests = []struct {
// Already-simple constructs
{`a`, `a`},
{`ab`, `ab`},
{`a|b`, `[a-b]`},
{`a|b`, `[ab]`},
{`ab|cd`, `ab|cd`},
{`(ab)*`, `(ab)*`},
{`(ab)+`, `(ab)+`},
@ -40,16 +40,16 @@ var simplifyTests = []struct {
// Perl character classes
{`\d`, `[0-9]`},
{`\s`, `[\t-\n\f-\r ]`},
{`\s`, `[\t\n\f\r ]`},
{`\w`, `[0-9A-Z_a-z]`},
{`\D`, `[^0-9]`},
{`\S`, `[^\t-\n\f-\r ]`},
{`\S`, `[^\t\n\f\r ]`},
{`\W`, `[^0-9A-Z_a-z]`},
{`[\d]`, `[0-9]`},
{`[\s]`, `[\t-\n\f-\r ]`},
{`[\s]`, `[\t\n\f\r ]`},
{`[\w]`, `[0-9A-Z_a-z]`},
{`[\D]`, `[^0-9]`},
{`[\S]`, `[^\t-\n\f-\r ]`},
{`[\S]`, `[^\t\n\f\r ]`},
{`[\W]`, `[^0-9A-Z_a-z]`},
// Posix repetitions
@ -82,7 +82,8 @@ var simplifyTests = []struct {
{`a{0}`, `(?:)`},
// Character class simplification
{`[ab]`, `[a-b]`},
{`[ab]`, `[ab]`},
{`[abc]`, `[a-c]`},
{`[a-za-za-z]`, `[a-z]`},
{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
{`[ABCDEFGH]`, `[A-H]`},
@ -120,7 +121,8 @@ var simplifyTests = []struct {
// interesting than they might otherwise be. String inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{`(a|b|)`, `([a-b]|(?:))`},
{`(a|b|c|)`, `([a-c]|(?:))`},
{`(a|b|)`, `([ab]|(?:))`},
{`(|)`, `()`},
{`a()`, `a()`},
{`(()|())`, `(()|())`},