regexp/syntax: use more compact Regexp.String output

Compact the Regexp.String output. It was only ever intended for debugging, but there are at least some uses in the wild where regexps are built up using regexp/syntax and then formatted using the String method. Compact the output to help that use case. Specifically: - Compact 2-element character class ranges: [a-b] -> [ab]. - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). Fixes #57950. Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Than McIntosh <thanm@google.com> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Rob Pike <r@golang.org> Auto-Submit: Russ Cox <rsc@golang.org>
2024-11-17 09:04:44 -07:00 · 2023-06-28 17:45:26 -04:00 · 2023-06-28 17:45:26 -04:00 · 98c9f271d6
commit 98c9f271d6
parent 5a3048bf0e
4 changed files with 248 additions and 37 deletions
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune {
 	return r[:w]
 }

+// inCharClass reports whether r is in the class.
+// It assumes the class has been cleaned by cleanClass.
+func inCharClass(r rune, class []rune) bool {
+	_, ok := sort.Find(len(class)/2, func(i int) int {
+		lo, hi := class[2*i], class[2*i+1]
+		if r > hi {
+			return +1
+		}
+		if r < lo {
+			return -1
+		}
+		return 0
+	})
+	return ok
+}
+
 // appendLiteral returns the result of appending the literal x to the class r.
 func appendLiteral(r []rune, x rune, flags Flags) []rune {
 	if flags&FoldCase != 0 {
--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) {
 		}
 	}
 }
+
+var stringTests = []struct {
+	re  string
+	out string
+}{
+	{`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
+	{`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
+	{`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
+	{`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
+	{`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
+	{`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
+	{`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
+	{`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
+	{`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
+	{`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
+	{`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
+	{`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
+	{`[Aa][Bb]`, `(?i:AB)`},
+	{`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
+	{`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
+	{`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
+}
+
+func TestString(t *testing.T) {
+	for _, tt := range stringTests {
+		re, err := Parse(tt.re, Perl)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.re, err)
+			continue
+		}
+		out := re.String()
+		if out != tt.out {
+			t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
+		}
+	}
+}
--- a/src/regexp/syntax/regexp.go
+++ b/src/regexp/syntax/regexp.go
@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool {
 	return true
 }

+// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
+type printFlags uint8
+
+const (
+	flagI    printFlags = 1 << iota // (?i:
+	flagM                           // (?m:
+	flagS                           // (?s:
+	flagOff                         // )
+	flagPrec                        // (?: )
+	negShift = 5                    // flagI<<negShift is (?-i:
+)
+
+// addSpan enables the flags f around start..last,
+// by setting flags[start] = f and flags[last] = flagOff.
+func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
+	if *flags == nil {
+		*flags = make(map[*Regexp]printFlags)
+	}
+	(*flags)[start] = f
+	(*flags)[last] |= flagOff // maybe start==last
+}
+
+// calcFlags calculates the flags to print around each subexpression in re,
+// storing that information in (*flags)[sub] for each affected subexpression.
+// The first time an entry needs to be written to *flags, calcFlags allocates the map.
+// calcFlags also calculates the flags that must be active or can't be active
+// around re and returns those flags.
+func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
+	switch re.Op {
+	default:
+		return 0, 0
+
+	case OpLiteral:
+		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
+		// according to whether (?i) is active.
+		// If literal is not fold-sensitive, return 0, 0.
+		for _, r := range re.Rune {
+			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
+				if re.Flags&FoldCase != 0 {
+					return flagI, 0
+				} else {
+					return 0, flagI
+				}
+			}
+		}
+		return 0, 0
+
+	case OpCharClass:
+		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
+		// If literal is not fold-sensitive, return 0, 0.
+		for i := 0; i < len(re.Rune); i += 2 {
+			lo := max(minFold, re.Rune[i])
+			hi := min(maxFold, re.Rune[i+1])
+			for r := lo; r <= hi; r++ {
+				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
+					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
+						return 0, flagI
+					}
+				}
+			}
+		}
+		return 0, 0
+
+	case OpAnyCharNotNL: // (?-s).
+		return 0, flagS
+
+	case OpAnyChar: // (?s).
+		return flagS, 0
+
+	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
+		return flagM, 0
+
+	case OpEndText:
+		if re.Flags&WasDollar != 0 { // (?-m)$
+			return 0, flagM
+		}
+		return 0, 0
+
+	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
+		return calcFlags(re.Sub[0], flags)
+
+	case OpConcat, OpAlternate:
+		// Gather the must and cant for each subexpression.
+		// When we find a conflicting subexpression, insert the necessary
+		// flags around the previously identified span and start over.
+		var must, cant, allCant printFlags
+		start := 0
+		last := 0
+		did := false
+		for i, sub := range re.Sub {
+			subMust, subCant := calcFlags(sub, flags)
+			if must&subCant != 0 || subMust&cant != 0 {
+				if must != 0 {
+					addSpan(re.Sub[start], re.Sub[last], must, flags)
+				}
+				must = 0
+				cant = 0
+				start = i
+				did = true
+			}
+			must |= subMust
+			cant |= subCant
+			allCant |= subCant
+			if subMust != 0 {
+				last = i
+			}
+			if must == 0 && start == i {
+				start++
+			}
+		}
+		if !did {
+			// No conflicts: pass the accumulated must and cant upward.
+			return must, cant
+		}
+		if must != 0 {
+			// Conflicts found; need to finish final span.
+			addSpan(re.Sub[start], re.Sub[last], must, flags)
+		}
+		return 0, allCant
+	}
+}
+
 // writeRegexp writes the Perl syntax for the regular expression re to b.
-func writeRegexp(b *strings.Builder, re *Regexp) {
+func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
+	f |= flags[re]
+	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
+		// flagPrec is redundant with other flags being added and terminated
+		f &^= flagPrec
+	}
+	if f&^(flagOff|flagPrec) != 0 {
+		b.WriteString(`(?`)
+		if f&flagI != 0 {
+			b.WriteString(`i`)
+		}
+		if f&flagM != 0 {
+			b.WriteString(`m`)
+		}
+		if f&flagS != 0 {
+			b.WriteString(`s`)
+		}
+		if f&((flagM|flagS)<<negShift) != 0 {
+			b.WriteString(`-`)
+			if f&(flagM<<negShift) != 0 {
+				b.WriteString(`m`)
+			}
+			if f&(flagS<<negShift) != 0 {
+				b.WriteString(`s`)
+			}
+		}
+		b.WriteString(`:`)
+	}
+	if f&flagOff != 0 {
+		defer b.WriteString(`)`)
+	}
+	if f&flagPrec != 0 {
+		b.WriteString(`(?:`)
+		defer b.WriteString(`)`)
+	}
+
 	switch re.Op {
 	default:
 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 	case OpEmptyMatch:
 		b.WriteString(`(?:)`)
 	case OpLiteral:
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`(?i:`)
-		}
 		for _, r := range re.Rune {
 			escape(b, r, false)
 		}
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`)`)
-		}
 	case OpCharClass:
 		if len(re.Rune)%2 != 0 {
 			b.WriteString(`[invalid char class]`)
@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i], re.Rune[i+1]
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
 		}
 		b.WriteRune(']')
-	case OpAnyCharNotNL:
-		b.WriteString(`(?-s:.)`)
-	case OpAnyChar:
-		b.WriteString(`(?s:.)`)
+	case OpAnyCharNotNL, OpAnyChar:
+		b.WriteString(`.`)
 	case OpBeginLine:
-		b.WriteString(`(?m:^)`)
+		b.WriteString(`^`)
 	case OpEndLine:
-		b.WriteString(`(?m:$)`)
+		b.WriteString(`$`)
 	case OpBeginText:
 		b.WriteString(`\A`)
 	case OpEndText:
 		if re.Flags&WasDollar != 0 {
-			b.WriteString(`(?-m:$)`)
+			b.WriteString(`$`)
 		} else {
 			b.WriteString(`\z`)
 		}
@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 			b.WriteRune('(')
 		}
 		if re.Sub[0].Op != OpEmptyMatch {
-			writeRegexp(b, re.Sub[0])
+			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
 		}
 		b.WriteRune(')')
 	case OpStar, OpPlus, OpQuest, OpRepeat:
-		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
-			b.WriteString(`(?:`)
-			writeRegexp(b, sub)
-			b.WriteString(`)`)
-		} else {
-			writeRegexp(b, sub)
+		p := printFlags(0)
+		sub := re.Sub[0]
+		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+			p = flagPrec
 		}
+		writeRegexp(b, sub, p, flags)
+
 		switch re.Op {
 		case OpStar:
 			b.WriteRune('*')
@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 		}
 	case OpConcat:
 		for _, sub := range re.Sub {
+			p := printFlags(0)
 			if sub.Op == OpAlternate {
-				b.WriteString(`(?:`)
-				writeRegexp(b, sub)
-				b.WriteString(`)`)
-			} else {
-				writeRegexp(b, sub)
+				p = flagPrec
 			}
+			writeRegexp(b, sub, p, flags)
 		}
 	case OpAlternate:
 		for i, sub := range re.Sub {
 			if i > 0 {
 				b.WriteRune('|')
 			}
-			writeRegexp(b, sub)
+			writeRegexp(b, sub, 0, flags)
 		}
 	}
 }

 func (re *Regexp) String() string {
 	var b strings.Builder
-	writeRegexp(&b, re)
+	var flags map[*Regexp]printFlags
+	must, cant := calcFlags(re, &flags)
+	must |= (cant &^ flagI) << negShift
+	if must != 0 {
+		must |= flagOff
+	}
+	writeRegexp(&b, re, must, flags)
 	return b.String()
 }

--- a/src/regexp/syntax/simplify_test.go
+++ b/src/regexp/syntax/simplify_test.go
@ -13,7 +13,7 @@ var simplifyTests = []struct {
 	// Already-simple constructs
 	{`a`, `a`},
 	{`ab`, `ab`},
-	{`a|b`, `[a-b]`},
+	{`a|b`, `[ab]`},
 	{`ab|cd`, `ab|cd`},
 	{`(ab)*`, `(ab)*`},
 	{`(ab)+`, `(ab)+`},
@ -40,16 +40,16 @@ var simplifyTests = []struct {

 	// Perl character classes
 	{`\d`, `[0-9]`},
-	{`\s`, `[\t-\n\f-\r ]`},
+	{`\s`, `[\t\n\f\r ]`},
 	{`\w`, `[0-9A-Z_a-z]`},
 	{`\D`, `[^0-9]`},
-	{`\S`, `[^\t-\n\f-\r ]`},
+	{`\S`, `[^\t\n\f\r ]`},
 	{`\W`, `[^0-9A-Z_a-z]`},
 	{`[\d]`, `[0-9]`},
-	{`[\s]`, `[\t-\n\f-\r ]`},
+	{`[\s]`, `[\t\n\f\r ]`},
 	{`[\w]`, `[0-9A-Z_a-z]`},
 	{`[\D]`, `[^0-9]`},
-	{`[\S]`, `[^\t-\n\f-\r ]`},
+	{`[\S]`, `[^\t\n\f\r ]`},
 	{`[\W]`, `[^0-9A-Z_a-z]`},

 	// Posix repetitions
@ -82,7 +82,8 @@ var simplifyTests = []struct {
 	{`a{0}`, `(?:)`},

 	// Character class simplification
-	{`[ab]`, `[a-b]`},
+	{`[ab]`, `[ab]`},
+	{`[abc]`, `[a-c]`},
 	{`[a-za-za-z]`, `[a-z]`},
 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
 	{`[ABCDEFGH]`, `[A-H]`},
@ -120,7 +121,8 @@ var simplifyTests = []struct {
 	// interesting than they might otherwise be. String inserts
 	// explicit (?:) in place of non-parenthesized empty strings,
 	// to make them easier to spot for other parsers.
-	{`(a|b|)`, `([a-b]|(?:))`},
+	{`(a|b|c|)`, `([a-c]|(?:))`},
+	{`(a|b|)`, `([ab]|(?:))`},
 	{`(|)`, `()`},
 	{`a()`, `a()`},
 	{`(()|())`, `(()|())`},