mirror of
https://github.com/golang/go
synced 2024-11-22 01:14:40 -07:00
exp/regexp/syntax: case-folding in character classes
Also fix \x{123} parsing. R=r CC=golang-dev https://golang.org/cl/4632052
This commit is contained in:
parent
6b648cafde
commit
52cd055f91
@ -81,6 +81,7 @@ type parser struct {
|
|||||||
stack []*Regexp // stack of parsed expressions
|
stack []*Regexp // stack of parsed expressions
|
||||||
numCap int // number of capturing groups seen
|
numCap int // number of capturing groups seen
|
||||||
wholeRegexp string
|
wholeRegexp string
|
||||||
|
tmpClass []int // temporary char class work space
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse stack manipulation.
|
// Parse stack manipulation.
|
||||||
@ -371,7 +372,6 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
|
|||||||
if r != nil {
|
if r != nil {
|
||||||
re.Rune = r
|
re.Rune = r
|
||||||
t = rest
|
t = rest
|
||||||
// TODO: Handle FoldCase flag.
|
|
||||||
p.push(re)
|
p.push(re)
|
||||||
break BigSwitch
|
break BigSwitch
|
||||||
}
|
}
|
||||||
@ -729,6 +729,7 @@ Switch:
|
|||||||
if r > unicode.MaxRune {
|
if r > unicode.MaxRune {
|
||||||
break Switch
|
break Switch
|
||||||
}
|
}
|
||||||
|
nhex++
|
||||||
}
|
}
|
||||||
if nhex == 0 {
|
if nhex == 0 {
|
||||||
break Switch
|
break Switch
|
||||||
@ -801,12 +802,7 @@ func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string
|
|||||||
if g.sign == 0 {
|
if g.sign == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if g.sign < 0 {
|
return p.appendGroup(r, g), s[2:]
|
||||||
r = appendNegatedClass(r, g.class)
|
|
||||||
} else {
|
|
||||||
r = appendClass(r, g.class)
|
|
||||||
}
|
|
||||||
return r, s[2:]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
|
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
|
||||||
@ -827,23 +823,40 @@ func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err
|
|||||||
if g.sign == 0 {
|
if g.sign == 0 {
|
||||||
return nil, "", &Error{ErrInvalidCharRange, name}
|
return nil, "", &Error{ErrInvalidCharRange, name}
|
||||||
}
|
}
|
||||||
if g.sign < 0 {
|
return p.appendGroup(r, g), s, nil
|
||||||
r = appendNegatedClass(r, g.class)
|
|
||||||
} else {
|
|
||||||
r = appendClass(r, g.class)
|
|
||||||
}
|
|
||||||
return r, s, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// unicodeTable returns the unicode.RangeTable identified by name.
|
func (p *parser) appendGroup(r []int, g charGroup) []int {
|
||||||
func unicodeTable(name string) *unicode.RangeTable {
|
if p.flags&FoldCase == 0 {
|
||||||
|
if g.sign < 0 {
|
||||||
|
r = appendNegatedClass(r, g.class)
|
||||||
|
} else {
|
||||||
|
r = appendClass(r, g.class)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
tmp := p.tmpClass[:0]
|
||||||
|
tmp = appendFoldedClass(tmp, g.class)
|
||||||
|
p.tmpClass = tmp
|
||||||
|
tmp = cleanClass(&p.tmpClass)
|
||||||
|
if g.sign < 0 {
|
||||||
|
r = appendNegatedClass(r, tmp)
|
||||||
|
} else {
|
||||||
|
r = appendClass(r, tmp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
// unicodeTable returns the unicode.RangeTable identified by name
|
||||||
|
// and the table of additional fold-equivalent code points.
|
||||||
|
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
|
||||||
if t := unicode.Categories[name]; t != nil {
|
if t := unicode.Categories[name]; t != nil {
|
||||||
return t
|
return t, unicode.FoldCategory[name]
|
||||||
}
|
}
|
||||||
if t := unicode.Scripts[name]; t != nil {
|
if t := unicode.Scripts[name]; t != nil {
|
||||||
return t
|
return t, unicode.FoldScript[name]
|
||||||
}
|
}
|
||||||
return nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
|
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
|
||||||
@ -891,14 +904,31 @@ func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, e
|
|||||||
name = name[1:]
|
name = name[1:]
|
||||||
}
|
}
|
||||||
|
|
||||||
tab := unicodeTable(name)
|
tab, fold := unicodeTable(name)
|
||||||
if tab == nil {
|
if tab == nil {
|
||||||
return nil, "", &Error{ErrInvalidCharRange, seq}
|
return nil, "", &Error{ErrInvalidCharRange, seq}
|
||||||
}
|
}
|
||||||
if sign > 0 {
|
|
||||||
r = appendTable(r, tab)
|
if p.flags&FoldCase == 0 || fold == nil {
|
||||||
|
if sign > 0 {
|
||||||
|
r = appendTable(r, tab)
|
||||||
|
} else {
|
||||||
|
r = appendNegatedTable(r, tab)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
r = appendNegatedTable(r, tab)
|
// Merge and clean tab and fold in a temporary buffer.
|
||||||
|
// This is necessary for the negative case and just tidy
|
||||||
|
// for the positive case.
|
||||||
|
tmp := p.tmpClass[:0]
|
||||||
|
tmp = appendTable(tmp, tab)
|
||||||
|
tmp = appendTable(tmp, fold)
|
||||||
|
p.tmpClass = tmp
|
||||||
|
tmp = cleanClass(&p.tmpClass)
|
||||||
|
if sign > 0 {
|
||||||
|
r = appendClass(r, tmp)
|
||||||
|
} else {
|
||||||
|
r = appendNegatedClass(r, tmp)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return r, t, nil
|
return r, t, nil
|
||||||
}
|
}
|
||||||
@ -979,7 +1009,11 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
|
|||||||
return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
|
return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class = appendRange(class, lo, hi)
|
if p.flags&FoldCase == 0 {
|
||||||
|
class = appendRange(class, lo, hi)
|
||||||
|
} else {
|
||||||
|
class = appendFoldedRange(class, lo, hi)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
t = t[1:] // chop ]
|
t = t[1:] // chop ]
|
||||||
|
|
||||||
@ -999,10 +1033,15 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
|
|||||||
// cleanClass sorts the ranges (pairs of elements of r),
|
// cleanClass sorts the ranges (pairs of elements of r),
|
||||||
// merges them, and eliminates duplicates.
|
// merges them, and eliminates duplicates.
|
||||||
func cleanClass(rp *[]int) []int {
|
func cleanClass(rp *[]int) []int {
|
||||||
|
|
||||||
// Sort by lo increasing, hi decreasing to break ties.
|
// Sort by lo increasing, hi decreasing to break ties.
|
||||||
sort.Sort(ranges{rp})
|
sort.Sort(ranges{rp})
|
||||||
|
|
||||||
r := *rp
|
r := *rp
|
||||||
|
if len(r) < 2 {
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
// Merge abutting, overlapping.
|
// Merge abutting, overlapping.
|
||||||
w := 2 // write index
|
w := 2 // write index
|
||||||
for i := 2; i < len(r); i += 2 {
|
for i := 2; i < len(r); i += 2 {
|
||||||
@ -1025,23 +1064,71 @@ func cleanClass(rp *[]int) []int {
|
|||||||
|
|
||||||
// appendRange returns the result of appending the range lo-hi to the class r.
|
// appendRange returns the result of appending the range lo-hi to the class r.
|
||||||
func appendRange(r []int, lo, hi int) []int {
|
func appendRange(r []int, lo, hi int) []int {
|
||||||
// Expand last range if overlaps or abuts.
|
// Expand last range or next to last range if it overlaps or abuts.
|
||||||
if n := len(r); n > 0 {
|
// Checking two ranges helps when appending case-folded
|
||||||
rlo, rhi := r[n-2], r[n-1]
|
// alphabets, so that one range can be expanding A-Z and the
|
||||||
if lo <= rhi+1 && rlo <= hi+1 {
|
// other expanding a-z.
|
||||||
if lo < rlo {
|
n := len(r)
|
||||||
r[n-2] = lo
|
for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
|
||||||
|
if n >= i {
|
||||||
|
rlo, rhi := r[n-i], r[n-i+1]
|
||||||
|
if lo <= rhi+1 && rlo <= hi+1 {
|
||||||
|
if lo < rlo {
|
||||||
|
r[n-i] = lo
|
||||||
|
}
|
||||||
|
if hi > rhi {
|
||||||
|
r[n-i+1] = hi
|
||||||
|
}
|
||||||
|
return r
|
||||||
}
|
}
|
||||||
if hi > rhi {
|
|
||||||
r[n-1] = hi
|
|
||||||
}
|
|
||||||
return r
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return append(r, lo, hi)
|
return append(r, lo, hi)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
// minimum and maximum runes involved in folding.
|
||||||
|
// checked during test.
|
||||||
|
minFold = 0x0041
|
||||||
|
maxFold = 0x1044f
|
||||||
|
)
|
||||||
|
|
||||||
|
// appendFoldedRange returns the result of appending the range lo-hi
|
||||||
|
// and its case folding-equivalent runes to the class r.
|
||||||
|
func appendFoldedRange(r []int, lo, hi int) []int {
|
||||||
|
// Optimizations.
|
||||||
|
if lo <= minFold && hi >= maxFold {
|
||||||
|
// Range is full: folding can't add more.
|
||||||
|
return appendRange(r, lo, hi)
|
||||||
|
}
|
||||||
|
if hi < minFold || lo > maxFold {
|
||||||
|
// Range is outside folding possibilities.
|
||||||
|
return appendRange(r, lo, hi)
|
||||||
|
}
|
||||||
|
if lo < minFold {
|
||||||
|
// [lo, minFold-1] needs no folding.
|
||||||
|
r = appendRange(r, lo, minFold-1)
|
||||||
|
lo = minFold
|
||||||
|
}
|
||||||
|
if hi > maxFold {
|
||||||
|
// [maxFold+1, hi] needs no folding.
|
||||||
|
r = appendRange(r, maxFold+1, hi)
|
||||||
|
hi = maxFold
|
||||||
|
}
|
||||||
|
|
||||||
|
// Brute force. Depend on appendRange to coalesce ranges on the fly.
|
||||||
|
for c := lo; c <= hi; c++ {
|
||||||
|
r = appendRange(r, c, c)
|
||||||
|
f := unicode.SimpleFold(c)
|
||||||
|
for f != c {
|
||||||
|
r = appendRange(r, f, f)
|
||||||
|
f = unicode.SimpleFold(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
// appendClass returns the result of appending the class x to the class r.
|
// appendClass returns the result of appending the class x to the class r.
|
||||||
// It assumes x is clean.
|
// It assumes x is clean.
|
||||||
func appendClass(r []int, x []int) []int {
|
func appendClass(r []int, x []int) []int {
|
||||||
@ -1051,6 +1138,14 @@ func appendClass(r []int, x []int) []int {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// appendFoldedClass returns the result of appending the case folding of the class x to the class r.
|
||||||
|
func appendFoldedClass(r []int, x []int) []int {
|
||||||
|
for i := 0; i < len(x); i += 2 {
|
||||||
|
r = appendFoldedRange(r, x[i], x[i+1])
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
|
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
|
||||||
// It assumes x is clean.
|
// It assumes x is clean.
|
||||||
func appendNegatedClass(r []int, x []int) []int {
|
func appendNegatedClass(r []int, x []int) []int {
|
||||||
|
@ -74,18 +74,18 @@ var parseTests = []struct {
|
|||||||
{"[a-z]", "cc{0x61-0x7a}"},
|
{"[a-z]", "cc{0x61-0x7a}"},
|
||||||
{"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
|
{"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
|
||||||
{"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
|
{"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
|
||||||
// { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
{"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
|
||||||
// { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
{"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
|
||||||
// { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
{"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
|
||||||
// { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
{"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
|
||||||
{"\\d", "cc{0x30-0x39}"},
|
{"\\d", "cc{0x30-0x39}"},
|
||||||
{"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
|
{"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
|
||||||
{"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
|
{"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
|
||||||
{"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
|
{"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
|
||||||
{"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
|
{"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
|
||||||
{"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
|
{"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
|
||||||
// { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
|
{"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
|
||||||
// { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
{"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
|
||||||
{"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
|
{"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
|
||||||
// { "\\C", "byte{}" },
|
// { "\\C", "byte{}" },
|
||||||
|
|
||||||
@ -100,6 +100,13 @@ var parseTests = []struct {
|
|||||||
{"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
|
{"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
|
||||||
{"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
|
{"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
|
||||||
{"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
|
{"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
|
||||||
|
{"\\p{Lu}", mkCharClass(unicode.IsUpper)},
|
||||||
|
{"[\\p{Lu}]", mkCharClass(unicode.IsUpper)},
|
||||||
|
{"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)},
|
||||||
|
|
||||||
|
// Hex, octal.
|
||||||
|
{"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"},
|
||||||
|
{"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"},
|
||||||
|
|
||||||
// More interesting regular expressions.
|
// More interesting regular expressions.
|
||||||
// { "a{,2}", "str{a{,2}}" },
|
// { "a{,2}", "str{a{,2}}" },
|
||||||
@ -270,3 +277,69 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) {
|
|||||||
}
|
}
|
||||||
b.WriteByte('}')
|
b.WriteByte('}')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mkCharClass(f func(int) bool) string {
|
||||||
|
re := &Regexp{Op: OpCharClass}
|
||||||
|
lo := -1
|
||||||
|
for i := 0; i <= unicode.MaxRune; i++ {
|
||||||
|
if f(i) {
|
||||||
|
if lo < 0 {
|
||||||
|
lo = i
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if lo >= 0 {
|
||||||
|
re.Rune = append(re.Rune, lo, i-1)
|
||||||
|
lo = -1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if lo >= 0 {
|
||||||
|
re.Rune = append(re.Rune, lo, unicode.MaxRune)
|
||||||
|
}
|
||||||
|
return dump(re)
|
||||||
|
}
|
||||||
|
|
||||||
|
func isUpperFold(rune int) bool {
|
||||||
|
if unicode.IsUpper(rune) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
c := unicode.SimpleFold(rune)
|
||||||
|
for c != rune {
|
||||||
|
if unicode.IsUpper(c) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
c = unicode.SimpleFold(c)
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFoldConstants(t *testing.T) {
|
||||||
|
last := -1
|
||||||
|
for i := 0; i <= unicode.MaxRune; i++ {
|
||||||
|
if unicode.SimpleFold(i) == i {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if last == -1 && minFold != i {
|
||||||
|
t.Errorf("minFold=%#U should be %#U", minFold, i)
|
||||||
|
}
|
||||||
|
last = i
|
||||||
|
}
|
||||||
|
if maxFold != last {
|
||||||
|
t.Errorf("maxFold=%#U should be %#U", maxFold, last)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAppendRangeCollapse(t *testing.T) {
|
||||||
|
// appendRange should collapse each of the new ranges
|
||||||
|
// into the earlier ones (it looks back two ranges), so that
|
||||||
|
// the slice never grows very large.
|
||||||
|
// Note that we are not calling cleanClass.
|
||||||
|
var r []int
|
||||||
|
for i := 'A'; i <= 'Z'; i++ {
|
||||||
|
r = appendRange(r, i, i)
|
||||||
|
r = appendRange(r, i+'a'-'A', i+'a'-'A')
|
||||||
|
}
|
||||||
|
if string(r) != "AZaz" {
|
||||||
|
t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user