From 7e1a3e9f209d33eff36eb6876e0505a300be9ba6 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 29 Jun 2011 00:55:37 -0400 Subject: [PATCH] exp/regexp/syntax: incremental concat, alternate Also reuse of *Regexp nodes. I believe this is the end of the parser. The only non-execution code that remains is the code to expand x{3,5} into simpler operations. R=sam.thorogood, r CC=golang-dev https://golang.org/cl/4629078 --- src/pkg/exp/regexp/syntax/parse.go | 297 +++++++++++++++++++----- src/pkg/exp/regexp/syntax/parse_test.go | 241 ++++++++++--------- src/pkg/exp/regexp/syntax/regexp.go | 2 + 3 files changed, 365 insertions(+), 175 deletions(-) diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go index cbde6c60418..ae40d5fc94b 100644 --- a/src/pkg/exp/regexp/syntax/parse.go +++ b/src/pkg/exp/regexp/syntax/parse.go @@ -79,29 +79,110 @@ const ( type parser struct { flags Flags // parse mode flags stack []*Regexp // stack of parsed expressions - numCap int // number of capturing groups seen + free *Regexp + numCap int // number of capturing groups seen wholeRegexp string tmpClass []int // temporary char class work space } +func (p *parser) newRegexp(op Op) *Regexp { + re := p.free + if re != nil { + p.free = re.Sub0[0] + *re = Regexp{} + } else { + re = new(Regexp) + } + re.Op = op + return re +} + +func (p *parser) reuse(re *Regexp) { + re.Sub0[0] = p.free + p.free = re +} + // Parse stack manipulation. // push pushes the regexp re onto the parse stack and returns the regexp. func (p *parser) push(re *Regexp) *Regexp { - // TODO: automatic concatenation - // TODO: turn character class into literal // TODO: compute simple + if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { + // Single rune. + if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { + return nil + } + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags &^ FoldCase + } else if re.Op == OpCharClass && len(re.Rune) == 4 && + re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && + unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || + re.Op == OpCharClass && len(re.Rune) == 2 && + re.Rune[0]+1 == re.Rune[1] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && + unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { + // Case-insensitive rune like [Aa] or [Δδ]. + if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { + return nil + } + + // Rewrite as (case-insensitive) literal. + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags | FoldCase + } else { + // Incremental concatenation. + p.maybeConcat(-1, 0) + } + p.stack = append(p.stack, re) return re } -// newLiteral returns a new OpLiteral Regexp with the given flags -func newLiteral(r int, flags Flags) *Regexp { - re := &Regexp{ - Op: OpLiteral, - Flags: flags, +// maybeConcat implements incremental concatenation +// of literal runes into string nodes. The parser calls this +// before each push, so only the top fragment of the stack +// might need processing. Since this is called before a push, +// the topmost literal is no longer subject to operators like * +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0 and there's a node left over, maybeConcat uses it +// to push r with the given flags. +// maybeConcat reports whether r was pushed. +func (p *parser) maybeConcat(r int, flags Flags) bool { + n := len(p.stack) + if n < 2 { + return false } + + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { + return false + } + + // Push re1 into re2. + re2.Rune = append(re2.Rune, re1.Rune...) + + // Reuse re1 if possible. + if r >= 0 { + re1.Rune = re1.Rune0[:1] + re1.Rune[0] = r + re1.Flags = flags + return true + } + + p.stack = p.stack[:n-1] + p.reuse(re1) + return false // did not push r +} + +// newLiteral returns a new OpLiteral Regexp with the given flags +func (p *parser) newLiteral(r int, flags Flags) *Regexp { + re := p.newRegexp(OpLiteral) + re.Flags = flags re.Rune0[0] = r re.Rune = re.Rune0[:1] return re @@ -109,14 +190,16 @@ func newLiteral(r int, flags Flags) *Regexp { // literal pushes a literal regexp for the rune r on the stack // and returns that regexp. -func (p *parser) literal(r int) *Regexp { - return p.push(newLiteral(r, p.flags)) +func (p *parser) literal(r int) { + p.push(p.newLiteral(r, p.flags)) } // op pushes a regexp with the given op onto the stack // and returns that regexp. func (p *parser) op(op Op) *Regexp { - return p.push(&Regexp{Op: op, Flags: p.flags}) + re := p.newRegexp(op) + re.Flags = p.flags + return p.push(re) } // repeat replaces the top stack element with itself repeated @@ -140,12 +223,10 @@ func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (strin return "", &Error{ErrMissingRepeatArgument, opstr} } sub := p.stack[n-1] - re := &Regexp{ - Op: op, - Min: min, - Max: max, - Flags: flags, - } + re := p.newRegexp(op) + re.Min = min + re.Max = max + re.Flags = flags re.Sub = re.Sub0[:1] re.Sub[0] = sub p.stack[n-1] = re @@ -154,60 +235,97 @@ func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (strin // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. func (p *parser) concat() *Regexp { - // TODO: Flatten concats. + p.maybeConcat(-1, 0) // Scan down to find pseudo-operator | or (. i := len(p.stack) for i > 0 && p.stack[i-1].Op < opPseudo { i-- } - sub := p.stack[i:] + subs := p.stack[i:] p.stack = p.stack[:i] - var re *Regexp - switch len(sub) { - case 0: - re = &Regexp{Op: OpEmptyMatch} - case 1: - re = sub[0] - default: - re = &Regexp{Op: OpConcat} - re.Sub = append(re.Sub0[:0], sub...) + // Empty concatenation is special case. + if len(subs) == 0 { + return p.push(p.newRegexp(OpEmptyMatch)) } - return p.push(re) + + return p.collapse(subs, OpConcat) } // alternate replaces the top of the stack (above the topmost '(') with its alternation. func (p *parser) alternate() *Regexp { - // TODO: Flatten alternates. - // Scan down to find pseudo-operator (. // There are no | above (. i := len(p.stack) for i > 0 && p.stack[i-1].Op < opPseudo { i-- } - sub := p.stack[i:] + subs := p.stack[i:] p.stack = p.stack[:i] - var re *Regexp - switch len(sub) { - case 0: - re = &Regexp{Op: OpNoMatch} - case 1: - re = sub[0] - default: - re = &Regexp{Op: OpAlternate} - re.Sub = append(re.Sub0[:0], sub...) + // Make sure top class is clean. + // All the others already are (see swapVerticalBar). + if len(subs) > 0 { + cleanAlt(subs[len(subs)-1]) + } + + // Empty alternate is special case + // (shouldn't happen but easy to handle). + if len(subs) == 0 { + return p.push(p.newRegexp(OpNoMatch)) + } + + return p.collapse(subs, OpAlternate) +} + +// cleanAlt cleans re for eventual inclusion in an alternation. +func cleanAlt(re *Regexp) { + switch re.Op { + case OpCharClass: + re.Rune = cleanClass(&re.Rune) + if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyChar + return + } + if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyCharNotNL + return + } + if cap(re.Rune)-len(re.Rune) > 100 { + // re.Rune will not grow any more. + // Make a copy or inline to reclaim storage. + re.Rune = append(re.Rune0[:0], re.Rune...) + } + } +} + +// collapse pushes the result of applying op to sub +// onto the stack. If sub contains op nodes, they all +// get flattened into a single node. +// sub points into p.stack so it cannot be kept. +func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { + if len(subs) == 1 { + return p.push(subs[0]) + } + re := p.newRegexp(op) + re.Sub = re.Sub0[:0] + for _, sub := range subs { + if sub.Op == op { + re.Sub = append(re.Sub, sub.Sub...) + p.reuse(sub) + } else { + re.Sub = append(re.Sub, sub) + } } return p.push(re) } func literalRegexp(s string, flags Flags) *Regexp { - re := &Regexp{ - Op: OpLiteral, - Flags: flags, - } + re := &Regexp{Op: OpLiteral} + re.Flags = flags re.Rune = re.Rune0[:0] // use local storage for small strings for _, c := range s { if len(re.Rune) >= cap(re.Rune) { @@ -265,7 +383,6 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) { p.op(opLeftParen).Cap = p.numCap t = t[1:] case '|': - p.concat() if err = p.parseVerticalBar(); err != nil { return nil, err } @@ -361,7 +478,8 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) { } } - re := &Regexp{Op: OpCharClass, Flags: p.flags} + re := p.newRegexp(OpCharClass) + re.Flags = p.flags // Look for Unicode character group like \p{Han} if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { @@ -381,12 +499,10 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) { if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { re.Rune = r t = rest - // TODO: Handle FoldCase flag. p.push(re) break BigSwitch } - - // TODO: Give re back to parser's pool. + p.reuse(re) // Ordinary single-character escape. if c, t, err = p.parseEscape(t); err != nil { @@ -592,6 +708,35 @@ func (p *parser) parseInt(s string) (n int, rest string, ok bool) { return } +// can this be represented as a character class? +// single-rune literal string, char class, ., and .|\n. +func isCharClass(re *Regexp) bool { + return re.Op == OpLiteral && len(re.Rune) == 1 || + re.Op == OpCharClass || + re.Op == OpAnyCharNotNL || + re.Op == OpAnyChar +} + +// does re match r? +func matchRune(re *Regexp, r int) bool { + switch re.Op { + case OpLiteral: + return len(re.Rune) == 1 && re.Rune[0] == r + case OpCharClass: + for i := 0; i < len(re.Rune); i += 2 { + if re.Rune[i] <= r && r <= re.Rune[i+1] { + return true + } + } + return false + case OpAnyCharNotNL: + return r != '\n' + case OpAnyChar: + return true + } + return false +} + // parseVerticalBar handles a | in the input. func (p *parser) parseVerticalBar() os.Error { p.concat() @@ -611,10 +756,55 @@ func (p *parser) parseVerticalBar() os.Error { // swapVerticalBar swaps the two and returns true. // Otherwise it returns false. func (p *parser) swapVerticalBar() bool { - if n := len(p.stack); n >= 2 { + // If above and below vertical bar are literal or char class, + // can merge into a single char class. + n := len(p.stack) + if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { + re1 := p.stack[n-1] + re3 := p.stack[n-3] + // Make re3 the more complex of the two. + if re1.Op > re3.Op { + re1, re3 = re3, re1 + p.stack[n-3] = re3 + } + switch re3.Op { + case OpAnyChar: + // re1 doesn't add anything. + case OpAnyCharNotNL: + // re1 might add \n + if matchRune(re1, '\n') { + re3.Op = OpAnyChar + } + case OpCharClass: + // re1 is simpler, so either literal or char class + if re1.Op == OpLiteral { + re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0]) + } else { + re3.Rune = appendClass(re3.Rune, re1.Rune) + } + case OpLiteral: + // both literal + if re1.Rune[0] == re3.Rune[0] { + break + } + re3.Op = OpCharClass + re3.Rune = append(re3.Rune, re3.Rune[0]) + re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0]) + } + p.reuse(re1) + p.stack = p.stack[:n-1] + return true + } + + if n >= 2 { re1 := p.stack[n-1] re2 := p.stack[n-2] if re2.Op == opVerticalBar { + if n >= 3 { + // Now out of reach. + // Clean opportunistically. + cleanAlt(p.stack[n-3]) + } p.stack[n-2] = re1 p.stack[n-1] = re2 return true @@ -937,7 +1127,8 @@ func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, e // and pushes it onto the parse stack. func (p *parser) parseClass(s string) (rest string, err os.Error) { t := s[1:] // chop [ - re := &Regexp{Op: OpCharClass, Flags: p.flags} + re := p.newRegexp(OpCharClass) + re.Flags = p.flags re.Rune = re.Rune0[:0] sign := +1 @@ -1017,8 +1208,6 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) { } t = t[1:] // chop ] - // TODO: Handle FoldCase flag. - // Use &re.Rune instead of &class to avoid allocation. re.Rune = class class = cleanClass(&re.Rune) diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go index 49380697945..51856b613eb 100644 --- a/src/pkg/exp/regexp/syntax/parse_test.go +++ b/src/pkg/exp/regexp/syntax/parse_test.go @@ -16,148 +16,146 @@ var parseTests = []struct { Dump string }{ // Base cases - {"a", "lit{a}"}, - {"a.", "cat{lit{a}dot{}}"}, - {"a.b", "cat{lit{a}dot{}lit{b}}"}, - // { "ab", "str{ab}" }, - {"ab", "cat{lit{a}lit{b}}"}, - {"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"}, - // { "abc", "str{abc}" }, - {"abc", "cat{lit{a}lit{b}lit{c}}"}, - {"a|^", "alt{lit{a}bol{}}"}, - // { "a|b", "cc{0x61-0x62}" }, - {"a|b", "alt{lit{a}lit{b}}"}, - {"(a)", "cap{lit{a}}"}, - {"(a)|b", "alt{cap{lit{a}}lit{b}}"}, - {"a*", "star{lit{a}}"}, - {"a+", "plus{lit{a}}"}, - {"a?", "que{lit{a}}"}, - {"a{2}", "rep{2,2 lit{a}}"}, - {"a{2,3}", "rep{2,3 lit{a}}"}, - {"a{2,}", "rep{2,-1 lit{a}}"}, - {"a*?", "nstar{lit{a}}"}, - {"a+?", "nplus{lit{a}}"}, - {"a??", "nque{lit{a}}"}, - {"a{2}?", "nrep{2,2 lit{a}}"}, - {"a{2,3}?", "nrep{2,3 lit{a}}"}, - {"a{2,}?", "nrep{2,-1 lit{a}}"}, - {"", "emp{}"}, - // { "|", "emp{}" }, // alt{emp{}emp{}} but got factored - {"|", "alt{emp{}emp{}}"}, - {"|x|", "alt{emp{}lit{x}emp{}}"}, - {".", "dot{}"}, - {"^", "bol{}"}, - {"$", "eol{}"}, - {"\\|", "lit{|}"}, - {"\\(", "lit{(}"}, - {"\\)", "lit{)}"}, - {"\\*", "lit{*}"}, - {"\\+", "lit{+}"}, - {"\\?", "lit{?}"}, - {"{", "lit{{}"}, - {"}", "lit{}}"}, - {"\\.", "lit{.}"}, - {"\\^", "lit{^}"}, - {"\\$", "lit{$}"}, - {"\\\\", "lit{\\}"}, - {"[ace]", "cc{0x61 0x63 0x65}"}, - {"[abc]", "cc{0x61-0x63}"}, - {"[a-z]", "cc{0x61-0x7a}"}, - // { "[a]", "lit{a}" }, - {"[a]", "cc{0x61}"}, - {"\\-", "lit{-}"}, - {"-", "lit{-}"}, - {"\\_", "lit{_}"}, + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + {``, `emp{}`}, + // { `|`, `emp{}` }, // alt{emp{}emp{}} but got factored + {`|`, `alt{emp{}emp{}}`}, + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, // Posix and Perl extensions - {"[[:lower:]]", "cc{0x61-0x7a}"}, - {"[a-z]", "cc{0x61-0x7a}"}, - {"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"}, - {"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"}, - {"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"}, - {"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"}, - {"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"}, - {"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"}, - {"\\d", "cc{0x30-0x39}"}, - {"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"}, - {"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"}, - {"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"}, - {"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"}, - {"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"}, - {"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"}, - {"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"}, - {"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"}, - // { "\\C", "byte{}" }, + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never // Unicode, negatives, and a double negative. - {"\\p{Braille}", "cc{0x2800-0x28ff}"}, - {"\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"}, - {"\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"}, - {"\\P{^Braille}", "cc{0x2800-0x28ff}"}, - {"\\pZ", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"}, - {"[\\p{Braille}]", "cc{0x2800-0x28ff}"}, - {"[\\P{Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"}, - {"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"}, - {"[\\P{^Braille}]", "cc{0x2800-0x28ff}"}, - {"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"}, - {"\\p{Lu}", mkCharClass(unicode.IsUpper)}, - {"[\\p{Lu}]", mkCharClass(unicode.IsUpper)}, - {"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)}, + {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, // Hex, octal. - {"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"}, - {"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"}, + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, // More interesting regular expressions. - // { "a{,2}", "str{a{,2}}" }, - // { "\\.\\^\\$\\\\", "str{.^$\\}" }, - {"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"}, - {"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"}, - {"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8 - {"a*{", "cat{star{lit{a}}lit{{}}"}, + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, // Test precedences - // { "(?:ab)*", "star{str{ab}}" }, - // { "(ab)*", "star{cap{str{ab}}}" }, - // { "ab|cd", "alt{str{ab}str{cd}}" }, - // { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, - {"(?:ab)*", "star{cat{lit{a}lit{b}}}"}, - {"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"}, - {"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"}, - {"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"}, + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, // Test flattening. - {"(?:a)", "lit{a}"}, - // { "(?:ab)(?:cd)", "str{abcd}" }, - // { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, - // { "a|.", "dot{}" }, - // { ".|a", "dot{}" }, + {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, // Test Perl quoted literals - {"\\Q+|*?{[\\E", "str{+|*?{[}"}, - {"\\Q+\\E+", "plus{lit{+}}"}, - {"\\Q\\\\E", "lit{\\}"}, - {"\\Q\\\\\\E", "str{\\\\}"}, + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, // Test Perl \A and \z - {"(?m)^", "bol{}"}, - {"(?m)$", "eol{}"}, - {"(?-m)^", "bot{}"}, - {"(?-m)$", "eot{}"}, - {"(?m)\\A", "bot{}"}, - {"(?m)\\z", "eot{\\z}"}, - {"(?-m)\\A", "bot{}"}, - {"(?-m)\\z", "eot{\\z}"}, + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, // Test named captures - {"(?Pa)", "cap{name:lit{a}}"}, + {`(?Pa)`, `cap{name:lit{a}}`}, // Case-folded literals - // { "[Aa]", "litfold{a}" }, + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, // Strings - // { "abcde", "str{abcde}" }, - // { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, } const testFlags = MatchNL | PerlX | UnicodeGroups @@ -230,8 +228,9 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) { } if re.Flags&FoldCase != 0 { for _, r := range re.Rune { - if unicode.ToUpper(r) != r { + if unicode.SimpleFold(r) != r { b.WriteString("fold") + break } } } diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go index a0c465967ff..248ace503c8 100644 --- a/src/pkg/exp/regexp/syntax/regexp.go +++ b/src/pkg/exp/regexp/syntax/regexp.go @@ -33,6 +33,8 @@ type Regexp struct { type Op uint8 // Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). const ( OpNoMatch Op = 1 + iota // matches no strings