|  | // Copyright 2011 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | package syntax | 
|  |  | 
|  | import ( | 
|  | "sort" | 
|  | "strings" | 
|  | "unicode" | 
|  | "unicode/utf8" | 
|  | ) | 
|  |  | 
|  | // An Error describes a failure to parse a regular expression | 
|  | // and gives the offending expression. | 
|  | type Error struct { | 
|  | Code ErrorCode | 
|  | Expr string | 
|  | } | 
|  |  | 
|  | func (e *Error) Error() string { | 
|  | return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" | 
|  | } | 
|  |  | 
|  | // An ErrorCode describes a failure to parse a regular expression. | 
|  | type ErrorCode string | 
|  |  | 
|  | const ( | 
|  | // Unexpected error | 
|  | ErrInternalError ErrorCode = "regexp/syntax: internal error" | 
|  |  | 
|  | // Parse errors | 
|  | ErrInvalidCharClass      ErrorCode = "invalid character class" | 
|  | ErrInvalidCharRange      ErrorCode = "invalid character class range" | 
|  | ErrInvalidEscape         ErrorCode = "invalid escape sequence" | 
|  | ErrInvalidNamedCapture   ErrorCode = "invalid named capture" | 
|  | ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax" | 
|  | ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator" | 
|  | ErrInvalidRepeatSize     ErrorCode = "invalid repeat count" | 
|  | ErrInvalidUTF8           ErrorCode = "invalid UTF-8" | 
|  | ErrMissingBracket        ErrorCode = "missing closing ]" | 
|  | ErrMissingParen          ErrorCode = "missing closing )" | 
|  | ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" | 
|  | ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression" | 
|  | ErrUnexpectedParen       ErrorCode = "unexpected )" | 
|  | ErrNestingDepth          ErrorCode = "expression nests too deeply" | 
|  | ErrLarge                 ErrorCode = "expression too large" | 
|  | ) | 
|  |  | 
|  | func (e ErrorCode) String() string { | 
|  | return string(e) | 
|  | } | 
|  |  | 
|  | // Flags control the behavior of the parser and record information about regexp context. | 
|  | type Flags uint16 | 
|  |  | 
|  | const ( | 
|  | FoldCase      Flags = 1 << iota // case-insensitive match | 
|  | Literal                         // treat pattern as literal string | 
|  | ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline | 
|  | DotNL                           // allow . to match newline | 
|  | OneLine                         // treat ^ and $ as only matching at beginning and end of text | 
|  | NonGreedy                       // make repetition operators default to non-greedy | 
|  | PerlX                           // allow Perl extensions | 
|  | UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation | 
|  | WasDollar                       // regexp OpEndText was $, not \z | 
|  | Simple                          // regexp contains no counted repetition | 
|  |  | 
|  | MatchNL = ClassNL | DotNL | 
|  |  | 
|  | Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible | 
|  | POSIX Flags = 0                                         // POSIX syntax | 
|  | ) | 
|  |  | 
|  | // Pseudo-ops for parsing stack. | 
|  | const ( | 
|  | opLeftParen = opPseudo + iota | 
|  | opVerticalBar | 
|  | ) | 
|  |  | 
|  | // maxHeight is the maximum height of a regexp parse tree. | 
|  | // It is somewhat arbitrarily chosen, but the idea is to be large enough | 
|  | // that no one will actually hit in real use but at the same time small enough | 
|  | // that recursion on the Regexp tree will not hit the 1GB Go stack limit. | 
|  | // The maximum amount of stack for a single recursive frame is probably | 
|  | // closer to 1kB, so this could potentially be raised, but it seems unlikely | 
|  | // that people have regexps nested even this deeply. | 
|  | // We ran a test on Google's C++ code base and turned up only | 
|  | // a single use case with depth > 100; it had depth 128. | 
|  | // Using depth 1000 should be plenty of margin. | 
|  | // As an optimization, we don't even bother calculating heights | 
|  | // until we've allocated at least maxHeight Regexp structures. | 
|  | const maxHeight = 1000 | 
|  |  | 
|  | // maxSize is the maximum size of a compiled regexp in Insts. | 
|  | // It too is somewhat arbitrarily chosen, but the idea is to be large enough | 
|  | // to allow significant regexps while at the same time small enough that | 
|  | // the compiled form will not take up too much memory. | 
|  | // 128 MB is enough for a 3.3 million Inst structures, which roughly | 
|  | // corresponds to a 3.3 MB regexp. | 
|  | const ( | 
|  | maxSize  = 128 << 20 / instSize | 
|  | instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words | 
|  | ) | 
|  |  | 
|  | // maxRunes is the maximum number of runes allowed in a regexp tree | 
|  | // counting the runes in all the nodes. | 
|  | // Ignoring character classes p.numRunes is always less than the length of the regexp. | 
|  | // Character classes can make it much larger: each \pL adds 1292 runes. | 
|  | // 128 MB is enough for 32M runes, which is over 26k \pL instances. | 
|  | // Note that repetitions do not make copies of the rune slices, | 
|  | // so \pL{1000} is only one rune slice, not 1000. | 
|  | // We could keep a cache of character classes we've seen, | 
|  | // so that all the \pL we see use the same rune list, | 
|  | // but that doesn't remove the problem entirely: | 
|  | // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. | 
|  | // And because the Rune slice is exposed directly in the Regexp, | 
|  | // there is not an opportunity to change the representation to allow | 
|  | // partial sharing between different character classes. | 
|  | // So the limit is the best we can do. | 
|  | const ( | 
|  | maxRunes = 128 << 20 / runeSize | 
|  | runeSize = 4 // rune is int32 | 
|  | ) | 
|  |  | 
|  | type parser struct { | 
|  | flags       Flags     // parse mode flags | 
|  | stack       []*Regexp // stack of parsed expressions | 
|  | free        *Regexp | 
|  | numCap      int // number of capturing groups seen | 
|  | wholeRegexp string | 
|  | tmpClass    []rune            // temporary char class work space | 
|  | numRegexp   int               // number of regexps allocated | 
|  | numRunes    int               // number of runes in char classes | 
|  | repeats     int64             // product of all repetitions seen | 
|  | height      map[*Regexp]int   // regexp height, for height limit check | 
|  | size        map[*Regexp]int64 // regexp compiled size, for size limit check | 
|  | } | 
|  |  | 
|  | func (p *parser) newRegexp(op Op) *Regexp { | 
|  | re := p.free | 
|  | if re != nil { | 
|  | p.free = re.Sub0[0] | 
|  | *re = Regexp{} | 
|  | } else { | 
|  | re = new(Regexp) | 
|  | p.numRegexp++ | 
|  | } | 
|  | re.Op = op | 
|  | return re | 
|  | } | 
|  |  | 
|  | func (p *parser) reuse(re *Regexp) { | 
|  | if p.height != nil { | 
|  | delete(p.height, re) | 
|  | } | 
|  | re.Sub0[0] = p.free | 
|  | p.free = re | 
|  | } | 
|  |  | 
|  | func (p *parser) checkLimits(re *Regexp) { | 
|  | if p.numRunes > maxRunes { | 
|  | panic(ErrLarge) | 
|  | } | 
|  | p.checkSize(re) | 
|  | p.checkHeight(re) | 
|  | } | 
|  |  | 
|  | func (p *parser) checkSize(re *Regexp) { | 
|  | if p.size == nil { | 
|  | // We haven't started tracking size yet. | 
|  | // Do a relatively cheap check to see if we need to start. | 
|  | // Maintain the product of all the repeats we've seen | 
|  | // and don't track if the total number of regexp nodes | 
|  | // we've seen times the repeat product is in budget. | 
|  | if p.repeats == 0 { | 
|  | p.repeats = 1 | 
|  | } | 
|  | if re.Op == OpRepeat { | 
|  | n := re.Max | 
|  | if n == -1 { | 
|  | n = re.Min | 
|  | } | 
|  | if n <= 0 { | 
|  | n = 1 | 
|  | } | 
|  | if int64(n) > maxSize/p.repeats { | 
|  | p.repeats = maxSize | 
|  | } else { | 
|  | p.repeats *= int64(n) | 
|  | } | 
|  | } | 
|  | if int64(p.numRegexp) < maxSize/p.repeats { | 
|  | return | 
|  | } | 
|  |  | 
|  | // We need to start tracking size. | 
|  | // Make the map and belatedly populate it | 
|  | // with info about everything we've constructed so far. | 
|  | p.size = make(map[*Regexp]int64) | 
|  | for _, re := range p.stack { | 
|  | p.checkSize(re) | 
|  | } | 
|  | } | 
|  |  | 
|  | if p.calcSize(re, true) > maxSize { | 
|  | panic(ErrLarge) | 
|  | } | 
|  | } | 
|  |  | 
|  | func (p *parser) calcSize(re *Regexp, force bool) int64 { | 
|  | if !force { | 
|  | if size, ok := p.size[re]; ok { | 
|  | return size | 
|  | } | 
|  | } | 
|  |  | 
|  | var size int64 | 
|  | switch re.Op { | 
|  | case OpLiteral: | 
|  | size = int64(len(re.Rune)) | 
|  | case OpCapture, OpStar: | 
|  | // star can be 1+ or 2+; assume 2 pessimistically | 
|  | size = 2 + p.calcSize(re.Sub[0], false) | 
|  | case OpPlus, OpQuest: | 
|  | size = 1 + p.calcSize(re.Sub[0], false) | 
|  | case OpConcat: | 
|  | for _, sub := range re.Sub { | 
|  | size += p.calcSize(sub, false) | 
|  | } | 
|  | case OpAlternate: | 
|  | for _, sub := range re.Sub { | 
|  | size += p.calcSize(sub, false) | 
|  | } | 
|  | if len(re.Sub) > 1 { | 
|  | size += int64(len(re.Sub)) - 1 | 
|  | } | 
|  | case OpRepeat: | 
|  | sub := p.calcSize(re.Sub[0], false) | 
|  | if re.Max == -1 { | 
|  | if re.Min == 0 { | 
|  | size = 2 + sub // x* | 
|  | } else { | 
|  | size = 1 + int64(re.Min)*sub // xxx+ | 
|  | } | 
|  | break | 
|  | } | 
|  | // x{2,5} = xx(x(x(x)?)?)? | 
|  | size = int64(re.Max)*sub + int64(re.Max-re.Min) | 
|  | } | 
|  |  | 
|  | if size < 1 { | 
|  | size = 1 | 
|  | } | 
|  | p.size[re] = size | 
|  | return size | 
|  | } | 
|  |  | 
|  | func (p *parser) checkHeight(re *Regexp) { | 
|  | if p.numRegexp < maxHeight { | 
|  | return | 
|  | } | 
|  | if p.height == nil { | 
|  | p.height = make(map[*Regexp]int) | 
|  | for _, re := range p.stack { | 
|  | p.checkHeight(re) | 
|  | } | 
|  | } | 
|  | if p.calcHeight(re, true) > maxHeight { | 
|  | panic(ErrNestingDepth) | 
|  | } | 
|  | } | 
|  |  | 
|  | func (p *parser) calcHeight(re *Regexp, force bool) int { | 
|  | if !force { | 
|  | if h, ok := p.height[re]; ok { | 
|  | return h | 
|  | } | 
|  | } | 
|  | h := 1 | 
|  | for _, sub := range re.Sub { | 
|  | hsub := p.calcHeight(sub, false) | 
|  | if h < 1+hsub { | 
|  | h = 1 + hsub | 
|  | } | 
|  | } | 
|  | p.height[re] = h | 
|  | return h | 
|  | } | 
|  |  | 
|  | // Parse stack manipulation. | 
|  |  | 
|  | // push pushes the regexp re onto the parse stack and returns the regexp. | 
|  | func (p *parser) push(re *Regexp) *Regexp { | 
|  | p.numRunes += len(re.Rune) | 
|  | if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { | 
|  | // Single rune. | 
|  | if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { | 
|  | return nil | 
|  | } | 
|  | re.Op = OpLiteral | 
|  | re.Rune = re.Rune[:1] | 
|  | re.Flags = p.flags &^ FoldCase | 
|  | } else if re.Op == OpCharClass && len(re.Rune) == 4 && | 
|  | re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && | 
|  | unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && | 
|  | unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || | 
|  | re.Op == OpCharClass && len(re.Rune) == 2 && | 
|  | re.Rune[0]+1 == re.Rune[1] && | 
|  | unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && | 
|  | unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { | 
|  | // Case-insensitive rune like [Aa] or [Δδ]. | 
|  | if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { | 
|  | return nil | 
|  | } | 
|  |  | 
|  | // Rewrite as (case-insensitive) literal. | 
|  | re.Op = OpLiteral | 
|  | re.Rune = re.Rune[:1] | 
|  | re.Flags = p.flags | FoldCase | 
|  | } else { | 
|  | // Incremental concatenation. | 
|  | p.maybeConcat(-1, 0) | 
|  | } | 
|  |  | 
|  | p.stack = append(p.stack, re) | 
|  | p.checkLimits(re) | 
|  | return re | 
|  | } | 
|  |  | 
|  | // maybeConcat implements incremental concatenation | 
|  | // of literal runes into string nodes. The parser calls this | 
|  | // before each push, so only the top fragment of the stack | 
|  | // might need processing. Since this is called before a push, | 
|  | // the topmost literal is no longer subject to operators like * | 
|  | // (Otherwise ab* would turn into (ab)*.) | 
|  | // If r >= 0 and there's a node left over, maybeConcat uses it | 
|  | // to push r with the given flags. | 
|  | // maybeConcat reports whether r was pushed. | 
|  | func (p *parser) maybeConcat(r rune, flags Flags) bool { | 
|  | n := len(p.stack) | 
|  | if n < 2 { | 
|  | return false | 
|  | } | 
|  |  | 
|  | re1 := p.stack[n-1] | 
|  | re2 := p.stack[n-2] | 
|  | if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { | 
|  | return false | 
|  | } | 
|  |  | 
|  | // Push re1 into re2. | 
|  | re2.Rune = append(re2.Rune, re1.Rune...) | 
|  |  | 
|  | // Reuse re1 if possible. | 
|  | if r >= 0 { | 
|  | re1.Rune = re1.Rune0[:1] | 
|  | re1.Rune[0] = r | 
|  | re1.Flags = flags | 
|  | return true | 
|  | } | 
|  |  | 
|  | p.stack = p.stack[:n-1] | 
|  | p.reuse(re1) | 
|  | return false // did not push r | 
|  | } | 
|  |  | 
|  | // literal pushes a literal regexp for the rune r on the stack. | 
|  | func (p *parser) literal(r rune) { | 
|  | re := p.newRegexp(OpLiteral) | 
|  | re.Flags = p.flags | 
|  | if p.flags&FoldCase != 0 { | 
|  | r = minFoldRune(r) | 
|  | } | 
|  | re.Rune0[0] = r | 
|  | re.Rune = re.Rune0[:1] | 
|  | p.push(re) | 
|  | } | 
|  |  | 
|  | // minFoldRune returns the minimum rune fold-equivalent to r. | 
|  | func minFoldRune(r rune) rune { | 
|  | if r < minFold || r > maxFold { | 
|  | return r | 
|  | } | 
|  | m := r | 
|  | r0 := r | 
|  | for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { | 
|  | m = min(m, r) | 
|  | } | 
|  | return m | 
|  | } | 
|  |  | 
|  | // op pushes a regexp with the given op onto the stack | 
|  | // and returns that regexp. | 
|  | func (p *parser) op(op Op) *Regexp { | 
|  | re := p.newRegexp(op) | 
|  | re.Flags = p.flags | 
|  | return p.push(re) | 
|  | } | 
|  |  | 
|  | // repeat replaces the top stack element with itself repeated according to op, min, max. | 
|  | // before is the regexp suffix starting at the repetition operator. | 
|  | // after is the regexp suffix following after the repetition operator. | 
|  | // repeat returns an updated 'after' and an error, if any. | 
|  | func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { | 
|  | flags := p.flags | 
|  | if p.flags&PerlX != 0 { | 
|  | if len(after) > 0 && after[0] == '?' { | 
|  | after = after[1:] | 
|  | flags ^= NonGreedy | 
|  | } | 
|  | if lastRepeat != "" { | 
|  | // In Perl it is not allowed to stack repetition operators: | 
|  | // a** is a syntax error, not a doubled star, and a++ means | 
|  | // something else entirely, which we don't support! | 
|  | return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} | 
|  | } | 
|  | } | 
|  | n := len(p.stack) | 
|  | if n == 0 { | 
|  | return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} | 
|  | } | 
|  | sub := p.stack[n-1] | 
|  | if sub.Op >= opPseudo { | 
|  | return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} | 
|  | } | 
|  |  | 
|  | re := p.newRegexp(op) | 
|  | re.Min = min | 
|  | re.Max = max | 
|  | re.Flags = flags | 
|  | re.Sub = re.Sub0[:1] | 
|  | re.Sub[0] = sub | 
|  | p.stack[n-1] = re | 
|  | p.checkLimits(re) | 
|  |  | 
|  | if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { | 
|  | return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} | 
|  | } | 
|  |  | 
|  | return after, nil | 
|  | } | 
|  |  | 
|  | // repeatIsValid reports whether the repetition re is valid. | 
|  | // Valid means that the combination of the top-level repetition | 
|  | // and any inner repetitions does not exceed n copies of the | 
|  | // innermost thing. | 
|  | // This function rewalks the regexp tree and is called for every repetition, | 
|  | // so we have to worry about inducing quadratic behavior in the parser. | 
|  | // We avoid this by only calling repeatIsValid when min or max >= 2. | 
|  | // In that case the depth of any >= 2 nesting can only get to 9 without | 
|  | // triggering a parse error, so each subtree can only be rewalked 9 times. | 
|  | func repeatIsValid(re *Regexp, n int) bool { | 
|  | if re.Op == OpRepeat { | 
|  | m := re.Max | 
|  | if m == 0 { | 
|  | return true | 
|  | } | 
|  | if m < 0 { | 
|  | m = re.Min | 
|  | } | 
|  | if m > n { | 
|  | return false | 
|  | } | 
|  | if m > 0 { | 
|  | n /= m | 
|  | } | 
|  | } | 
|  | for _, sub := range re.Sub { | 
|  | if !repeatIsValid(sub, n) { | 
|  | return false | 
|  | } | 
|  | } | 
|  | return true | 
|  | } | 
|  |  | 
|  | // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. | 
|  | func (p *parser) concat() *Regexp { | 
|  | p.maybeConcat(-1, 0) | 
|  |  | 
|  | // Scan down to find pseudo-operator | or (. | 
|  | i := len(p.stack) | 
|  | for i > 0 && p.stack[i-1].Op < opPseudo { | 
|  | i-- | 
|  | } | 
|  | subs := p.stack[i:] | 
|  | p.stack = p.stack[:i] | 
|  |  | 
|  | // Empty concatenation is special case. | 
|  | if len(subs) == 0 { | 
|  | return p.push(p.newRegexp(OpEmptyMatch)) | 
|  | } | 
|  |  | 
|  | return p.push(p.collapse(subs, OpConcat)) | 
|  | } | 
|  |  | 
|  | // alternate replaces the top of the stack (above the topmost '(') with its alternation. | 
|  | func (p *parser) alternate() *Regexp { | 
|  | // Scan down to find pseudo-operator (. | 
|  | // There are no | above (. | 
|  | i := len(p.stack) | 
|  | for i > 0 && p.stack[i-1].Op < opPseudo { | 
|  | i-- | 
|  | } | 
|  | subs := p.stack[i:] | 
|  | p.stack = p.stack[:i] | 
|  |  | 
|  | // Make sure top class is clean. | 
|  | // All the others already are (see swapVerticalBar). | 
|  | if len(subs) > 0 { | 
|  | cleanAlt(subs[len(subs)-1]) | 
|  | } | 
|  |  | 
|  | // Empty alternate is special case | 
|  | // (shouldn't happen but easy to handle). | 
|  | if len(subs) == 0 { | 
|  | return p.push(p.newRegexp(OpNoMatch)) | 
|  | } | 
|  |  | 
|  | return p.push(p.collapse(subs, OpAlternate)) | 
|  | } | 
|  |  | 
|  | // cleanAlt cleans re for eventual inclusion in an alternation. | 
|  | func cleanAlt(re *Regexp) { | 
|  | switch re.Op { | 
|  | case OpCharClass: | 
|  | re.Rune = cleanClass(&re.Rune) | 
|  | if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { | 
|  | re.Rune = nil | 
|  | re.Op = OpAnyChar | 
|  | return | 
|  | } | 
|  | if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { | 
|  | re.Rune = nil | 
|  | re.Op = OpAnyCharNotNL | 
|  | return | 
|  | } | 
|  | if cap(re.Rune)-len(re.Rune) > 100 { | 
|  | // re.Rune will not grow any more. | 
|  | // Make a copy or inline to reclaim storage. | 
|  | re.Rune = append(re.Rune0[:0], re.Rune...) | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // collapse returns the result of applying op to sub. | 
|  | // If sub contains op nodes, they all get hoisted up | 
|  | // so that there is never a concat of a concat or an | 
|  | // alternate of an alternate. | 
|  | func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { | 
|  | if len(subs) == 1 { | 
|  | return subs[0] | 
|  | } | 
|  | re := p.newRegexp(op) | 
|  | re.Sub = re.Sub0[:0] | 
|  | for _, sub := range subs { | 
|  | if sub.Op == op { | 
|  | re.Sub = append(re.Sub, sub.Sub...) | 
|  | p.reuse(sub) | 
|  | } else { | 
|  | re.Sub = append(re.Sub, sub) | 
|  | } | 
|  | } | 
|  | if op == OpAlternate { | 
|  | re.Sub = p.factor(re.Sub) | 
|  | if len(re.Sub) == 1 { | 
|  | old := re | 
|  | re = re.Sub[0] | 
|  | p.reuse(old) | 
|  | } | 
|  | } | 
|  | return re | 
|  | } | 
|  |  | 
|  | // factor factors common prefixes from the alternation list sub. | 
|  | // It returns a replacement list that reuses the same storage and | 
|  | // frees (passes to p.reuse) any removed *Regexps. | 
|  | // | 
|  | // For example, | 
|  | // | 
|  | //	ABC|ABD|AEF|BCX|BCY | 
|  | // | 
|  | // simplifies by literal prefix extraction to | 
|  | // | 
|  | //	A(B(C|D)|EF)|BC(X|Y) | 
|  | // | 
|  | // which simplifies by character class introduction to | 
|  | // | 
|  | //	A(B[CD]|EF)|BC[XY] | 
|  | func (p *parser) factor(sub []*Regexp) []*Regexp { | 
|  | if len(sub) < 2 { | 
|  | return sub | 
|  | } | 
|  |  | 
|  | // Round 1: Factor out common literal prefixes. | 
|  | var str []rune | 
|  | var strflags Flags | 
|  | start := 0 | 
|  | out := sub[:0] | 
|  | for i := 0; i <= len(sub); i++ { | 
|  | // Invariant: the Regexps that were in sub[0:start] have been | 
|  | // used or marked for reuse, and the slice space has been reused | 
|  | // for out (len(out) <= start). | 
|  | // | 
|  | // Invariant: sub[start:i] consists of regexps that all begin | 
|  | // with str as modified by strflags. | 
|  | var istr []rune | 
|  | var iflags Flags | 
|  | if i < len(sub) { | 
|  | istr, iflags = p.leadingString(sub[i]) | 
|  | if iflags == strflags { | 
|  | same := 0 | 
|  | for same < len(str) && same < len(istr) && str[same] == istr[same] { | 
|  | same++ | 
|  | } | 
|  | if same > 0 { | 
|  | // Matches at least one rune in current range. | 
|  | // Keep going around. | 
|  | str = str[:same] | 
|  | continue | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Found end of a run with common leading literal string: | 
|  | // sub[start:i] all begin with str[0:len(str)], but sub[i] | 
|  | // does not even begin with str[0]. | 
|  | // | 
|  | // Factor out common string and append factored expression to out. | 
|  | if i == start { | 
|  | // Nothing to do - run of length 0. | 
|  | } else if i == start+1 { | 
|  | // Just one: don't bother factoring. | 
|  | out = append(out, sub[start]) | 
|  | } else { | 
|  | // Construct factored form: prefix(suffix1|suffix2|...) | 
|  | prefix := p.newRegexp(OpLiteral) | 
|  | prefix.Flags = strflags | 
|  | prefix.Rune = append(prefix.Rune[:0], str...) | 
|  |  | 
|  | for j := start; j < i; j++ { | 
|  | sub[j] = p.removeLeadingString(sub[j], len(str)) | 
|  | p.checkLimits(sub[j]) | 
|  | } | 
|  | suffix := p.collapse(sub[start:i], OpAlternate) // recurse | 
|  |  | 
|  | re := p.newRegexp(OpConcat) | 
|  | re.Sub = append(re.Sub[:0], prefix, suffix) | 
|  | out = append(out, re) | 
|  | } | 
|  |  | 
|  | // Prepare for next iteration. | 
|  | start = i | 
|  | str = istr | 
|  | strflags = iflags | 
|  | } | 
|  | sub = out | 
|  |  | 
|  | // Round 2: Factor out common simple prefixes, | 
|  | // just the first piece of each concatenation. | 
|  | // This will be good enough a lot of the time. | 
|  | // | 
|  | // Complex subexpressions (e.g. involving quantifiers) | 
|  | // are not safe to factor because that collapses their | 
|  | // distinct paths through the automaton, which affects | 
|  | // correctness in some cases. | 
|  | start = 0 | 
|  | out = sub[:0] | 
|  | var first *Regexp | 
|  | for i := 0; i <= len(sub); i++ { | 
|  | // Invariant: the Regexps that were in sub[0:start] have been | 
|  | // used or marked for reuse, and the slice space has been reused | 
|  | // for out (len(out) <= start). | 
|  | // | 
|  | // Invariant: sub[start:i] consists of regexps that all begin with ifirst. | 
|  | var ifirst *Regexp | 
|  | if i < len(sub) { | 
|  | ifirst = p.leadingRegexp(sub[i]) | 
|  | if first != nil && first.Equal(ifirst) && | 
|  | // first must be a character class OR a fixed repeat of a character class. | 
|  | (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { | 
|  | continue | 
|  | } | 
|  | } | 
|  |  | 
|  | // Found end of a run with common leading regexp: | 
|  | // sub[start:i] all begin with first but sub[i] does not. | 
|  | // | 
|  | // Factor out common regexp and append factored expression to out. | 
|  | if i == start { | 
|  | // Nothing to do - run of length 0. | 
|  | } else if i == start+1 { | 
|  | // Just one: don't bother factoring. | 
|  | out = append(out, sub[start]) | 
|  | } else { | 
|  | // Construct factored form: prefix(suffix1|suffix2|...) | 
|  | prefix := first | 
|  | for j := start; j < i; j++ { | 
|  | reuse := j != start // prefix came from sub[start] | 
|  | sub[j] = p.removeLeadingRegexp(sub[j], reuse) | 
|  | p.checkLimits(sub[j]) | 
|  | } | 
|  | suffix := p.collapse(sub[start:i], OpAlternate) // recurse | 
|  |  | 
|  | re := p.newRegexp(OpConcat) | 
|  | re.Sub = append(re.Sub[:0], prefix, suffix) | 
|  | out = append(out, re) | 
|  | } | 
|  |  | 
|  | // Prepare for next iteration. | 
|  | start = i | 
|  | first = ifirst | 
|  | } | 
|  | sub = out | 
|  |  | 
|  | // Round 3: Collapse runs of single literals into character classes. | 
|  | start = 0 | 
|  | out = sub[:0] | 
|  | for i := 0; i <= len(sub); i++ { | 
|  | // Invariant: the Regexps that were in sub[0:start] have been | 
|  | // used or marked for reuse, and the slice space has been reused | 
|  | // for out (len(out) <= start). | 
|  | // | 
|  | // Invariant: sub[start:i] consists of regexps that are either | 
|  | // literal runes or character classes. | 
|  | if i < len(sub) && isCharClass(sub[i]) { | 
|  | continue | 
|  | } | 
|  |  | 
|  | // sub[i] is not a char or char class; | 
|  | // emit char class for sub[start:i]... | 
|  | if i == start { | 
|  | // Nothing to do - run of length 0. | 
|  | } else if i == start+1 { | 
|  | out = append(out, sub[start]) | 
|  | } else { | 
|  | // Make new char class. | 
|  | // Start with most complex regexp in sub[start]. | 
|  | max := start | 
|  | for j := start + 1; j < i; j++ { | 
|  | if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { | 
|  | max = j | 
|  | } | 
|  | } | 
|  | sub[start], sub[max] = sub[max], sub[start] | 
|  |  | 
|  | for j := start + 1; j < i; j++ { | 
|  | mergeCharClass(sub[start], sub[j]) | 
|  | p.reuse(sub[j]) | 
|  | } | 
|  | cleanAlt(sub[start]) | 
|  | out = append(out, sub[start]) | 
|  | } | 
|  |  | 
|  | // ... and then emit sub[i]. | 
|  | if i < len(sub) { | 
|  | out = append(out, sub[i]) | 
|  | } | 
|  | start = i + 1 | 
|  | } | 
|  | sub = out | 
|  |  | 
|  | // Round 4: Collapse runs of empty matches into a single empty match. | 
|  | start = 0 | 
|  | out = sub[:0] | 
|  | for i := range sub { | 
|  | if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { | 
|  | continue | 
|  | } | 
|  | out = append(out, sub[i]) | 
|  | } | 
|  | sub = out | 
|  |  | 
|  | return sub | 
|  | } | 
|  |  | 
|  | // leadingString returns the leading literal string that re begins with. | 
|  | // The string refers to storage in re or its children. | 
|  | func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { | 
|  | if re.Op == OpConcat && len(re.Sub) > 0 { | 
|  | re = re.Sub[0] | 
|  | } | 
|  | if re.Op != OpLiteral { | 
|  | return nil, 0 | 
|  | } | 
|  | return re.Rune, re.Flags & FoldCase | 
|  | } | 
|  |  | 
|  | // removeLeadingString removes the first n leading runes | 
|  | // from the beginning of re. It returns the replacement for re. | 
|  | func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { | 
|  | if re.Op == OpConcat && len(re.Sub) > 0 { | 
|  | // Removing a leading string in a concatenation | 
|  | // might simplify the concatenation. | 
|  | sub := re.Sub[0] | 
|  | sub = p.removeLeadingString(sub, n) | 
|  | re.Sub[0] = sub | 
|  | if sub.Op == OpEmptyMatch { | 
|  | p.reuse(sub) | 
|  | switch len(re.Sub) { | 
|  | case 0, 1: | 
|  | // Impossible but handle. | 
|  | re.Op = OpEmptyMatch | 
|  | re.Sub = nil | 
|  | case 2: | 
|  | old := re | 
|  | re = re.Sub[1] | 
|  | p.reuse(old) | 
|  | default: | 
|  | copy(re.Sub, re.Sub[1:]) | 
|  | re.Sub = re.Sub[:len(re.Sub)-1] | 
|  | } | 
|  | } | 
|  | return re | 
|  | } | 
|  |  | 
|  | if re.Op == OpLiteral { | 
|  | re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] | 
|  | if len(re.Rune) == 0 { | 
|  | re.Op = OpEmptyMatch | 
|  | } | 
|  | } | 
|  | return re | 
|  | } | 
|  |  | 
|  | // leadingRegexp returns the leading regexp that re begins with. | 
|  | // The regexp refers to storage in re or its children. | 
|  | func (p *parser) leadingRegexp(re *Regexp) *Regexp { | 
|  | if re.Op == OpEmptyMatch { | 
|  | return nil | 
|  | } | 
|  | if re.Op == OpConcat && len(re.Sub) > 0 { | 
|  | sub := re.Sub[0] | 
|  | if sub.Op == OpEmptyMatch { | 
|  | return nil | 
|  | } | 
|  | return sub | 
|  | } | 
|  | return re | 
|  | } | 
|  |  | 
|  | // removeLeadingRegexp removes the leading regexp in re. | 
|  | // It returns the replacement for re. | 
|  | // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. | 
|  | func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { | 
|  | if re.Op == OpConcat && len(re.Sub) > 0 { | 
|  | if reuse { | 
|  | p.reuse(re.Sub[0]) | 
|  | } | 
|  | re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] | 
|  | switch len(re.Sub) { | 
|  | case 0: | 
|  | re.Op = OpEmptyMatch | 
|  | re.Sub = nil | 
|  | case 1: | 
|  | old := re | 
|  | re = re.Sub[0] | 
|  | p.reuse(old) | 
|  | } | 
|  | return re | 
|  | } | 
|  | if reuse { | 
|  | p.reuse(re) | 
|  | } | 
|  | return p.newRegexp(OpEmptyMatch) | 
|  | } | 
|  |  | 
|  | func literalRegexp(s string, flags Flags) *Regexp { | 
|  | re := &Regexp{Op: OpLiteral} | 
|  | re.Flags = flags | 
|  | re.Rune = re.Rune0[:0] // use local storage for small strings | 
|  | for _, c := range s { | 
|  | if len(re.Rune) >= cap(re.Rune) { | 
|  | // string is too long to fit in Rune0.  let Go handle it | 
|  | re.Rune = []rune(s) | 
|  | break | 
|  | } | 
|  | re.Rune = append(re.Rune, c) | 
|  | } | 
|  | return re | 
|  | } | 
|  |  | 
|  | // Parsing. | 
|  |  | 
|  | // Parse parses a regular expression string s, controlled by the specified | 
|  | // Flags, and returns a regular expression parse tree. The syntax is | 
|  | // described in the top-level comment. | 
|  | func Parse(s string, flags Flags) (*Regexp, error) { | 
|  | return parse(s, flags) | 
|  | } | 
|  |  | 
|  | func parse(s string, flags Flags) (_ *Regexp, err error) { | 
|  | defer func() { | 
|  | switch r := recover(); r { | 
|  | default: | 
|  | panic(r) | 
|  | case nil: | 
|  | // ok | 
|  | case ErrLarge: // too big | 
|  | err = &Error{Code: ErrLarge, Expr: s} | 
|  | case ErrNestingDepth: | 
|  | err = &Error{Code: ErrNestingDepth, Expr: s} | 
|  | } | 
|  | }() | 
|  |  | 
|  | if flags&Literal != 0 { | 
|  | // Trivial parser for literal string. | 
|  | if err := checkUTF8(s); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | return literalRegexp(s, flags), nil | 
|  | } | 
|  |  | 
|  | // Otherwise, must do real work. | 
|  | var ( | 
|  | p          parser | 
|  | c          rune | 
|  | op         Op | 
|  | lastRepeat string | 
|  | ) | 
|  | p.flags = flags | 
|  | p.wholeRegexp = s | 
|  | t := s | 
|  | for t != "" { | 
|  | repeat := "" | 
|  | BigSwitch: | 
|  | switch t[0] { | 
|  | default: | 
|  | if c, t, err = nextRune(t); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | p.literal(c) | 
|  |  | 
|  | case '(': | 
|  | if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { | 
|  | // Flag changes and non-capturing groups. | 
|  | if t, err = p.parsePerlFlags(t); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | break | 
|  | } | 
|  | p.numCap++ | 
|  | p.op(opLeftParen).Cap = p.numCap | 
|  | t = t[1:] | 
|  | case '|': | 
|  | if err = p.parseVerticalBar(); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | t = t[1:] | 
|  | case ')': | 
|  | if err = p.parseRightParen(); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | t = t[1:] | 
|  | case '^': | 
|  | if p.flags&OneLine != 0 { | 
|  | p.op(OpBeginText) | 
|  | } else { | 
|  | p.op(OpBeginLine) | 
|  | } | 
|  | t = t[1:] | 
|  | case '$': | 
|  | if p.flags&OneLine != 0 { | 
|  | p.op(OpEndText).Flags |= WasDollar | 
|  | } else { | 
|  | p.op(OpEndLine) | 
|  | } | 
|  | t = t[1:] | 
|  | case '.': | 
|  | if p.flags&DotNL != 0 { | 
|  | p.op(OpAnyChar) | 
|  | } else { | 
|  | p.op(OpAnyCharNotNL) | 
|  | } | 
|  | t = t[1:] | 
|  | case '[': | 
|  | if t, err = p.parseClass(t); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | case '*', '+', '?': | 
|  | before := t | 
|  | switch t[0] { | 
|  | case '*': | 
|  | op = OpStar | 
|  | case '+': | 
|  | op = OpPlus | 
|  | case '?': | 
|  | op = OpQuest | 
|  | } | 
|  | after := t[1:] | 
|  | if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | repeat = before | 
|  | t = after | 
|  | case '{': | 
|  | op = OpRepeat | 
|  | before := t | 
|  | min, max, after, ok := p.parseRepeat(t) | 
|  | if !ok { | 
|  | // If the repeat cannot be parsed, { is a literal. | 
|  | p.literal('{') | 
|  | t = t[1:] | 
|  | break | 
|  | } | 
|  | if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { | 
|  | // Numbers were too big, or max is present and min > max. | 
|  | return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} | 
|  | } | 
|  | if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | repeat = before | 
|  | t = after | 
|  | case '\\': | 
|  | if p.flags&PerlX != 0 && len(t) >= 2 { | 
|  | switch t[1] { | 
|  | case 'A': | 
|  | p.op(OpBeginText) | 
|  | t = t[2:] | 
|  | break BigSwitch | 
|  | case 'b': | 
|  | p.op(OpWordBoundary) | 
|  | t = t[2:] | 
|  | break BigSwitch | 
|  | case 'B': | 
|  | p.op(OpNoWordBoundary) | 
|  | t = t[2:] | 
|  | break BigSwitch | 
|  | case 'C': | 
|  | // any byte; not supported | 
|  | return nil, &Error{ErrInvalidEscape, t[:2]} | 
|  | case 'Q': | 
|  | // \Q ... \E: the ... is always literals | 
|  | var lit string | 
|  | lit, t, _ = strings.Cut(t[2:], `\E`) | 
|  | for lit != "" { | 
|  | c, rest, err := nextRune(lit) | 
|  | if err != nil { | 
|  | return nil, err | 
|  | } | 
|  | p.literal(c) | 
|  | lit = rest | 
|  | } | 
|  | break BigSwitch | 
|  | case 'z': | 
|  | p.op(OpEndText) | 
|  | t = t[2:] | 
|  | break BigSwitch | 
|  | } | 
|  | } | 
|  |  | 
|  | re := p.newRegexp(OpCharClass) | 
|  | re.Flags = p.flags | 
|  |  | 
|  | // Look for Unicode character group like \p{Han} | 
|  | if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { | 
|  | r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) | 
|  | if err != nil { | 
|  | return nil, err | 
|  | } | 
|  | if r != nil { | 
|  | re.Rune = r | 
|  | t = rest | 
|  | p.push(re) | 
|  | break BigSwitch | 
|  | } | 
|  | } | 
|  |  | 
|  | // Perl character class escape. | 
|  | if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { | 
|  | re.Rune = r | 
|  | t = rest | 
|  | p.push(re) | 
|  | break BigSwitch | 
|  | } | 
|  | p.reuse(re) | 
|  |  | 
|  | // Ordinary single-character escape. | 
|  | if c, t, err = p.parseEscape(t); err != nil { | 
|  | return nil, err | 
|  | } | 
|  | p.literal(c) | 
|  | } | 
|  | lastRepeat = repeat | 
|  | } | 
|  |  | 
|  | p.concat() | 
|  | if p.swapVerticalBar() { | 
|  | // pop vertical bar | 
|  | p.stack = p.stack[:len(p.stack)-1] | 
|  | } | 
|  | p.alternate() | 
|  |  | 
|  | n := len(p.stack) | 
|  | if n != 1 { | 
|  | return nil, &Error{ErrMissingParen, s} | 
|  | } | 
|  | return p.stack[0], nil | 
|  | } | 
|  |  | 
|  | // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. | 
|  | // If s is not of that form, it returns ok == false. | 
|  | // If s has the right form but the values are too big, it returns min == -1, ok == true. | 
|  | func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { | 
|  | if s == "" || s[0] != '{' { | 
|  | return | 
|  | } | 
|  | s = s[1:] | 
|  | var ok1 bool | 
|  | if min, s, ok1 = p.parseInt(s); !ok1 { | 
|  | return | 
|  | } | 
|  | if s == "" { | 
|  | return | 
|  | } | 
|  | if s[0] != ',' { | 
|  | max = min | 
|  | } else { | 
|  | s = s[1:] | 
|  | if s == "" { | 
|  | return | 
|  | } | 
|  | if s[0] == '}' { | 
|  | max = -1 | 
|  | } else if max, s, ok1 = p.parseInt(s); !ok1 { | 
|  | return | 
|  | } else if max < 0 { | 
|  | // parseInt found too big a number | 
|  | min = -1 | 
|  | } | 
|  | } | 
|  | if s == "" || s[0] != '}' { | 
|  | return | 
|  | } | 
|  | rest = s[1:] | 
|  | ok = true | 
|  | return | 
|  | } | 
|  |  | 
|  | // parsePerlFlags parses a Perl flag setting or non-capturing group or both, | 
|  | // like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state. | 
|  | // The caller must have ensured that s begins with "(?". | 
|  | func (p *parser) parsePerlFlags(s string) (rest string, err error) { | 
|  | t := s | 
|  |  | 
|  | // Check for named captures, first introduced in Python's regexp library. | 
|  | // As usual, there are three slightly different syntaxes: | 
|  | // | 
|  | //   (?P<name>expr)   the original, introduced by Python | 
|  | //   (?<name>expr)    the .NET alteration, adopted by Perl 5.10 | 
|  | //   (?'name'expr)    another .NET alteration, adopted by Perl 5.10 | 
|  | // | 
|  | // Perl 5.10 gave in and implemented the Python version too, | 
|  | // but they claim that the last two are the preferred forms. | 
|  | // PCRE and languages based on it (specifically, PHP and Ruby) | 
|  | // support all three as well. EcmaScript 4 uses only the Python form. | 
|  | // | 
|  | // In both the open source world (via Code Search) and the | 
|  | // Google source tree, (?P<expr>name) and (?<expr>name) are the | 
|  | // dominant forms of named captures and both are supported. | 
|  | startsWithP := len(t) > 4 && t[2] == 'P' && t[3] == '<' | 
|  | startsWithName := len(t) > 3 && t[2] == '<' | 
|  |  | 
|  | if startsWithP || startsWithName { | 
|  | // position of expr start | 
|  | exprStartPos := 4 | 
|  | if startsWithName { | 
|  | exprStartPos = 3 | 
|  | } | 
|  |  | 
|  | // Pull out name. | 
|  | end := strings.IndexRune(t, '>') | 
|  | if end < 0 { | 
|  | if err = checkUTF8(t); err != nil { | 
|  | return "", err | 
|  | } | 
|  | return "", &Error{ErrInvalidNamedCapture, s} | 
|  | } | 
|  |  | 
|  | capture := t[:end+1]        // "(?P<name>" or "(?<name>" | 
|  | name := t[exprStartPos:end] // "name" | 
|  | if err = checkUTF8(name); err != nil { | 
|  | return "", err | 
|  | } | 
|  | if !isValidCaptureName(name) { | 
|  | return "", &Error{ErrInvalidNamedCapture, capture} | 
|  | } | 
|  |  | 
|  | // Like ordinary capture, but named. | 
|  | p.numCap++ | 
|  | re := p.op(opLeftParen) | 
|  | re.Cap = p.numCap | 
|  | re.Name = name | 
|  | return t[end+1:], nil | 
|  | } | 
|  |  | 
|  | // Non-capturing group. Might also twiddle Perl flags. | 
|  | var c rune | 
|  | t = t[2:] // skip (? | 
|  | flags := p.flags | 
|  | sign := +1 | 
|  | sawFlag := false | 
|  | Loop: | 
|  | for t != "" { | 
|  | if c, t, err = nextRune(t); err != nil { | 
|  | return "", err | 
|  | } | 
|  | switch c { | 
|  | default: | 
|  | break Loop | 
|  |  | 
|  | // Flags. | 
|  | case 'i': | 
|  | flags |= FoldCase | 
|  | sawFlag = true | 
|  | case 'm': | 
|  | flags &^= OneLine | 
|  | sawFlag = true | 
|  | case 's': | 
|  | flags |= DotNL | 
|  | sawFlag = true | 
|  | case 'U': | 
|  | flags |= NonGreedy | 
|  | sawFlag = true | 
|  |  | 
|  | // Switch to negation. | 
|  | case '-': | 
|  | if sign < 0 { | 
|  | break Loop | 
|  | } | 
|  | sign = -1 | 
|  | // Invert flags so that | above turn into &^ and vice versa. | 
|  | // We'll invert flags again before using it below. | 
|  | flags = ^flags | 
|  | sawFlag = false | 
|  |  | 
|  | // End of flags, starting group or not. | 
|  | case ':', ')': | 
|  | if sign < 0 { | 
|  | if !sawFlag { | 
|  | break Loop | 
|  | } | 
|  | flags = ^flags | 
|  | } | 
|  | if c == ':' { | 
|  | // Open new group | 
|  | p.op(opLeftParen) | 
|  | } | 
|  | p.flags = flags | 
|  | return t, nil | 
|  | } | 
|  | } | 
|  |  | 
|  | return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} | 
|  | } | 
|  |  | 
|  | // isValidCaptureName reports whether name | 
|  | // is a valid capture name: [A-Za-z0-9_]+. | 
|  | // PCRE limits names to 32 bytes. | 
|  | // Python rejects names starting with digits. | 
|  | // We don't enforce either of those. | 
|  | func isValidCaptureName(name string) bool { | 
|  | if name == "" { | 
|  | return false | 
|  | } | 
|  | for _, c := range name { | 
|  | if c != '_' && !isalnum(c) { | 
|  | return false | 
|  | } | 
|  | } | 
|  | return true | 
|  | } | 
|  |  | 
|  | // parseInt parses a decimal integer. | 
|  | func (p *parser) parseInt(s string) (n int, rest string, ok bool) { | 
|  | if s == "" || s[0] < '0' || '9' < s[0] { | 
|  | return | 
|  | } | 
|  | // Disallow leading zeros. | 
|  | if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { | 
|  | return | 
|  | } | 
|  | t := s | 
|  | for s != "" && '0' <= s[0] && s[0] <= '9' { | 
|  | s = s[1:] | 
|  | } | 
|  | rest = s | 
|  | ok = true | 
|  | // Have digits, compute value. | 
|  | t = t[:len(t)-len(s)] | 
|  | for i := 0; i < len(t); i++ { | 
|  | // Avoid overflow. | 
|  | if n >= 1e8 { | 
|  | n = -1 | 
|  | break | 
|  | } | 
|  | n = n*10 + int(t[i]) - '0' | 
|  | } | 
|  | return | 
|  | } | 
|  |  | 
|  | // can this be represented as a character class? | 
|  | // single-rune literal string, char class, ., and .|\n. | 
|  | func isCharClass(re *Regexp) bool { | 
|  | return re.Op == OpLiteral && len(re.Rune) == 1 || | 
|  | re.Op == OpCharClass || | 
|  | re.Op == OpAnyCharNotNL || | 
|  | re.Op == OpAnyChar | 
|  | } | 
|  |  | 
|  | // does re match r? | 
|  | func matchRune(re *Regexp, r rune) bool { | 
|  | switch re.Op { | 
|  | case OpLiteral: | 
|  | return len(re.Rune) == 1 && re.Rune[0] == r | 
|  | case OpCharClass: | 
|  | for i := 0; i < len(re.Rune); i += 2 { | 
|  | if re.Rune[i] <= r && r <= re.Rune[i+1] { | 
|  | return true | 
|  | } | 
|  | } | 
|  | return false | 
|  | case OpAnyCharNotNL: | 
|  | return r != '\n' | 
|  | case OpAnyChar: | 
|  | return true | 
|  | } | 
|  | return false | 
|  | } | 
|  |  | 
|  | // parseVerticalBar handles a | in the input. | 
|  | func (p *parser) parseVerticalBar() error { | 
|  | p.concat() | 
|  |  | 
|  | // The concatenation we just parsed is on top of the stack. | 
|  | // If it sits above an opVerticalBar, swap it below | 
|  | // (things below an opVerticalBar become an alternation). | 
|  | // Otherwise, push a new vertical bar. | 
|  | if !p.swapVerticalBar() { | 
|  | p.op(opVerticalBar) | 
|  | } | 
|  |  | 
|  | return nil | 
|  | } | 
|  |  | 
|  | // mergeCharClass makes dst = dst|src. | 
|  | // The caller must ensure that dst.Op >= src.Op, | 
|  | // to reduce the amount of copying. | 
|  | func mergeCharClass(dst, src *Regexp) { | 
|  | switch dst.Op { | 
|  | case OpAnyChar: | 
|  | // src doesn't add anything. | 
|  | case OpAnyCharNotNL: | 
|  | // src might add \n | 
|  | if matchRune(src, '\n') { | 
|  | dst.Op = OpAnyChar | 
|  | } | 
|  | case OpCharClass: | 
|  | // src is simpler, so either literal or char class | 
|  | if src.Op == OpLiteral { | 
|  | dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) | 
|  | } else { | 
|  | dst.Rune = appendClass(dst.Rune, src.Rune) | 
|  | } | 
|  | case OpLiteral: | 
|  | // both literal | 
|  | if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { | 
|  | break | 
|  | } | 
|  | dst.Op = OpCharClass | 
|  | dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) | 
|  | dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) | 
|  | } | 
|  | } | 
|  |  | 
|  | // If the top of the stack is an element followed by an opVerticalBar | 
|  | // swapVerticalBar swaps the two and returns true. | 
|  | // Otherwise it returns false. | 
|  | func (p *parser) swapVerticalBar() bool { | 
|  | // If above and below vertical bar are literal or char class, | 
|  | // can merge into a single char class. | 
|  | n := len(p.stack) | 
|  | if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { | 
|  | re1 := p.stack[n-1] | 
|  | re3 := p.stack[n-3] | 
|  | // Make re3 the more complex of the two. | 
|  | if re1.Op > re3.Op { | 
|  | re1, re3 = re3, re1 | 
|  | p.stack[n-3] = re3 | 
|  | } | 
|  | mergeCharClass(re3, re1) | 
|  | p.reuse(re1) | 
|  | p.stack = p.stack[:n-1] | 
|  | return true | 
|  | } | 
|  |  | 
|  | if n >= 2 { | 
|  | re1 := p.stack[n-1] | 
|  | re2 := p.stack[n-2] | 
|  | if re2.Op == opVerticalBar { | 
|  | if n >= 3 { | 
|  | // Now out of reach. | 
|  | // Clean opportunistically. | 
|  | cleanAlt(p.stack[n-3]) | 
|  | } | 
|  | p.stack[n-2] = re1 | 
|  | p.stack[n-1] = re2 | 
|  | return true | 
|  | } | 
|  | } | 
|  | return false | 
|  | } | 
|  |  | 
|  | // parseRightParen handles a ) in the input. | 
|  | func (p *parser) parseRightParen() error { | 
|  | p.concat() | 
|  | if p.swapVerticalBar() { | 
|  | // pop vertical bar | 
|  | p.stack = p.stack[:len(p.stack)-1] | 
|  | } | 
|  | p.alternate() | 
|  |  | 
|  | n := len(p.stack) | 
|  | if n < 2 { | 
|  | return &Error{ErrUnexpectedParen, p.wholeRegexp} | 
|  | } | 
|  | re1 := p.stack[n-1] | 
|  | re2 := p.stack[n-2] | 
|  | p.stack = p.stack[:n-2] | 
|  | if re2.Op != opLeftParen { | 
|  | return &Error{ErrUnexpectedParen, p.wholeRegexp} | 
|  | } | 
|  | // Restore flags at time of paren. | 
|  | p.flags = re2.Flags | 
|  | if re2.Cap == 0 { | 
|  | // Just for grouping. | 
|  | p.push(re1) | 
|  | } else { | 
|  | re2.Op = OpCapture | 
|  | re2.Sub = re2.Sub0[:1] | 
|  | re2.Sub[0] = re1 | 
|  | p.push(re2) | 
|  | } | 
|  | return nil | 
|  | } | 
|  |  | 
|  | // parseEscape parses an escape sequence at the beginning of s | 
|  | // and returns the rune. | 
|  | func (p *parser) parseEscape(s string) (r rune, rest string, err error) { | 
|  | t := s[1:] | 
|  | if t == "" { | 
|  | return 0, "", &Error{ErrTrailingBackslash, ""} | 
|  | } | 
|  | c, t, err := nextRune(t) | 
|  | if err != nil { | 
|  | return 0, "", err | 
|  | } | 
|  |  | 
|  | Switch: | 
|  | switch c { | 
|  | default: | 
|  | if c < utf8.RuneSelf && !isalnum(c) { | 
|  | // Escaped non-word characters are always themselves. | 
|  | // PCRE is not quite so rigorous: it accepts things like | 
|  | // \q, but we don't. We once rejected \_, but too many | 
|  | // programs and people insist on using it, so allow \_. | 
|  | return c, t, nil | 
|  | } | 
|  |  | 
|  | // Octal escapes. | 
|  | case '1', '2', '3', '4', '5', '6', '7': | 
|  | // Single non-zero digit is a backreference; not supported | 
|  | if t == "" || t[0] < '0' || t[0] > '7' { | 
|  | break | 
|  | } | 
|  | fallthrough | 
|  | case '0': | 
|  | // Consume up to three octal digits; already have one. | 
|  | r = c - '0' | 
|  | for i := 1; i < 3; i++ { | 
|  | if t == "" || t[0] < '0' || t[0] > '7' { | 
|  | break | 
|  | } | 
|  | r = r*8 + rune(t[0]) - '0' | 
|  | t = t[1:] | 
|  | } | 
|  | return r, t, nil | 
|  |  | 
|  | // Hexadecimal escapes. | 
|  | case 'x': | 
|  | if t == "" { | 
|  | break | 
|  | } | 
|  | if c, t, err = nextRune(t); err != nil { | 
|  | return 0, "", err | 
|  | } | 
|  | if c == '{' { | 
|  | // Any number of digits in braces. | 
|  | // Perl accepts any text at all; it ignores all text | 
|  | // after the first non-hex digit. We require only hex digits, | 
|  | // and at least one. | 
|  | nhex := 0 | 
|  | r = 0 | 
|  | for { | 
|  | if t == "" { | 
|  | break Switch | 
|  | } | 
|  | if c, t, err = nextRune(t); err != nil { | 
|  | return 0, "", err | 
|  | } | 
|  | if c == '}' { | 
|  | break | 
|  | } | 
|  | v := unhex(c) | 
|  | if v < 0 { | 
|  | break Switch | 
|  | } | 
|  | r = r*16 + v | 
|  | if r > unicode.MaxRune { | 
|  | break Switch | 
|  | } | 
|  | nhex++ | 
|  | } | 
|  | if nhex == 0 { | 
|  | break Switch | 
|  | } | 
|  | return r, t, nil | 
|  | } | 
|  |  | 
|  | // Easy case: two hex digits. | 
|  | x := unhex(c) | 
|  | if c, t, err = nextRune(t); err != nil { | 
|  | return 0, "", err | 
|  | } | 
|  | y := unhex(c) | 
|  | if x < 0 || y < 0 { | 
|  | break | 
|  | } | 
|  | return x*16 + y, t, nil | 
|  |  | 
|  | // C escapes. There is no case 'b', to avoid misparsing | 
|  | // the Perl word-boundary \b as the C backspace \b | 
|  | // when in POSIX mode. In Perl, /\b/ means word-boundary | 
|  | // but /[\b]/ means backspace. We don't support that. | 
|  | // If you want a backspace, embed a literal backspace | 
|  | // character or use \x08. | 
|  | case 'a': | 
|  | return '\a', t, err | 
|  | case 'f': | 
|  | return '\f', t, err | 
|  | case 'n': | 
|  | return '\n', t, err | 
|  | case 'r': | 
|  | return '\r', t, err | 
|  | case 't': | 
|  | return '\t', t, err | 
|  | case 'v': | 
|  | return '\v', t, err | 
|  | } | 
|  | return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} | 
|  | } | 
|  |  | 
|  | // parseClassChar parses a character class character at the beginning of s | 
|  | // and returns it. | 
|  | func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { | 
|  | if s == "" { | 
|  | return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} | 
|  | } | 
|  |  | 
|  | // Allow regular escape sequences even though | 
|  | // many need not be escaped in this context. | 
|  | if s[0] == '\\' { | 
|  | return p.parseEscape(s) | 
|  | } | 
|  |  | 
|  | return nextRune(s) | 
|  | } | 
|  |  | 
|  | type charGroup struct { | 
|  | sign  int | 
|  | class []rune | 
|  | } | 
|  |  | 
|  | // parsePerlClassEscape parses a leading Perl character class escape like \d | 
|  | // from the beginning of s. If one is present, it appends the characters to r | 
|  | // and returns the new slice r and the remainder of the string. | 
|  | func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { | 
|  | if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { | 
|  | return | 
|  | } | 
|  | g := perlGroup[s[0:2]] | 
|  | if g.sign == 0 { | 
|  | return | 
|  | } | 
|  | return p.appendGroup(r, g), s[2:] | 
|  | } | 
|  |  | 
|  | // parseNamedClass parses a leading POSIX named character class like [:alnum:] | 
|  | // from the beginning of s. If one is present, it appends the characters to r | 
|  | // and returns the new slice r and the remainder of the string. | 
|  | func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { | 
|  | if len(s) < 2 || s[0] != '[' || s[1] != ':' { | 
|  | return | 
|  | } | 
|  |  | 
|  | i := strings.Index(s[2:], ":]") | 
|  | if i < 0 { | 
|  | return | 
|  | } | 
|  | i += 2 | 
|  | name, s := s[0:i+2], s[i+2:] | 
|  | g := posixGroup[name] | 
|  | if g.sign == 0 { | 
|  | return nil, "", &Error{ErrInvalidCharRange, name} | 
|  | } | 
|  | return p.appendGroup(r, g), s, nil | 
|  | } | 
|  |  | 
|  | func (p *parser) appendGroup(r []rune, g charGroup) []rune { | 
|  | if p.flags&FoldCase == 0 { | 
|  | if g.sign < 0 { | 
|  | r = appendNegatedClass(r, g.class) | 
|  | } else { | 
|  | r = appendClass(r, g.class) | 
|  | } | 
|  | } else { | 
|  | tmp := p.tmpClass[:0] | 
|  | tmp = appendFoldedClass(tmp, g.class) | 
|  | p.tmpClass = tmp | 
|  | tmp = cleanClass(&p.tmpClass) | 
|  | if g.sign < 0 { | 
|  | r = appendNegatedClass(r, tmp) | 
|  | } else { | 
|  | r = appendClass(r, tmp) | 
|  | } | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | var anyTable = &unicode.RangeTable{ | 
|  | R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, | 
|  | R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, | 
|  | } | 
|  |  | 
|  | // unicodeTable returns the unicode.RangeTable identified by name | 
|  | // and the table of additional fold-equivalent code points. | 
|  | func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { | 
|  | // Special case: "Any" means any. | 
|  | if name == "Any" { | 
|  | return anyTable, anyTable | 
|  | } | 
|  | if t := unicode.Categories[name]; t != nil { | 
|  | return t, unicode.FoldCategory[name] | 
|  | } | 
|  | if t := unicode.Scripts[name]; t != nil { | 
|  | return t, unicode.FoldScript[name] | 
|  | } | 
|  | return nil, nil | 
|  | } | 
|  |  | 
|  | // parseUnicodeClass parses a leading Unicode character class like \p{Han} | 
|  | // from the beginning of s. If one is present, it appends the characters to r | 
|  | // and returns the new slice r and the remainder of the string. | 
|  | func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { | 
|  | if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { | 
|  | return | 
|  | } | 
|  |  | 
|  | // Committed to parse or return error. | 
|  | sign := +1 | 
|  | if s[1] == 'P' { | 
|  | sign = -1 | 
|  | } | 
|  | t := s[2:] | 
|  | c, t, err := nextRune(t) | 
|  | if err != nil { | 
|  | return | 
|  | } | 
|  | var seq, name string | 
|  | if c != '{' { | 
|  | // Single-letter name. | 
|  | seq = s[:len(s)-len(t)] | 
|  | name = seq[2:] | 
|  | } else { | 
|  | // Name is in braces. | 
|  | end := strings.IndexRune(s, '}') | 
|  | if end < 0 { | 
|  | if err = checkUTF8(s); err != nil { | 
|  | return | 
|  | } | 
|  | return nil, "", &Error{ErrInvalidCharRange, s} | 
|  | } | 
|  | seq, t = s[:end+1], s[end+1:] | 
|  | name = s[3:end] | 
|  | if err = checkUTF8(name); err != nil { | 
|  | return | 
|  | } | 
|  | } | 
|  |  | 
|  | // Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. | 
|  | if name != "" && name[0] == '^' { | 
|  | sign = -sign | 
|  | name = name[1:] | 
|  | } | 
|  |  | 
|  | tab, fold := unicodeTable(name) | 
|  | if tab == nil { | 
|  | return nil, "", &Error{ErrInvalidCharRange, seq} | 
|  | } | 
|  |  | 
|  | if p.flags&FoldCase == 0 || fold == nil { | 
|  | if sign > 0 { | 
|  | r = appendTable(r, tab) | 
|  | } else { | 
|  | r = appendNegatedTable(r, tab) | 
|  | } | 
|  | } else { | 
|  | // Merge and clean tab and fold in a temporary buffer. | 
|  | // This is necessary for the negative case and just tidy | 
|  | // for the positive case. | 
|  | tmp := p.tmpClass[:0] | 
|  | tmp = appendTable(tmp, tab) | 
|  | tmp = appendTable(tmp, fold) | 
|  | p.tmpClass = tmp | 
|  | tmp = cleanClass(&p.tmpClass) | 
|  | if sign > 0 { | 
|  | r = appendClass(r, tmp) | 
|  | } else { | 
|  | r = appendNegatedClass(r, tmp) | 
|  | } | 
|  | } | 
|  | return r, t, nil | 
|  | } | 
|  |  | 
|  | // parseClass parses a character class at the beginning of s | 
|  | // and pushes it onto the parse stack. | 
|  | func (p *parser) parseClass(s string) (rest string, err error) { | 
|  | t := s[1:] // chop [ | 
|  | re := p.newRegexp(OpCharClass) | 
|  | re.Flags = p.flags | 
|  | re.Rune = re.Rune0[:0] | 
|  |  | 
|  | sign := +1 | 
|  | if t != "" && t[0] == '^' { | 
|  | sign = -1 | 
|  | t = t[1:] | 
|  |  | 
|  | // If character class does not match \n, add it here, | 
|  | // so that negation later will do the right thing. | 
|  | if p.flags&ClassNL == 0 { | 
|  | re.Rune = append(re.Rune, '\n', '\n') | 
|  | } | 
|  | } | 
|  |  | 
|  | class := re.Rune | 
|  | first := true // ] and - are okay as first char in class | 
|  | for t == "" || t[0] != ']' || first { | 
|  | // POSIX: - is only okay unescaped as first or last in class. | 
|  | // Perl: - is okay anywhere. | 
|  | if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { | 
|  | _, size := utf8.DecodeRuneInString(t[1:]) | 
|  | return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} | 
|  | } | 
|  | first = false | 
|  |  | 
|  | // Look for POSIX [:alnum:] etc. | 
|  | if len(t) > 2 && t[0] == '[' && t[1] == ':' { | 
|  | nclass, nt, err := p.parseNamedClass(t, class) | 
|  | if err != nil { | 
|  | return "", err | 
|  | } | 
|  | if nclass != nil { | 
|  | class, t = nclass, nt | 
|  | continue | 
|  | } | 
|  | } | 
|  |  | 
|  | // Look for Unicode character group like \p{Han}. | 
|  | nclass, nt, err := p.parseUnicodeClass(t, class) | 
|  | if err != nil { | 
|  | return "", err | 
|  | } | 
|  | if nclass != nil { | 
|  | class, t = nclass, nt | 
|  | continue | 
|  | } | 
|  |  | 
|  | // Look for Perl character class symbols (extension). | 
|  | if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { | 
|  | class, t = nclass, nt | 
|  | continue | 
|  | } | 
|  |  | 
|  | // Single character or simple range. | 
|  | rng := t | 
|  | var lo, hi rune | 
|  | if lo, t, err = p.parseClassChar(t, s); err != nil { | 
|  | return "", err | 
|  | } | 
|  | hi = lo | 
|  | // [a-] means (a|-) so check for final ]. | 
|  | if len(t) >= 2 && t[0] == '-' && t[1] != ']' { | 
|  | t = t[1:] | 
|  | if hi, t, err = p.parseClassChar(t, s); err != nil { | 
|  | return "", err | 
|  | } | 
|  | if hi < lo { | 
|  | rng = rng[:len(rng)-len(t)] | 
|  | return "", &Error{Code: ErrInvalidCharRange, Expr: rng} | 
|  | } | 
|  | } | 
|  | if p.flags&FoldCase == 0 { | 
|  | class = appendRange(class, lo, hi) | 
|  | } else { | 
|  | class = appendFoldedRange(class, lo, hi) | 
|  | } | 
|  | } | 
|  | t = t[1:] // chop ] | 
|  |  | 
|  | // Use &re.Rune instead of &class to avoid allocation. | 
|  | re.Rune = class | 
|  | class = cleanClass(&re.Rune) | 
|  | if sign < 0 { | 
|  | class = negateClass(class) | 
|  | } | 
|  | re.Rune = class | 
|  | p.push(re) | 
|  | return t, nil | 
|  | } | 
|  |  | 
|  | // cleanClass sorts the ranges (pairs of elements of r), | 
|  | // merges them, and eliminates duplicates. | 
|  | func cleanClass(rp *[]rune) []rune { | 
|  |  | 
|  | // Sort by lo increasing, hi decreasing to break ties. | 
|  | sort.Sort(ranges{rp}) | 
|  |  | 
|  | r := *rp | 
|  | if len(r) < 2 { | 
|  | return r | 
|  | } | 
|  |  | 
|  | // Merge abutting, overlapping. | 
|  | w := 2 // write index | 
|  | for i := 2; i < len(r); i += 2 { | 
|  | lo, hi := r[i], r[i+1] | 
|  | if lo <= r[w-1]+1 { | 
|  | // merge with previous range | 
|  | if hi > r[w-1] { | 
|  | r[w-1] = hi | 
|  | } | 
|  | continue | 
|  | } | 
|  | // new disjoint range | 
|  | r[w] = lo | 
|  | r[w+1] = hi | 
|  | w += 2 | 
|  | } | 
|  |  | 
|  | return r[:w] | 
|  | } | 
|  |  | 
|  | // inCharClass reports whether r is in the class. | 
|  | // It assumes the class has been cleaned by cleanClass. | 
|  | func inCharClass(r rune, class []rune) bool { | 
|  | _, ok := sort.Find(len(class)/2, func(i int) int { | 
|  | lo, hi := class[2*i], class[2*i+1] | 
|  | if r > hi { | 
|  | return +1 | 
|  | } | 
|  | if r < lo { | 
|  | return -1 | 
|  | } | 
|  | return 0 | 
|  | }) | 
|  | return ok | 
|  | } | 
|  |  | 
|  | // appendLiteral returns the result of appending the literal x to the class r. | 
|  | func appendLiteral(r []rune, x rune, flags Flags) []rune { | 
|  | if flags&FoldCase != 0 { | 
|  | return appendFoldedRange(r, x, x) | 
|  | } | 
|  | return appendRange(r, x, x) | 
|  | } | 
|  |  | 
|  | // appendRange returns the result of appending the range lo-hi to the class r. | 
|  | func appendRange(r []rune, lo, hi rune) []rune { | 
|  | // Expand last range or next to last range if it overlaps or abuts. | 
|  | // Checking two ranges helps when appending case-folded | 
|  | // alphabets, so that one range can be expanding A-Z and the | 
|  | // other expanding a-z. | 
|  | n := len(r) | 
|  | for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 | 
|  | if n >= i { | 
|  | rlo, rhi := r[n-i], r[n-i+1] | 
|  | if lo <= rhi+1 && rlo <= hi+1 { | 
|  | if lo < rlo { | 
|  | r[n-i] = lo | 
|  | } | 
|  | if hi > rhi { | 
|  | r[n-i+1] = hi | 
|  | } | 
|  | return r | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | return append(r, lo, hi) | 
|  | } | 
|  |  | 
|  | const ( | 
|  | // minimum and maximum runes involved in folding. | 
|  | // checked during test. | 
|  | minFold = 0x0041 | 
|  | maxFold = 0x1e943 | 
|  | ) | 
|  |  | 
|  | // appendFoldedRange returns the result of appending the range lo-hi | 
|  | // and its case folding-equivalent runes to the class r. | 
|  | func appendFoldedRange(r []rune, lo, hi rune) []rune { | 
|  | // Optimizations. | 
|  | if lo <= minFold && hi >= maxFold { | 
|  | // Range is full: folding can't add more. | 
|  | return appendRange(r, lo, hi) | 
|  | } | 
|  | if hi < minFold || lo > maxFold { | 
|  | // Range is outside folding possibilities. | 
|  | return appendRange(r, lo, hi) | 
|  | } | 
|  | if lo < minFold { | 
|  | // [lo, minFold-1] needs no folding. | 
|  | r = appendRange(r, lo, minFold-1) | 
|  | lo = minFold | 
|  | } | 
|  | if hi > maxFold { | 
|  | // [maxFold+1, hi] needs no folding. | 
|  | r = appendRange(r, maxFold+1, hi) | 
|  | hi = maxFold | 
|  | } | 
|  |  | 
|  | // Brute force. Depend on appendRange to coalesce ranges on the fly. | 
|  | for c := lo; c <= hi; c++ { | 
|  | r = appendRange(r, c, c) | 
|  | f := unicode.SimpleFold(c) | 
|  | for f != c { | 
|  | r = appendRange(r, f, f) | 
|  | f = unicode.SimpleFold(f) | 
|  | } | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // appendClass returns the result of appending the class x to the class r. | 
|  | // It assume x is clean. | 
|  | func appendClass(r []rune, x []rune) []rune { | 
|  | for i := 0; i < len(x); i += 2 { | 
|  | r = appendRange(r, x[i], x[i+1]) | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // appendFoldedClass returns the result of appending the case folding of the class x to the class r. | 
|  | func appendFoldedClass(r []rune, x []rune) []rune { | 
|  | for i := 0; i < len(x); i += 2 { | 
|  | r = appendFoldedRange(r, x[i], x[i+1]) | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // appendNegatedClass returns the result of appending the negation of the class x to the class r. | 
|  | // It assumes x is clean. | 
|  | func appendNegatedClass(r []rune, x []rune) []rune { | 
|  | nextLo := '\u0000' | 
|  | for i := 0; i < len(x); i += 2 { | 
|  | lo, hi := x[i], x[i+1] | 
|  | if nextLo <= lo-1 { | 
|  | r = appendRange(r, nextLo, lo-1) | 
|  | } | 
|  | nextLo = hi + 1 | 
|  | } | 
|  | if nextLo <= unicode.MaxRune { | 
|  | r = appendRange(r, nextLo, unicode.MaxRune) | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // appendTable returns the result of appending x to the class r. | 
|  | func appendTable(r []rune, x *unicode.RangeTable) []rune { | 
|  | for _, xr := range x.R16 { | 
|  | lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) | 
|  | if stride == 1 { | 
|  | r = appendRange(r, lo, hi) | 
|  | continue | 
|  | } | 
|  | for c := lo; c <= hi; c += stride { | 
|  | r = appendRange(r, c, c) | 
|  | } | 
|  | } | 
|  | for _, xr := range x.R32 { | 
|  | lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) | 
|  | if stride == 1 { | 
|  | r = appendRange(r, lo, hi) | 
|  | continue | 
|  | } | 
|  | for c := lo; c <= hi; c += stride { | 
|  | r = appendRange(r, c, c) | 
|  | } | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // appendNegatedTable returns the result of appending the negation of x to the class r. | 
|  | func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { | 
|  | nextLo := '\u0000' // lo end of next class to add | 
|  | for _, xr := range x.R16 { | 
|  | lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) | 
|  | if stride == 1 { | 
|  | if nextLo <= lo-1 { | 
|  | r = appendRange(r, nextLo, lo-1) | 
|  | } | 
|  | nextLo = hi + 1 | 
|  | continue | 
|  | } | 
|  | for c := lo; c <= hi; c += stride { | 
|  | if nextLo <= c-1 { | 
|  | r = appendRange(r, nextLo, c-1) | 
|  | } | 
|  | nextLo = c + 1 | 
|  | } | 
|  | } | 
|  | for _, xr := range x.R32 { | 
|  | lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) | 
|  | if stride == 1 { | 
|  | if nextLo <= lo-1 { | 
|  | r = appendRange(r, nextLo, lo-1) | 
|  | } | 
|  | nextLo = hi + 1 | 
|  | continue | 
|  | } | 
|  | for c := lo; c <= hi; c += stride { | 
|  | if nextLo <= c-1 { | 
|  | r = appendRange(r, nextLo, c-1) | 
|  | } | 
|  | nextLo = c + 1 | 
|  | } | 
|  | } | 
|  | if nextLo <= unicode.MaxRune { | 
|  | r = appendRange(r, nextLo, unicode.MaxRune) | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // negateClass overwrites r and returns r's negation. | 
|  | // It assumes the class r is already clean. | 
|  | func negateClass(r []rune) []rune { | 
|  | nextLo := '\u0000' // lo end of next class to add | 
|  | w := 0             // write index | 
|  | for i := 0; i < len(r); i += 2 { | 
|  | lo, hi := r[i], r[i+1] | 
|  | if nextLo <= lo-1 { | 
|  | r[w] = nextLo | 
|  | r[w+1] = lo - 1 | 
|  | w += 2 | 
|  | } | 
|  | nextLo = hi + 1 | 
|  | } | 
|  | r = r[:w] | 
|  | if nextLo <= unicode.MaxRune { | 
|  | // It's possible for the negation to have one more | 
|  | // range - this one - than the original class, so use append. | 
|  | r = append(r, nextLo, unicode.MaxRune) | 
|  | } | 
|  | return r | 
|  | } | 
|  |  | 
|  | // ranges implements sort.Interface on a []rune. | 
|  | // The choice of receiver type definition is strange | 
|  | // but avoids an allocation since we already have | 
|  | // a *[]rune. | 
|  | type ranges struct { | 
|  | p *[]rune | 
|  | } | 
|  |  | 
|  | func (ra ranges) Less(i, j int) bool { | 
|  | p := *ra.p | 
|  | i *= 2 | 
|  | j *= 2 | 
|  | return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] | 
|  | } | 
|  |  | 
|  | func (ra ranges) Len() int { | 
|  | return len(*ra.p) / 2 | 
|  | } | 
|  |  | 
|  | func (ra ranges) Swap(i, j int) { | 
|  | p := *ra.p | 
|  | i *= 2 | 
|  | j *= 2 | 
|  | p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] | 
|  | } | 
|  |  | 
|  | func checkUTF8(s string) error { | 
|  | for s != "" { | 
|  | rune, size := utf8.DecodeRuneInString(s) | 
|  | if rune == utf8.RuneError && size == 1 { | 
|  | return &Error{Code: ErrInvalidUTF8, Expr: s} | 
|  | } | 
|  | s = s[size:] | 
|  | } | 
|  | return nil | 
|  | } | 
|  |  | 
|  | func nextRune(s string) (c rune, t string, err error) { | 
|  | c, size := utf8.DecodeRuneInString(s) | 
|  | if c == utf8.RuneError && size == 1 { | 
|  | return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} | 
|  | } | 
|  | return c, s[size:], nil | 
|  | } | 
|  |  | 
|  | func isalnum(c rune) bool { | 
|  | return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' | 
|  | } | 
|  |  | 
|  | func unhex(c rune) rune { | 
|  | if '0' <= c && c <= '9' { | 
|  | return c - '0' | 
|  | } | 
|  | if 'a' <= c && c <= 'f' { | 
|  | return c - 'a' + 10 | 
|  | } | 
|  | if 'A' <= c && c <= 'F' { | 
|  | return c - 'A' + 10 | 
|  | } | 
|  | return -1 | 
|  | } |