| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package syntax |
| |
| // Note to implementers: |
| // In this package, re is always a *Regexp and r is always a rune. |
| |
| import ( |
| "strconv" |
| "strings" |
| "unicode" |
| ) |
| |
| // A Regexp is a node in a regular expression syntax tree. |
| type Regexp struct { |
| Op Op // operator |
| Flags Flags |
| Sub []*Regexp // subexpressions, if any |
| Sub0 [1]*Regexp // storage for short Sub |
| Rune []rune // matched runes, for OpLiteral, OpCharClass |
| Rune0 [2]rune // storage for short Rune |
| Min, Max int // min, max for OpRepeat |
| Cap int // capturing index, for OpCapture |
| Name string // capturing name, for OpCapture |
| } |
| |
| //go:generate stringer -type Op -trimprefix Op |
| |
| // An Op is a single regular expression operator. |
| type Op uint8 |
| |
| // Operators are listed in precedence order, tightest binding to weakest. |
| // Character class operators are listed simplest to most complex |
| // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). |
| |
| const ( |
| OpNoMatch Op = 1 + iota // matches no strings |
| OpEmptyMatch // matches empty string |
| OpLiteral // matches Runes sequence |
| OpCharClass // matches Runes interpreted as range pair list |
| OpAnyCharNotNL // matches any character except newline |
| OpAnyChar // matches any character |
| OpBeginLine // matches empty string at beginning of line |
| OpEndLine // matches empty string at end of line |
| OpBeginText // matches empty string at beginning of text |
| OpEndText // matches empty string at end of text |
| OpWordBoundary // matches word boundary `\b` |
| OpNoWordBoundary // matches word non-boundary `\B` |
| OpCapture // capturing subexpression with index Cap, optional name Name |
| OpStar // matches Sub[0] zero or more times |
| OpPlus // matches Sub[0] one or more times |
| OpQuest // matches Sub[0] zero or one times |
| OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) |
| OpConcat // matches concatenation of Subs |
| OpAlternate // matches alternation of Subs |
| ) |
| |
| const opPseudo Op = 128 // where pseudo-ops start |
| |
| // Equal reports whether x and y have identical structure. |
| func (x *Regexp) Equal(y *Regexp) bool { |
| if x == nil || y == nil { |
| return x == y |
| } |
| if x.Op != y.Op { |
| return false |
| } |
| switch x.Op { |
| case OpEndText: |
| // The parse flags remember whether this is \z or \Z. |
| if x.Flags&WasDollar != y.Flags&WasDollar { |
| return false |
| } |
| |
| case OpLiteral, OpCharClass: |
| if len(x.Rune) != len(y.Rune) { |
| return false |
| } |
| for i, r := range x.Rune { |
| if r != y.Rune[i] { |
| return false |
| } |
| } |
| |
| case OpAlternate, OpConcat: |
| if len(x.Sub) != len(y.Sub) { |
| return false |
| } |
| for i, sub := range x.Sub { |
| if !sub.Equal(y.Sub[i]) { |
| return false |
| } |
| } |
| |
| case OpStar, OpPlus, OpQuest: |
| if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| |
| case OpRepeat: |
| if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| |
| case OpCapture: |
| if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // writeRegexp writes the Perl syntax for the regular expression re to b. |
| func writeRegexp(b *strings.Builder, re *Regexp) { |
| switch re.Op { |
| default: |
| b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") |
| case OpNoMatch: |
| b.WriteString(`[^\x00-\x{10FFFF}]`) |
| case OpEmptyMatch: |
| b.WriteString(`(?:)`) |
| case OpLiteral: |
| if re.Flags&FoldCase != 0 { |
| b.WriteString(`(?i:`) |
| } |
| for _, r := range re.Rune { |
| escape(b, r, false) |
| } |
| if re.Flags&FoldCase != 0 { |
| b.WriteString(`)`) |
| } |
| case OpCharClass: |
| if len(re.Rune)%2 != 0 { |
| b.WriteString(`[invalid char class]`) |
| break |
| } |
| b.WriteRune('[') |
| if len(re.Rune) == 0 { |
| b.WriteString(`^\x00-\x{10FFFF}`) |
| } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { |
| // Contains 0 and MaxRune. Probably a negated class. |
| // Print the gaps. |
| b.WriteRune('^') |
| for i := 1; i < len(re.Rune)-1; i += 2 { |
| lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 |
| escape(b, lo, lo == '-') |
| if lo != hi { |
| b.WriteRune('-') |
| escape(b, hi, hi == '-') |
| } |
| } |
| } else { |
| for i := 0; i < len(re.Rune); i += 2 { |
| lo, hi := re.Rune[i], re.Rune[i+1] |
| escape(b, lo, lo == '-') |
| if lo != hi { |
| b.WriteRune('-') |
| escape(b, hi, hi == '-') |
| } |
| } |
| } |
| b.WriteRune(']') |
| case OpAnyCharNotNL: |
| b.WriteString(`(?-s:.)`) |
| case OpAnyChar: |
| b.WriteString(`(?s:.)`) |
| case OpBeginLine: |
| b.WriteString(`(?m:^)`) |
| case OpEndLine: |
| b.WriteString(`(?m:$)`) |
| case OpBeginText: |
| b.WriteString(`\A`) |
| case OpEndText: |
| if re.Flags&WasDollar != 0 { |
| b.WriteString(`(?-m:$)`) |
| } else { |
| b.WriteString(`\z`) |
| } |
| case OpWordBoundary: |
| b.WriteString(`\b`) |
| case OpNoWordBoundary: |
| b.WriteString(`\B`) |
| case OpCapture: |
| if re.Name != "" { |
| b.WriteString(`(?P<`) |
| b.WriteString(re.Name) |
| b.WriteRune('>') |
| } else { |
| b.WriteRune('(') |
| } |
| if re.Sub[0].Op != OpEmptyMatch { |
| writeRegexp(b, re.Sub[0]) |
| } |
| b.WriteRune(')') |
| case OpStar, OpPlus, OpQuest, OpRepeat: |
| if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { |
| b.WriteString(`(?:`) |
| writeRegexp(b, sub) |
| b.WriteString(`)`) |
| } else { |
| writeRegexp(b, sub) |
| } |
| switch re.Op { |
| case OpStar: |
| b.WriteRune('*') |
| case OpPlus: |
| b.WriteRune('+') |
| case OpQuest: |
| b.WriteRune('?') |
| case OpRepeat: |
| b.WriteRune('{') |
| b.WriteString(strconv.Itoa(re.Min)) |
| if re.Max != re.Min { |
| b.WriteRune(',') |
| if re.Max >= 0 { |
| b.WriteString(strconv.Itoa(re.Max)) |
| } |
| } |
| b.WriteRune('}') |
| } |
| if re.Flags&NonGreedy != 0 { |
| b.WriteRune('?') |
| } |
| case OpConcat: |
| for _, sub := range re.Sub { |
| if sub.Op == OpAlternate { |
| b.WriteString(`(?:`) |
| writeRegexp(b, sub) |
| b.WriteString(`)`) |
| } else { |
| writeRegexp(b, sub) |
| } |
| } |
| case OpAlternate: |
| for i, sub := range re.Sub { |
| if i > 0 { |
| b.WriteRune('|') |
| } |
| writeRegexp(b, sub) |
| } |
| } |
| } |
| |
| func (re *Regexp) String() string { |
| var b strings.Builder |
| writeRegexp(&b, re) |
| return b.String() |
| } |
| |
| const meta = `\.+*?()|[]{}^$` |
| |
| func escape(b *strings.Builder, r rune, force bool) { |
| if unicode.IsPrint(r) { |
| if strings.ContainsRune(meta, r) || force { |
| b.WriteRune('\\') |
| } |
| b.WriteRune(r) |
| return |
| } |
| |
| switch r { |
| case '\a': |
| b.WriteString(`\a`) |
| case '\f': |
| b.WriteString(`\f`) |
| case '\n': |
| b.WriteString(`\n`) |
| case '\r': |
| b.WriteString(`\r`) |
| case '\t': |
| b.WriteString(`\t`) |
| case '\v': |
| b.WriteString(`\v`) |
| default: |
| if r < 0x100 { |
| b.WriteString(`\x`) |
| s := strconv.FormatInt(int64(r), 16) |
| if len(s) == 1 { |
| b.WriteRune('0') |
| } |
| b.WriteString(s) |
| break |
| } |
| b.WriteString(`\x{`) |
| b.WriteString(strconv.FormatInt(int64(r), 16)) |
| b.WriteString(`}`) |
| } |
| } |
| |
| // MaxCap walks the regexp to find the maximum capture index. |
| func (re *Regexp) MaxCap() int { |
| m := 0 |
| if re.Op == OpCapture { |
| m = re.Cap |
| } |
| for _, sub := range re.Sub { |
| if n := sub.MaxCap(); m < n { |
| m = n |
| } |
| } |
| return m |
| } |
| |
| // CapNames walks the regexp to find the names of capturing groups. |
| func (re *Regexp) CapNames() []string { |
| names := make([]string, re.MaxCap()+1) |
| re.capNames(names) |
| return names |
| } |
| |
| func (re *Regexp) capNames(names []string) { |
| if re.Op == OpCapture { |
| names[re.Cap] = re.Name |
| } |
| for _, sub := range re.Sub { |
| sub.capNames(names) |
| } |
| } |