Brad Fitzpatrick | 5194744 | 2016-03-01 22:57:46 +0000 | [diff] [blame] | 1 | // Copyright 2011 The Go Authors. All rights reserved. |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package syntax |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "fmt" |
| 10 | "testing" |
| 11 | "unicode" |
| 12 | ) |
| 13 | |
// parseTest pairs a regular expression with the dump string that its
// parsed syntax tree is compared against.
type parseTest struct {
	Regexp, Dump string
}
| 18 | |
// parseTests lists regular expressions together with the expected dump of
// their parsed syntax tree. TestParseSimple and TestToStringEquivalentParse
// parse every entry with testFlags. An entry whose Dump is empty is only
// required to parse without error; its tree is not compared.
var parseTests = []parseTest{
	// Base cases
	{`a`, `lit{a}`},
	{`a.`, `cat{lit{a}dot{}}`},
	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
	{`ab`, `str{ab}`},
	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
	{`abc`, `str{abc}`},
	{`a|^`, `alt{lit{a}bol{}}`},
	{`a|b`, `cc{0x61-0x62}`},
	{`(a)`, `cap{lit{a}}`},
	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
	{`a*`, `star{lit{a}}`},
	{`a+`, `plus{lit{a}}`},
	{`a?`, `que{lit{a}}`},
	{`a{2}`, `rep{2,2 lit{a}}`},
	{`a{2,3}`, `rep{2,3 lit{a}}`},
	{`a{2,}`, `rep{2,-1 lit{a}}`},
	{`a*?`, `nstar{lit{a}}`},
	{`a+?`, `nplus{lit{a}}`},
	{`a??`, `nque{lit{a}}`},
	{`a{2}?`, `nrep{2,2 lit{a}}`},
	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
	// Malformed { } are treated as literals.
	{`x{1001`, `str{x{1001}`},
	{`x{9876543210`, `str{x{9876543210}`},
	{`x{9876543210,`, `str{x{9876543210,}`},
	{`x{2,1`, `str{x{2,1}`},
	{`x{1,9876543210`, `str{x{1,9876543210}`},
	{``, `emp{}`},
	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
	{`|x|`, `alt{emp{}lit{x}emp{}}`},
	{`.`, `dot{}`},
	{`^`, `bol{}`},
	{`$`, `eol{}`},
	{`\|`, `lit{|}`},
	{`\(`, `lit{(}`},
	{`\)`, `lit{)}`},
	{`\*`, `lit{*}`},
	{`\+`, `lit{+}`},
	{`\?`, `lit{?}`},
	{`{`, `lit{{}`},
	{`}`, `lit{}}`},
	{`\.`, `lit{.}`},
	{`\^`, `lit{^}`},
	{`\$`, `lit{$}`},
	{`\\`, `lit{\}`},
	{`[ace]`, `cc{0x61 0x63 0x65}`},
	{`[abc]`, `cc{0x61-0x63}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[a]`, `lit{a}`},
	{`\-`, `lit{-}`},
	{`-`, `lit{-}`},
	{`\_`, `lit{_}`},
	{`abc`, `str{abc}`},
	{`abc|def`, `alt{str{abc}str{def}}`},
	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},

	// Posix and Perl extensions
	{`[[:lower:]]`, `cc{0x61-0x7a}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`\d`, `cc{0x30-0x39}`},
	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
	// { `\C`, `byte{}` }, // probably never

	// Unicode, negatives, and a double negative.
	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
	{`\p{Any}`, `dot{}`},
	{`\p{^Any}`, `cc{}`},

	// Hex, octal.
	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},

	// More interesting regular expressions.
	{`a{,2}`, `str{a{,2}}`},
	{`\.\^\$\\`, `str{.^$\}`},
	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
	{`a*{`, `cat{star{lit{a}}lit{{}}`},

	// Test precedences
	{`(?:ab)*`, `star{str{ab}}`},
	{`(ab)*`, `star{cap{str{ab}}}`},
	{`ab|cd`, `alt{str{ab}str{cd}}`},
	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},

	// Test flattening.
	{`(?:a)`, `lit{a}`},
	{`(?:ab)(?:cd)`, `str{abcd}`},
	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
	{`a|.`, `dot{}`},
	{`.|a`, `dot{}`},
	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},

	// Test Perl quoted literals
	{`\Q+|*?{[\E`, `str{+|*?{[}`},
	{`\Q+\E+`, `plus{lit{+}}`},
	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
	{`\Q\\E`, `lit{\}`},
	{`\Q\\\E`, `str{\\}`},

	// Test Perl \A and \z
	{`(?m)^`, `bol{}`},
	{`(?m)$`, `eol{}`},
	{`(?-m)^`, `bot{}`},
	{`(?-m)$`, `eot{}`},
	{`(?m)\A`, `bot{}`},
	{`(?m)\z`, `eot{\z}`},
	{`(?-m)\A`, `bot{}`},
	{`(?-m)\z`, `eot{\z}`},

	// Test named captures
	{`(?P<name>a)`, `cap{name:lit{a}}`},

	// Case-folded literals
	{`[Aa]`, `litfold{A}`},
	{`[\x{100}\x{101}]`, `litfold{Ā}`},
	{`[Δδ]`, `litfold{Δ}`},

	// Strings
	{`abcde`, `str{abcde}`},
	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},

	// Factoring.
	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},

	// Bug fixes.
	{`(?:.)`, `dot{}`},
	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
	{`(?:A|a)`, `litfold{A}`},
	{`A|(?:A|a)`, `litfold{A}`},
	{`(?s).`, `dot{}`},
	{`(?-s).`, `dnl{}`},
	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},

	// RE2 prefix_tests
	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`abc|abd|aef|bcx|bcy`,
		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
			`cat{str{bc}cc{0x78-0x79}}}`},
	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
	{`x{2}|x{2}[0-9]`,
		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
	{`x{2}y|x{2}[0-9]y`,
		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
	{`a.*?c|a.*?b`,
		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},

	// Valid repetitions.
	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
}
| 210 | |
// testFlags is the parser flag set that TestParseSimple and
// TestToStringEquivalentParse use for the parseTests table.
const testFlags = MatchNL | PerlX | UnicodeGroups
| 212 | |
// TestParseSimple runs the parseTests table through testParseDump
// using testFlags.
func TestParseSimple(t *testing.T) {
	testParseDump(t, parseTests, testFlags)
}
| 216 | |
// foldcaseTests lists regexps and expected dumps for parsing with the
// FoldCase flag (see TestParseFoldCase).
var foldcaseTests = []parseTest{
	{`AbCdE`, `strfold{ABCDE}`},
	{`[Aa]`, `litfold{A}`},
	{`a`, `litfold{A}`},

	// 0x17F is an old English long s (looks like an f) and folds to s.
	// 0x212A is the Kelvin symbol and folds to k.
	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
}
| 228 | |
// TestParseFoldCase runs the foldcaseTests table through testParseDump
// with the FoldCase flag.
func TestParseFoldCase(t *testing.T) {
	testParseDump(t, foldcaseTests, FoldCase)
}
| 232 | |
// literalTests lists regexps and expected dumps for parsing with the
// Literal flag (see TestParseLiteral).
var literalTests = []parseTest{
	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
}
| 236 | |
// TestParseLiteral runs the literalTests table through testParseDump
// with the Literal flag.
func TestParseLiteral(t *testing.T) {
	testParseDump(t, literalTests, Literal)
}
| 240 | |
// matchnlTests lists regexps and expected dumps for parsing with the
// MatchNL flag (see TestParseMatchNL).
var matchnlTests = []parseTest{
	{`.`, `dot{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}
| 247 | |
// TestParseMatchNL runs the matchnlTests table through testParseDump
// with the MatchNL flag.
func TestParseMatchNL(t *testing.T) {
	testParseDump(t, matchnlTests, MatchNL)
}
| 251 | |
// nomatchnlTests lists regexps and expected dumps for parsing with no
// flags set (see TestParseNoMatchNL).
var nomatchnlTests = []parseTest{
	{`.`, `dnl{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}
| 258 | |
// TestParseNoMatchNL runs the nomatchnlTests table through testParseDump
// with a zero flag value.
func TestParseNoMatchNL(t *testing.T) {
	testParseDump(t, nomatchnlTests, 0)
}
| 262 | |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 263 | // Test Parse -> Dump. |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 264 | func testParseDump(t *testing.T, tests []parseTest, flags Flags) { |
| 265 | for _, tt := range tests { |
| 266 | re, err := Parse(tt.Regexp, flags) |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 267 | if err != nil { |
| 268 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) |
| 269 | continue |
| 270 | } |
Russ Cox | 9b2b0c8 | 2014-09-30 12:08:09 -0400 | [diff] [blame] | 271 | if tt.Dump == "" { |
| 272 | // It parsed. That's all we care about. |
| 273 | continue |
| 274 | } |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 275 | d := dump(re) |
| 276 | if d != tt.Dump { |
| 277 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) |
| 278 | } |
| 279 | } |
| 280 | } |
| 281 | |
| 282 | // dump prints a string representation of the regexp showing |
| 283 | // the structure explicitly. |
| 284 | func dump(re *Regexp) string { |
| 285 | var b bytes.Buffer |
| 286 | dumpRegexp(&b, re) |
| 287 | return b.String() |
| 288 | } |
| 289 | |
// opNames maps each Op to the short name dumpRegexp prints for it.
// An Op with no entry (or an empty entry) is printed by dumpRegexp
// as "op" followed by its numeric value.
var opNames = []string{
	OpNoMatch:        "no",
	OpEmptyMatch:     "emp",
	OpLiteral:        "lit",
	OpCharClass:      "cc",
	OpAnyCharNotNL:   "dnl",
	OpAnyChar:        "dot",
	OpBeginLine:      "bol",
	OpEndLine:        "eol",
	OpBeginText:      "bot",
	OpEndText:        "eot",
	OpWordBoundary:   "wb",
	OpNoWordBoundary: "nwb",
	OpCapture:        "cap",
	OpStar:           "star",
	OpPlus:           "plus",
	OpQuest:          "que",
	OpRepeat:         "rep",
	OpConcat:         "cat",
	OpAlternate:      "alt",
}
| 311 | |
| 312 | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. |
| 313 | // It is used during testing to distinguish between parses that might print |
| 314 | // the same using re's String method. |
| 315 | func dumpRegexp(b *bytes.Buffer, re *Regexp) { |
| 316 | if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { |
| 317 | fmt.Fprintf(b, "op%d", re.Op) |
| 318 | } else { |
| 319 | switch re.Op { |
| 320 | default: |
| 321 | b.WriteString(opNames[re.Op]) |
| 322 | case OpStar, OpPlus, OpQuest, OpRepeat: |
| 323 | if re.Flags&NonGreedy != 0 { |
| 324 | b.WriteByte('n') |
| 325 | } |
| 326 | b.WriteString(opNames[re.Op]) |
| 327 | case OpLiteral: |
| 328 | if len(re.Rune) > 1 { |
| 329 | b.WriteString("str") |
| 330 | } else { |
| 331 | b.WriteString("lit") |
| 332 | } |
| 333 | if re.Flags&FoldCase != 0 { |
| 334 | for _, r := range re.Rune { |
Russ Cox | 7e1a3e9 | 2011-06-29 00:55:37 -0400 | [diff] [blame] | 335 | if unicode.SimpleFold(r) != r { |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 336 | b.WriteString("fold") |
Russ Cox | 7e1a3e9 | 2011-06-29 00:55:37 -0400 | [diff] [blame] | 337 | break |
Russ Cox | b96c347 | 2011-06-13 09:20:23 -0400 | [diff] [blame] | 338 | } |
| 339 | } |
| 340 | } |
| 341 | } |
| 342 | } |
| 343 | b.WriteByte('{') |
| 344 | switch re.Op { |
| 345 | case OpEndText: |
| 346 | if re.Flags&WasDollar == 0 { |
| 347 | b.WriteString(`\z`) |
| 348 | } |
| 349 | case OpLiteral: |
| 350 | for _, r := range re.Rune { |
| 351 | b.WriteRune(r) |
| 352 | } |
| 353 | case OpConcat, OpAlternate: |
| 354 | for _, sub := range re.Sub { |
| 355 | dumpRegexp(b, sub) |
| 356 | } |
| 357 | case OpStar, OpPlus, OpQuest: |
| 358 | dumpRegexp(b, re.Sub[0]) |
| 359 | case OpRepeat: |
| 360 | fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) |
| 361 | dumpRegexp(b, re.Sub[0]) |
| 362 | case OpCapture: |
| 363 | if re.Name != "" { |
| 364 | b.WriteString(re.Name) |
| 365 | b.WriteByte(':') |
| 366 | } |
| 367 | dumpRegexp(b, re.Sub[0]) |
| 368 | case OpCharClass: |
| 369 | sep := "" |
| 370 | for i := 0; i < len(re.Rune); i += 2 { |
| 371 | b.WriteString(sep) |
| 372 | sep = " " |
| 373 | lo, hi := re.Rune[i], re.Rune[i+1] |
| 374 | if lo == hi { |
| 375 | fmt.Fprintf(b, "%#x", lo) |
| 376 | } else { |
| 377 | fmt.Fprintf(b, "%#x-%#x", lo, hi) |
| 378 | } |
| 379 | } |
| 380 | } |
| 381 | b.WriteByte('}') |
| 382 | } |
Russ Cox | 52cd055 | 2011-06-27 23:23:51 -0400 | [diff] [blame] | 383 | |
Russ Cox | 3e52dad | 2011-10-25 22:20:57 -0700 | [diff] [blame] | 384 | func mkCharClass(f func(rune) bool) string { |
Russ Cox | 52cd055 | 2011-06-27 23:23:51 -0400 | [diff] [blame] | 385 | re := &Regexp{Op: OpCharClass} |
Russ Cox | 3e52dad | 2011-10-25 22:20:57 -0700 | [diff] [blame] | 386 | lo := rune(-1) |
| 387 | for i := rune(0); i <= unicode.MaxRune; i++ { |
Russ Cox | 52cd055 | 2011-06-27 23:23:51 -0400 | [diff] [blame] | 388 | if f(i) { |
| 389 | if lo < 0 { |
| 390 | lo = i |
| 391 | } |
| 392 | } else { |
| 393 | if lo >= 0 { |
| 394 | re.Rune = append(re.Rune, lo, i-1) |
| 395 | lo = -1 |
| 396 | } |
| 397 | } |
| 398 | } |
| 399 | if lo >= 0 { |
| 400 | re.Rune = append(re.Rune, lo, unicode.MaxRune) |
| 401 | } |
| 402 | return dump(re) |
| 403 | } |
| 404 | |
// isUpperFold reports whether r, or any rune reachable from r through
// repeated unicode.SimpleFold, is an upper case letter.
func isUpperFold(r rune) bool {
	c := r
	for {
		if unicode.IsUpper(c) {
			return true
		}
		// Step to the next rune in r's folding orbit; stop once the
		// orbit wraps back around to r.
		if c = unicode.SimpleFold(c); c == r {
			return false
		}
	}
}
| 418 | |
| 419 | func TestFoldConstants(t *testing.T) { |
Russ Cox | 3e52dad | 2011-10-25 22:20:57 -0700 | [diff] [blame] | 420 | last := rune(-1) |
| 421 | for i := rune(0); i <= unicode.MaxRune; i++ { |
Russ Cox | 52cd055 | 2011-06-27 23:23:51 -0400 | [diff] [blame] | 422 | if unicode.SimpleFold(i) == i { |
| 423 | continue |
| 424 | } |
| 425 | if last == -1 && minFold != i { |
| 426 | t.Errorf("minFold=%#U should be %#U", minFold, i) |
| 427 | } |
| 428 | last = i |
| 429 | } |
| 430 | if maxFold != last { |
| 431 | t.Errorf("maxFold=%#U should be %#U", maxFold, last) |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | func TestAppendRangeCollapse(t *testing.T) { |
| 436 | // AppendRange should collapse each of the new ranges |
| 437 | // into the earlier ones (it looks back two ranges), so that |
| 438 | // the slice never grows very large. |
| 439 | // Note that we are not calling cleanClass. |
Russ Cox | 3e52dad | 2011-10-25 22:20:57 -0700 | [diff] [blame] | 440 | var r []rune |
| 441 | for i := rune('A'); i <= 'Z'; i++ { |
Russ Cox | 52cd055 | 2011-06-27 23:23:51 -0400 | [diff] [blame] | 442 | r = appendRange(r, i, i) |
| 443 | r = appendRange(r, i+'a'-'A', i+'a'-'A') |
| 444 | } |
| 445 | if string(r) != "AZaz" { |
| 446 | t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) |
| 447 | } |
| 448 | } |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 449 | |
// invalidRegexps lists expressions that TestParseInvalidRegexps requires
// to be rejected by Parse under both the Perl and the POSIX flags.
var invalidRegexps = []string{
	`(`,
	`)`,
	`(a`,
	`a)`,
	`(a))`,
	`(a|b|`,
	`a|b|)`,
	`(a|b|))`,
	`(a|b`,
	`a|b)`,
	`(a|b))`,
	`[a-z`,
	`([a-z)`,
	`[a-z)`,
	`([a-z]))`,
	`x{1001}`,
	`x{9876543210}`,
	`x{2,1}`,
	`x{1,9876543210}`,
	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",
	`(?P<name>a`,
	`(?P<name>`,
	`(?P<name`,
	`(?P<x y>a)`,
	`(?P<>a)`,
	`[a-Z]`,
	`(?i)[a-Z]`,
	`a{100000}`,
	`a{100000,}`,
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	`\Q\E*`,
}
| 486 | |
// onlyPerl lists expressions that TestParseInvalidRegexps requires to
// parse under the Perl flags and to be rejected under the POSIX flags.
var onlyPerl = []string{
	`[a-b-c]`,
	`\Qabc\E`,
	`\Q*+?{[\E`,
	`\Q\\E`,
	`\Q\\\E`,
	`\Q\\\\E`,
	`\Q\\\\\E`,
	`(?:a)`,
	`(?P<name>a)`,
}
| 498 | |
// onlyPOSIX lists expressions that TestParseInvalidRegexps requires to
// parse under the POSIX flags and to be rejected under the Perl flags.
var onlyPOSIX = []string{
	"a++",
	"a**",
	"a?*",
	"a+*",
	"a{1}*",
	".{1}{2}.{3}",
}
| 507 | |
| 508 | func TestParseInvalidRegexps(t *testing.T) { |
| 509 | for _, regexp := range invalidRegexps { |
| 510 | if re, err := Parse(regexp, Perl); err == nil { |
| 511 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) |
| 512 | } |
| 513 | if re, err := Parse(regexp, POSIX); err == nil { |
| 514 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) |
| 515 | } |
| 516 | } |
| 517 | for _, regexp := range onlyPerl { |
| 518 | if _, err := Parse(regexp, Perl); err != nil { |
| 519 | t.Errorf("Parse(%#q, Perl): %v", regexp, err) |
| 520 | } |
| 521 | if re, err := Parse(regexp, POSIX); err == nil { |
| 522 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) |
| 523 | } |
| 524 | } |
| 525 | for _, regexp := range onlyPOSIX { |
| 526 | if re, err := Parse(regexp, Perl); err == nil { |
| 527 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) |
| 528 | } |
| 529 | if _, err := Parse(regexp, POSIX); err != nil { |
| 530 | t.Errorf("Parse(%#q, POSIX): %v", regexp, err) |
| 531 | } |
| 532 | } |
| 533 | } |
| 534 | |
| 535 | func TestToStringEquivalentParse(t *testing.T) { |
| 536 | for _, tt := range parseTests { |
| 537 | re, err := Parse(tt.Regexp, testFlags) |
| 538 | if err != nil { |
| 539 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) |
| 540 | continue |
| 541 | } |
Russ Cox | 9b2b0c8 | 2014-09-30 12:08:09 -0400 | [diff] [blame] | 542 | if tt.Dump == "" { |
| 543 | // It parsed. That's all we care about. |
| 544 | continue |
| 545 | } |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 546 | d := dump(re) |
| 547 | if d != tt.Dump { |
| 548 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) |
| 549 | continue |
| 550 | } |
| 551 | |
| 552 | s := re.String() |
| 553 | if s != tt.Regexp { |
| 554 | // If ToString didn't return the original regexp, |
| 555 | // it must have found one with fewer parens. |
| 556 | // Unfortunately we can't check the length here, because |
| 557 | // ToString produces "\\{" for a literal brace, |
| 558 | // but "{" is a shorter equivalent in some contexts. |
| 559 | nre, err := Parse(s, testFlags) |
| 560 | if err != nil { |
Rob Pike | fa7791e | 2013-09-27 10:09:15 +1000 | [diff] [blame] | 561 | t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 562 | continue |
| 563 | } |
| 564 | nd := dump(nre) |
| 565 | if d != nd { |
| 566 | t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) |
| 567 | } |
| 568 | |
| 569 | ns := nre.String() |
| 570 | if s != ns { |
| 571 | t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) |
| 572 | } |
| 573 | } |
| 574 | } |
| 575 | } |