Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 1 | // Copyright 2010 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package regexp |
| 6 | |
| 7 | import ( |
| 8 | "bufio" |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 9 | "compress/bzip2" |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 10 | "fmt" |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 11 | "io" |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 12 | "os" |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 13 | "path/filepath" |
Russ Cox | 6c230fb | 2011-09-26 18:33:13 -0400 | [diff] [blame] | 14 | "regexp/syntax" |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 15 | "strconv" |
| 16 | "strings" |
| 17 | "testing" |
Rob Pike | 45e3bcb | 2011-11-08 15:41:54 -0800 | [diff] [blame] | 18 | "unicode/utf8" |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 19 | ) |
| 20 | |
| 21 | // TestRE2 tests this package's regexp API against test cases |
| 22 | // considered during RE2's exhaustive tests, which run all possible |
| 23 | // regexps over a given set of atoms and operators, up to a given |
| 24 | // complexity, over all possible strings over a given alphabet, |
| 25 | // up to a given size. Rather than try to link with RE2, we read a |
| 26 | // log file containing the test cases and the expected matches. |
Dmitry Savintsev | 8cb9c21 | 2015-04-27 11:57:21 +0200 | [diff] [blame] | 27 | // The log file, re2-exhaustive.txt, is generated by running 'make log' |
| 28 | // in the open source RE2 distribution https://github.com/google/re2/. |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 29 | // |
| 30 | // The test file format is a sequence of stanzas like: |
| 31 | // |
| 32 | // strings |
| 33 | // "abc" |
| 34 | // "123x" |
| 35 | // regexps |
| 36 | // "[a-z]+" |
| 37 | // 0-3;0-3 |
| 38 | // -;- |
| 39 | // "([0-9])([0-9])([0-9])" |
| 40 | // -;- |
| 41 | // -;0-3 0-1 1-2 2-3 |
| 42 | // |
| 43 | // The stanza begins by defining a set of strings, quoted |
| 44 | // using Go double-quote syntax, one per line. Then the |
| 45 | // regexps section gives a sequence of regexps to run on |
| 46 | // the strings. In the block that follows a regexp, each line |
| 47 | // gives the semicolon-separated match results of running |
| 48 | // the regexp on the corresponding string. |
| 49 | // Each match result is either a single -, meaning no match, or a |
| 50 | // space-separated sequence of pairs giving the match and |
| 51 | // submatch indices. An unmatched subexpression formats |
| 52 | // its pair as a single - (not illustrated above). For now |
| 53 | // each regexp run produces two match results, one for a |
| 54 | // ``full match'' that restricts the regexp to matching the entire |
| 55 | // string or nothing, and one for a ``partial match'' that gives |
| 56 | // the leftmost first match found in the string. |
| 57 | // |
| 58 | // Lines beginning with # are comments. Lines beginning with |
| 59 | // a capital letter are test names printed during RE2's test suite |
| 60 | // and are echoed into t but otherwise ignored. |
| 61 | // |
Dmitry Savintsev | 8cb9c21 | 2015-04-27 11:57:21 +0200 | [diff] [blame] | 62 | // At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB, |
| 63 | // so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly. |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 64 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 65 | func TestRE2Search(t *testing.T) { |
| 66 | testRE2(t, "testdata/re2-search.txt") |
| 67 | } |
| 68 | |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 69 | func testRE2(t *testing.T, file string) { |
| 70 | f, err := os.Open(file) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 71 | if err != nil { |
| 72 | t.Fatal(err) |
| 73 | } |
| 74 | defer f.Close() |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 75 | var txt io.Reader |
| 76 | if strings.HasSuffix(file, ".bz2") { |
| 77 | z := bzip2.NewReader(f) |
| 78 | txt = z |
| 79 | file = file[:len(file)-len(".bz2")] // for error messages |
| 80 | } else { |
| 81 | txt = f |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 82 | } |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 83 | lineno := 0 |
Rob Pike | f913830 | 2013-02-20 13:37:45 -0800 | [diff] [blame] | 84 | scanner := bufio.NewScanner(txt) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 85 | var ( |
| 86 | str []string |
| 87 | input []string |
| 88 | inStrings bool |
| 89 | re *Regexp |
| 90 | refull *Regexp |
| 91 | nfail int |
| 92 | ncase int |
| 93 | ) |
Rob Pike | f913830 | 2013-02-20 13:37:45 -0800 | [diff] [blame] | 94 | for lineno := 1; scanner.Scan(); lineno++ { |
| 95 | line := scanner.Text() |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 96 | switch { |
| 97 | case line == "": |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 98 | t.Fatalf("%s:%d: unexpected blank line", file, lineno) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 99 | case line[0] == '#': |
| 100 | continue |
| 101 | case 'A' <= line[0] && line[0] <= 'Z': |
| 102 | // Test name. |
| 103 | t.Logf("%s\n", line) |
| 104 | continue |
| 105 | case line == "strings": |
| 106 | str = str[:0] |
| 107 | inStrings = true |
| 108 | case line == "regexps": |
| 109 | inStrings = false |
| 110 | case line[0] == '"': |
| 111 | q, err := strconv.Unquote(line) |
| 112 | if err != nil { |
| 113 | // Fatal because we'll get out of sync. |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 114 | t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 115 | } |
| 116 | if inStrings { |
| 117 | str = append(str, q) |
| 118 | continue |
| 119 | } |
| 120 | // Is a regexp. |
| 121 | if len(input) != 0 { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 122 | t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 123 | } |
| 124 | re, err = tryCompile(q) |
| 125 | if err != nil { |
Russ Cox | eb69292 | 2011-11-01 22:05:34 -0400 | [diff] [blame] | 126 | if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" { |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 127 | // We don't and likely never will support \C; keep going. |
| 128 | continue |
| 129 | } |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 130 | t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 131 | if nfail++; nfail >= 100 { |
| 132 | t.Fatalf("stopping after %d errors", nfail) |
| 133 | } |
| 134 | continue |
| 135 | } |
| 136 | full := `\A(?:` + q + `)\z` |
| 137 | refull, err = tryCompile(full) |
| 138 | if err != nil { |
| 139 | // Fatal because q worked, so this should always work. |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 140 | t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 141 | } |
| 142 | input = str |
| 143 | case line[0] == '-' || '0' <= line[0] && line[0] <= '9': |
| 144 | // A sequence of match results. |
| 145 | ncase++ |
| 146 | if re == nil { |
| 147 | // Failed to compile: skip results. |
| 148 | continue |
| 149 | } |
| 150 | if len(input) == 0 { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 151 | t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 152 | } |
| 153 | var text string |
| 154 | text, input = input[0], input[1:] |
| 155 | if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { |
| 156 | // RE2's \B considers every byte position, |
| 157 | // so it sees 'not word boundary' in the |
| 158 | // middle of UTF-8 sequences. This package |
| 159 | // only considers the positions between runes, |
| 160 | // so it disagrees. Skip those cases. |
| 161 | continue |
| 162 | } |
| 163 | res := strings.Split(line, ";") |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 164 | if len(res) != len(run) { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 165 | t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 166 | } |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 167 | for i := range res { |
| 168 | have, suffix := run[i](re, refull, text) |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 169 | want := parseResult(t, file, lineno, res[i]) |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 170 | if !same(have, want) { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 171 | t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want) |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 172 | if nfail++; nfail >= 100 { |
| 173 | t.Fatalf("stopping after %d errors", nfail) |
| 174 | } |
| 175 | continue |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 176 | } |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 177 | b, suffix := match[i](re, refull, text) |
| 178 | if b != (want != nil) { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 179 | t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b) |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 180 | if nfail++; nfail >= 100 { |
| 181 | t.Fatalf("stopping after %d errors", nfail) |
| 182 | } |
| 183 | continue |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 184 | } |
| 185 | } |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 186 | |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 187 | default: |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 188 | t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 189 | } |
| 190 | } |
Rob Pike | f913830 | 2013-02-20 13:37:45 -0800 | [diff] [blame] | 191 | if err := scanner.Err(); err != nil { |
| 192 | t.Fatalf("%s:%d: %v", file, lineno, err) |
| 193 | } |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 194 | if len(input) != 0 { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 195 | t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 196 | } |
| 197 | t.Logf("%d cases tested", ncase) |
| 198 | } |
| 199 | |
Russ Cox | 7df4322 | 2011-09-08 10:09:25 -0400 | [diff] [blame] | 200 | var run = []func(*Regexp, *Regexp, string) ([]int, string){ |
| 201 | runFull, |
| 202 | runPartial, |
| 203 | runFullLongest, |
| 204 | runPartialLongest, |
| 205 | } |
| 206 | |
| 207 | func runFull(re, refull *Regexp, text string) ([]int, string) { |
| 208 | refull.longest = false |
| 209 | return refull.FindStringSubmatchIndex(text), "[full]" |
| 210 | } |
| 211 | |
| 212 | func runPartial(re, refull *Regexp, text string) ([]int, string) { |
| 213 | re.longest = false |
| 214 | return re.FindStringSubmatchIndex(text), "" |
| 215 | } |
| 216 | |
| 217 | func runFullLongest(re, refull *Regexp, text string) ([]int, string) { |
| 218 | refull.longest = true |
| 219 | return refull.FindStringSubmatchIndex(text), "[full,longest]" |
| 220 | } |
| 221 | |
| 222 | func runPartialLongest(re, refull *Regexp, text string) ([]int, string) { |
| 223 | re.longest = true |
| 224 | return re.FindStringSubmatchIndex(text), "[longest]" |
| 225 | } |
| 226 | |
| 227 | var match = []func(*Regexp, *Regexp, string) (bool, string){ |
| 228 | matchFull, |
| 229 | matchPartial, |
| 230 | matchFullLongest, |
| 231 | matchPartialLongest, |
| 232 | } |
| 233 | |
| 234 | func matchFull(re, refull *Regexp, text string) (bool, string) { |
| 235 | refull.longest = false |
| 236 | return refull.MatchString(text), "[full]" |
| 237 | } |
| 238 | |
| 239 | func matchPartial(re, refull *Regexp, text string) (bool, string) { |
| 240 | re.longest = false |
| 241 | return re.MatchString(text), "" |
| 242 | } |
| 243 | |
| 244 | func matchFullLongest(re, refull *Regexp, text string) (bool, string) { |
| 245 | refull.longest = true |
| 246 | return refull.MatchString(text), "[full,longest]" |
| 247 | } |
| 248 | |
| 249 | func matchPartialLongest(re, refull *Regexp, text string) (bool, string) { |
| 250 | re.longest = true |
| 251 | return re.MatchString(text), "[longest]" |
| 252 | } |
| 253 | |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 254 | func isSingleBytes(s string) bool { |
| 255 | for _, c := range s { |
| 256 | if c >= utf8.RuneSelf { |
| 257 | return false |
| 258 | } |
| 259 | } |
| 260 | return true |
| 261 | } |
| 262 | |
Russ Cox | eb69292 | 2011-11-01 22:05:34 -0400 | [diff] [blame] | 263 | func tryCompile(s string) (re *Regexp, err error) { |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 264 | // Protect against panic during Compile. |
| 265 | defer func() { |
| 266 | if r := recover(); r != nil { |
| 267 | err = fmt.Errorf("panic: %v", r) |
| 268 | } |
| 269 | }() |
| 270 | return Compile(s) |
| 271 | } |
| 272 | |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 273 | func parseResult(t *testing.T, file string, lineno int, res string) []int { |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 274 | // A single - indicates no match. |
| 275 | if res == "-" { |
| 276 | return nil |
| 277 | } |
| 278 | // Otherwise, a space-separated list of pairs. |
| 279 | n := 1 |
| 280 | for j := 0; j < len(res); j++ { |
| 281 | if res[j] == ' ' { |
| 282 | n++ |
| 283 | } |
| 284 | } |
| 285 | out := make([]int, 2*n) |
| 286 | i := 0 |
| 287 | n = 0 |
| 288 | for j := 0; j <= len(res); j++ { |
| 289 | if j == len(res) || res[j] == ' ' { |
| 290 | // Process a single pair. - means no submatch. |
| 291 | pair := res[i:j] |
| 292 | if pair == "-" { |
| 293 | out[n] = -1 |
| 294 | out[n+1] = -1 |
| 295 | } else { |
Brad Fitzpatrick | d8e27db | 2013-08-05 16:27:24 -0700 | [diff] [blame] | 296 | k := strings.Index(pair, "-") |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 297 | if k < 0 { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 298 | t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 299 | } |
| 300 | lo, err1 := strconv.Atoi(pair[:k]) |
| 301 | hi, err2 := strconv.Atoi(pair[k+1:]) |
| 302 | if err1 != nil || err2 != nil || lo > hi { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 303 | t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) |
Russ Cox | 08ae1a5 | 2011-09-07 15:48:06 -0400 | [diff] [blame] | 304 | } |
| 305 | out[n] = lo |
| 306 | out[n+1] = hi |
| 307 | } |
| 308 | n += 2 |
| 309 | i = j + 1 |
| 310 | } |
| 311 | } |
| 312 | return out |
| 313 | } |
| 314 | |
| 315 | func same(x, y []int) bool { |
| 316 | if len(x) != len(y) { |
| 317 | return false |
| 318 | } |
| 319 | for i, xi := range x { |
| 320 | if xi != y[i] { |
| 321 | return false |
| 322 | } |
| 323 | } |
| 324 | return true |
| 325 | } |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 326 | |
| 327 | // TestFowler runs this package's regexp API against the |
| 328 | // POSIX regular expression tests collected by Glenn Fowler |
Shenghou Ma | e24e299 | 2015-01-25 19:08:59 -0500 | [diff] [blame] | 329 | // at http://www2.research.att.com/~astopen/testregex/testregex.html. |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 330 | func TestFowler(t *testing.T) { |
| 331 | files, err := filepath.Glob("testdata/*.dat") |
| 332 | if err != nil { |
| 333 | t.Fatal(err) |
| 334 | } |
| 335 | for _, file := range files { |
| 336 | t.Log(file) |
| 337 | testFowler(t, file) |
| 338 | } |
| 339 | } |
| 340 | |
Russ Cox | 66b3fab | 2011-09-08 15:00:49 -0400 | [diff] [blame] | 341 | var notab = MustCompilePOSIX(`[^\t]+`) |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 342 | |
| 343 | func testFowler(t *testing.T, file string) { |
| 344 | f, err := os.Open(file) |
| 345 | if err != nil { |
| 346 | t.Error(err) |
| 347 | return |
| 348 | } |
| 349 | defer f.Close() |
| 350 | b := bufio.NewReader(f) |
| 351 | lineno := 0 |
| 352 | lastRegexp := "" |
| 353 | Reading: |
| 354 | for { |
| 355 | lineno++ |
| 356 | line, err := b.ReadString('\n') |
| 357 | if err != nil { |
Russ Cox | eb69292 | 2011-11-01 22:05:34 -0400 | [diff] [blame] | 358 | if err != io.EOF { |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 359 | t.Errorf("%s:%d: %v", file, lineno, err) |
| 360 | } |
| 361 | break Reading |
| 362 | } |
| 363 | |
Shenghou Ma | e24e299 | 2015-01-25 19:08:59 -0500 | [diff] [blame] | 364 | // http://www2.research.att.com/~astopen/man/man1/testregex.html |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 365 | // |
| 366 | // INPUT FORMAT |
| 367 | // Input lines may be blank, a comment beginning with #, or a test |
| 368 | // specification. A specification is five fields separated by one |
| 369 | // or more tabs. NULL denotes the empty string and NIL denotes the |
| 370 | // 0 pointer. |
| 371 | if line[0] == '#' || line[0] == '\n' { |
| 372 | continue Reading |
| 373 | } |
| 374 | line = line[:len(line)-1] |
| 375 | field := notab.FindAllString(line, -1) |
| 376 | for i, f := range field { |
| 377 | if f == "NULL" { |
| 378 | field[i] = "" |
| 379 | } |
| 380 | if f == "NIL" { |
| 381 | t.Logf("%s:%d: skip: %s", file, lineno, line) |
| 382 | continue Reading |
| 383 | } |
| 384 | } |
| 385 | if len(field) == 0 { |
| 386 | continue Reading |
| 387 | } |
| 388 | |
| 389 | // Field 1: the regex(3) flags to apply, one character per REG_feature |
| 390 | // flag. The test is skipped if REG_feature is not supported by the |
| 391 | // implementation. If the first character is not [BEASKLP] then the |
| 392 | // specification is a global control line. One or more of [BEASKLP] may be |
| 393 | // specified; the test will be repeated for each mode. |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 394 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 395 | // B basic BRE (grep, ed, sed) |
| 396 | // E REG_EXTENDED ERE (egrep) |
| 397 | // A REG_AUGMENTED ARE (egrep with negation) |
| 398 | // S REG_SHELL SRE (sh glob) |
| 399 | // K REG_SHELL|REG_AUGMENTED KRE (ksh glob) |
| 400 | // L REG_LITERAL LRE (fgrep) |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 401 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 402 | // a REG_LEFT|REG_RIGHT implicit ^...$ |
| 403 | // b REG_NOTBOL lhs does not match ^ |
| 404 | // c REG_COMMENT ignore space and #...\n |
| 405 | // d REG_SHELL_DOT explicit leading . match |
| 406 | // e REG_NOTEOL rhs does not match $ |
| 407 | // f REG_MULTIPLE multiple \n separated patterns |
| 408 | // g FNM_LEADING_DIR testfnmatch only -- match until / |
| 409 | // h REG_MULTIREF multiple digit backref |
| 410 | // i REG_ICASE ignore case |
| 411 | // j REG_SPAN . matches \n |
| 412 | // k REG_ESCAPE \ to ecape [...] delimiter |
| 413 | // l REG_LEFT implicit ^... |
| 414 | // m REG_MINIMAL minimal match |
| 415 | // n REG_NEWLINE explicit \n match |
| 416 | // o REG_ENCLOSED (|&) magic inside [@|&](...) |
| 417 | // p REG_SHELL_PATH explicit / match |
| 418 | // q REG_DELIMITED delimited pattern |
| 419 | // r REG_RIGHT implicit ...$ |
| 420 | // s REG_SHELL_ESCAPED \ not special |
| 421 | // t REG_MUSTDELIM all delimiters must be specified |
| 422 | // u standard unspecified behavior -- errors not counted |
| 423 | // v REG_CLASS_ESCAPE \ special inside [...] |
| 424 | // w REG_NOSUB no subexpression match array |
| 425 | // x REG_LENIENT let some errors slide |
| 426 | // y REG_LEFT regexec() implicit ^... |
| 427 | // z REG_NULL NULL subexpressions ok |
| 428 | // $ expand C \c escapes in fields 2 and 3 |
| 429 | // / field 2 is a regsubcomp() expression |
| 430 | // = field 3 is a regdecomp() expression |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 431 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 432 | // Field 1 control lines: |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 433 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 434 | // C set LC_COLLATE and LC_CTYPE to locale in field 2 |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 435 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 436 | // ?test ... output field 5 if passed and != EXPECTED, silent otherwise |
| 437 | // &test ... output field 5 if current and previous passed |
| 438 | // |test ... output field 5 if current passed and previous failed |
| 439 | // ; ... output field 2 if previous failed |
| 440 | // {test ... skip if failed until } |
| 441 | // } end of skip |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 442 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 443 | // : comment comment copied as output NOTE |
| 444 | // :comment:test :comment: ignored |
| 445 | // N[OTE] comment comment copied as output NOTE |
| 446 | // T[EST] comment comment |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 447 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 448 | // number use number for nmatch (20 by default) |
| 449 | flag := field[0] |
| 450 | switch flag[0] { |
| 451 | case '?', '&', '|', ';', '{', '}': |
| 452 | // Ignore all the control operators. |
| 453 | // Just run everything. |
| 454 | flag = flag[1:] |
| 455 | if flag == "" { |
| 456 | continue Reading |
| 457 | } |
| 458 | case ':': |
Brad Fitzpatrick | d8e27db | 2013-08-05 16:27:24 -0700 | [diff] [blame] | 459 | i := strings.Index(flag[1:], ":") |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 460 | if i < 0 { |
| 461 | t.Logf("skip: %s", line) |
| 462 | continue Reading |
| 463 | } |
| 464 | flag = flag[1+i+1:] |
| 465 | case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': |
| 466 | t.Logf("skip: %s", line) |
| 467 | continue Reading |
| 468 | } |
| 469 | |
| 470 | // Can check field count now that we've handled the myriad comment formats. |
| 471 | if len(field) < 4 { |
| 472 | t.Errorf("%s:%d: too few fields: %s", file, lineno, line) |
| 473 | continue Reading |
| 474 | } |
| 475 | |
| 476 | // Expand C escapes (a.k.a. Go escapes). |
| 477 | if strings.Contains(flag, "$") { |
| 478 | f := `"` + field[1] + `"` |
| 479 | if field[1], err = strconv.Unquote(f); err != nil { |
| 480 | t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) |
| 481 | } |
| 482 | f = `"` + field[2] + `"` |
| 483 | if field[2], err = strconv.Unquote(f); err != nil { |
| 484 | t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) |
| 485 | } |
| 486 | } |
| 487 | |
| 488 | // Field 2: the regular expression pattern; SAME uses the pattern from |
| 489 | // the previous specification. |
Robert Griesemer | 465b9c3 | 2012-10-30 13:38:01 -0700 | [diff] [blame] | 490 | // |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 491 | if field[1] == "SAME" { |
| 492 | field[1] = lastRegexp |
| 493 | } |
| 494 | lastRegexp = field[1] |
| 495 | |
| 496 | // Field 3: the string to match. |
| 497 | text := field[2] |
| 498 | |
| 499 | // Field 4: the test outcome... |
| 500 | ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3]) |
| 501 | if !ok { |
| 502 | t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3]) |
| 503 | continue Reading |
| 504 | } |
| 505 | |
| 506 | // Field 5: optional comment appended to the report. |
| 507 | |
| 508 | Testing: |
| 509 | // Run test once for each specified capital letter mode that we support. |
| 510 | for _, c := range flag { |
| 511 | pattern := field[1] |
| 512 | syn := syntax.POSIX | syntax.ClassNL |
| 513 | switch c { |
| 514 | default: |
| 515 | continue Testing |
| 516 | case 'E': |
| 517 | // extended regexp (what we support) |
| 518 | case 'L': |
| 519 | // literal |
| 520 | pattern = QuoteMeta(pattern) |
| 521 | } |
| 522 | |
| 523 | for _, c := range flag { |
| 524 | switch c { |
| 525 | case 'i': |
| 526 | syn |= syntax.FoldCase |
| 527 | } |
| 528 | } |
| 529 | |
| 530 | re, err := compile(pattern, syn, true) |
| 531 | if err != nil { |
| 532 | if shouldCompile { |
| 533 | t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern) |
| 534 | } |
| 535 | continue Testing |
| 536 | } |
| 537 | if !shouldCompile { |
| 538 | t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern) |
| 539 | continue Testing |
| 540 | } |
| 541 | match := re.MatchString(text) |
| 542 | if match != shouldMatch { |
| 543 | t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch) |
| 544 | continue Testing |
| 545 | } |
| 546 | have := re.FindStringSubmatchIndex(text) |
| 547 | if (len(have) > 0) != match { |
Rob Pike | b47bbec | 2011-09-14 13:29:31 -0700 | [diff] [blame] | 548 | t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have) |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 549 | continue Testing |
| 550 | } |
| 551 | if len(have) > len(pos) { |
| 552 | have = have[:len(pos)] |
| 553 | } |
| 554 | if !same(have, pos) { |
| 555 | t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos) |
| 556 | } |
| 557 | } |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) { |
| 562 | // Field 4: the test outcome. This is either one of the posix error |
| 563 | // codes (with REG_ omitted) or the match array, a list of (m,n) |
| 564 | // entries with m and n being first and last+1 positions in the |
| 565 | // field 3 string, or NULL if REG_NOSUB is in effect and success |
| 566 | // is expected. BADPAT is acceptable in place of any regcomp(3) |
| 567 | // error code. The match[] array is initialized to (-2,-2) before |
| 568 | // each test. All array elements from 0 to nmatch-1 must be specified |
| 569 | // in the outcome. Unspecified endpoints (offset -1) are denoted by ?. |
| 570 | // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a |
| 571 | // matched (?{...}) expression, where x is the text enclosed by {...}, |
| 572 | // o is the expression ordinal counting from 1, and n is the length of |
| 573 | // the unmatched portion of the subject string. If x starts with a |
| 574 | // number then that is the return value of re_execf(), otherwise 0 is |
| 575 | // returned. |
| 576 | switch { |
| 577 | case s == "": |
| 578 | // Match with no position information. |
| 579 | ok = true |
| 580 | compiled = true |
| 581 | matched = true |
| 582 | return |
| 583 | case s == "NOMATCH": |
| 584 | // Match failure. |
| 585 | ok = true |
| 586 | compiled = true |
| 587 | matched = false |
| 588 | return |
| 589 | case 'A' <= s[0] && s[0] <= 'Z': |
| 590 | // All the other error codes are compile errors. |
| 591 | ok = true |
| 592 | compiled = false |
| 593 | return |
| 594 | } |
| 595 | compiled = true |
| 596 | |
| 597 | var x []int |
| 598 | for s != "" { |
| 599 | var end byte = ')' |
| 600 | if len(x)%2 == 0 { |
| 601 | if s[0] != '(' { |
| 602 | ok = false |
| 603 | return |
| 604 | } |
| 605 | s = s[1:] |
| 606 | end = ',' |
| 607 | } |
| 608 | i := 0 |
| 609 | for i < len(s) && s[i] != end { |
| 610 | i++ |
| 611 | } |
| 612 | if i == 0 || i == len(s) { |
| 613 | ok = false |
| 614 | return |
| 615 | } |
| 616 | var v = -1 |
Russ Cox | eb69292 | 2011-11-01 22:05:34 -0400 | [diff] [blame] | 617 | var err error |
Russ Cox | 21e671d | 2011-09-08 14:49:51 -0400 | [diff] [blame] | 618 | if s[:i] != "?" { |
| 619 | v, err = strconv.Atoi(s[:i]) |
| 620 | if err != nil { |
| 621 | ok = false |
| 622 | return |
| 623 | } |
| 624 | } |
| 625 | x = append(x, v) |
| 626 | s = s[i+1:] |
| 627 | } |
| 628 | if len(x)%2 != 0 { |
| 629 | ok = false |
| 630 | return |
| 631 | } |
| 632 | ok = true |
| 633 | matched = true |
| 634 | pos = x |
| 635 | return |
| 636 | } |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 637 | |
| 638 | var text []byte |
| 639 | |
| 640 | func makeText(n int) []byte { |
| 641 | if len(text) >= n { |
| 642 | return text[:n] |
| 643 | } |
| 644 | text = make([]byte, n) |
Rémy Oudompheng | 21b9d14 | 2013-07-20 23:31:51 +0200 | [diff] [blame] | 645 | x := ^uint32(0) |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 646 | for i := range text { |
Rémy Oudompheng | 21b9d14 | 2013-07-20 23:31:51 +0200 | [diff] [blame] | 647 | x += x |
| 648 | x ^= 1 |
| 649 | if int32(x) < 0 { |
| 650 | x ^= 0x88888eef |
| 651 | } |
| 652 | if x%31 == 0 { |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 653 | text[i] = '\n' |
| 654 | } else { |
Rémy Oudompheng | 21b9d14 | 2013-07-20 23:31:51 +0200 | [diff] [blame] | 655 | text[i] = byte(x%(0x7E+1-0x20) + 0x20) |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 656 | } |
| 657 | } |
| 658 | return text |
| 659 | } |
| 660 | |
| 661 | func benchmark(b *testing.B, re string, n int) { |
| 662 | r := MustCompile(re) |
| 663 | t := makeText(n) |
| 664 | b.ResetTimer() |
| 665 | b.SetBytes(int64(n)) |
| 666 | for i := 0; i < b.N; i++ { |
| 667 | if r.Match(t) { |
Rob Pike | 6b77246 | 2011-12-20 10:36:25 -0800 | [diff] [blame] | 668 | b.Fatal("match!") |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 669 | } |
| 670 | } |
| 671 | } |
| 672 | |
Russ Cox | 8f699a3 | 2011-09-28 12:00:31 -0400 | [diff] [blame] | 673 | const ( |
| 674 | easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" |
| 675 | easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" |
| 676 | medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" |
| 677 | hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" |
| 678 | parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" + |
| 679 | "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" |
| 680 | ) |
| 681 | |
Russ Cox | 2f2cc24 | 2011-12-07 15:03:05 -0500 | [diff] [blame] | 682 | func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) } |
| 683 | func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } |
| 684 | func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } |
| 685 | func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } |
| 686 | func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } |
| 687 | func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) } |
| 688 | func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } |
| 689 | func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } |
| 690 | func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) } |
| 691 | func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) } |
Brad Fitzpatrick | c4aa9c5 | 2013-08-29 13:55:30 -0700 | [diff] [blame] | 692 | func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 32<<0) } |
Russ Cox | 2f2cc24 | 2011-12-07 15:03:05 -0500 | [diff] [blame] | 693 | func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) } |
| 694 | func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) } |
| 695 | func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) } |
| 696 | func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) } |
| 697 | func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) } |
| 698 | func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) } |
| 699 | func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) } |
| 700 | func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) } |
| 701 | func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) } |
Andrew Gerrand | f41ffc2 | 2013-02-04 15:28:55 +1100 | [diff] [blame] | 702 | |
| 703 | func TestLongest(t *testing.T) { |
| 704 | re, err := Compile(`a(|b)`) |
| 705 | if err != nil { |
| 706 | t.Fatal(err) |
| 707 | } |
| 708 | if g, w := re.FindString("ab"), "a"; g != w { |
| 709 | t.Errorf("first match was %q, want %q", g, w) |
| 710 | } |
| 711 | re.Longest() |
| 712 | if g, w := re.FindString("ab"), "ab"; g != w { |
| 713 | t.Errorf("longest match was %q, want %q", g, w) |
| 714 | } |
| 715 | } |
Matthew Brennan | a513088 | 2015-04-03 20:09:53 -0400 | [diff] [blame] | 716 | |
Michael Matloob | 71e83b8 | 2015-06-14 09:57:46 -0700 | [diff] [blame] | 717 | // TestProgramTooLongForBacktrack tests that a regex which is too long |
Matthew Brennan | a513088 | 2015-04-03 20:09:53 -0400 | [diff] [blame] | 718 | // for the backtracker still executes properly. |
| 719 | func TestProgramTooLongForBacktrack(t *testing.T) { |
| 720 | longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`) |
| 721 | if !longRegex.MatchString("two") { |
| 722 | t.Errorf("longRegex.MatchString(\"two\") was false, want true") |
| 723 | } |
| 724 | if longRegex.MatchString("xxx") { |
| 725 | t.Errorf("longRegex.MatchString(\"xxx\") was true, want false") |
| 726 | } |
| 727 | } |