blob: 4872cb3def44932859af81b4ba480a07232d224b [file] [log] [blame]
Russ Cox08ae1a52011-09-07 15:48:06 -04001// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package regexp
6
7import (
8 "bufio"
Russ Cox21e671d2011-09-08 14:49:51 -04009 "compress/bzip2"
Russ Cox08ae1a52011-09-07 15:48:06 -040010 "fmt"
Russ Cox21e671d2011-09-08 14:49:51 -040011 "io"
Russ Cox08ae1a52011-09-07 15:48:06 -040012 "os"
Russ Cox21e671d2011-09-08 14:49:51 -040013 "path/filepath"
Russ Cox6c230fb2011-09-26 18:33:13 -040014 "regexp/syntax"
Russ Cox08ae1a52011-09-07 15:48:06 -040015 "strconv"
16 "strings"
17 "testing"
Rob Pike45e3bcb2011-11-08 15:41:54 -080018 "unicode/utf8"
Russ Cox08ae1a52011-09-07 15:48:06 -040019)
20
21// TestRE2 tests this package's regexp API against test cases
22// considered during RE2's exhaustive tests, which run all possible
23// regexps over a given set of atoms and operators, up to a given
24// complexity, over all possible strings over a given alphabet,
25// up to a given size. Rather than try to link with RE2, we read a
26// log file containing the test cases and the expected matches.
Dmitry Savintsev8cb9c212015-04-27 11:57:21 +020027// The log file, re2-exhaustive.txt, is generated by running 'make log'
28// in the open source RE2 distribution https://github.com/google/re2/.
Russ Cox08ae1a52011-09-07 15:48:06 -040029//
30// The test file format is a sequence of stanzas like:
31//
32// strings
33// "abc"
34// "123x"
35// regexps
36// "[a-z]+"
37// 0-3;0-3
38// -;-
39// "([0-9])([0-9])([0-9])"
40// -;-
41// -;0-3 0-1 1-2 2-3
42//
43// The stanza begins by defining a set of strings, quoted
44// using Go double-quote syntax, one per line. Then the
45// regexps section gives a sequence of regexps to run on
46// the strings. In the block that follows a regexp, each line
47// gives the semicolon-separated match results of running
48// the regexp on the corresponding string.
49// Each match result is either a single -, meaning no match, or a
50// space-separated sequence of pairs giving the match and
51// submatch indices. An unmatched subexpression formats
52// its pair as a single - (not illustrated above). For now
53// each regexp run produces two match results, one for a
54// ``full match'' that restricts the regexp to matching the entire
55// string or nothing, and one for a ``partial match'' that gives
56// the leftmost first match found in the string.
57//
58// Lines beginning with # are comments. Lines beginning with
59// a capital letter are test names printed during RE2's test suite
60// and are echoed into t but otherwise ignored.
61//
Dmitry Savintsev8cb9c212015-04-27 11:57:21 +020062// At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB,
63// so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
Russ Cox08ae1a52011-09-07 15:48:06 -040064//
Russ Cox21e671d2011-09-08 14:49:51 -040065func TestRE2Search(t *testing.T) {
66 testRE2(t, "testdata/re2-search.txt")
67}
68
Russ Cox21e671d2011-09-08 14:49:51 -040069func testRE2(t *testing.T, file string) {
70 f, err := os.Open(file)
Russ Cox08ae1a52011-09-07 15:48:06 -040071 if err != nil {
72 t.Fatal(err)
73 }
74 defer f.Close()
Russ Cox21e671d2011-09-08 14:49:51 -040075 var txt io.Reader
76 if strings.HasSuffix(file, ".bz2") {
77 z := bzip2.NewReader(f)
78 txt = z
79 file = file[:len(file)-len(".bz2")] // for error messages
80 } else {
81 txt = f
Russ Cox08ae1a52011-09-07 15:48:06 -040082 }
Russ Cox08ae1a52011-09-07 15:48:06 -040083 lineno := 0
Rob Pikef9138302013-02-20 13:37:45 -080084 scanner := bufio.NewScanner(txt)
Russ Cox08ae1a52011-09-07 15:48:06 -040085 var (
86 str []string
87 input []string
88 inStrings bool
89 re *Regexp
90 refull *Regexp
91 nfail int
92 ncase int
93 )
Rob Pikef9138302013-02-20 13:37:45 -080094 for lineno := 1; scanner.Scan(); lineno++ {
95 line := scanner.Text()
Russ Cox08ae1a52011-09-07 15:48:06 -040096 switch {
97 case line == "":
Russ Cox21e671d2011-09-08 14:49:51 -040098 t.Fatalf("%s:%d: unexpected blank line", file, lineno)
Russ Cox08ae1a52011-09-07 15:48:06 -040099 case line[0] == '#':
100 continue
101 case 'A' <= line[0] && line[0] <= 'Z':
102 // Test name.
103 t.Logf("%s\n", line)
104 continue
105 case line == "strings":
106 str = str[:0]
107 inStrings = true
108 case line == "regexps":
109 inStrings = false
110 case line[0] == '"':
111 q, err := strconv.Unquote(line)
112 if err != nil {
113 // Fatal because we'll get out of sync.
Russ Cox21e671d2011-09-08 14:49:51 -0400114 t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
Russ Cox08ae1a52011-09-07 15:48:06 -0400115 }
116 if inStrings {
117 str = append(str, q)
118 continue
119 }
120 // Is a regexp.
121 if len(input) != 0 {
Russ Cox21e671d2011-09-08 14:49:51 -0400122 t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
Russ Cox08ae1a52011-09-07 15:48:06 -0400123 }
124 re, err = tryCompile(q)
125 if err != nil {
Russ Coxeb692922011-11-01 22:05:34 -0400126 if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
Russ Cox08ae1a52011-09-07 15:48:06 -0400127 // We don't and likely never will support \C; keep going.
128 continue
129 }
Russ Cox21e671d2011-09-08 14:49:51 -0400130 t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
Russ Cox08ae1a52011-09-07 15:48:06 -0400131 if nfail++; nfail >= 100 {
132 t.Fatalf("stopping after %d errors", nfail)
133 }
134 continue
135 }
136 full := `\A(?:` + q + `)\z`
137 refull, err = tryCompile(full)
138 if err != nil {
139 // Fatal because q worked, so this should always work.
Russ Cox21e671d2011-09-08 14:49:51 -0400140 t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
Russ Cox08ae1a52011-09-07 15:48:06 -0400141 }
142 input = str
143 case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
144 // A sequence of match results.
145 ncase++
146 if re == nil {
147 // Failed to compile: skip results.
148 continue
149 }
150 if len(input) == 0 {
Russ Cox21e671d2011-09-08 14:49:51 -0400151 t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
Russ Cox08ae1a52011-09-07 15:48:06 -0400152 }
153 var text string
154 text, input = input[0], input[1:]
155 if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
156 // RE2's \B considers every byte position,
157 // so it sees 'not word boundary' in the
158 // middle of UTF-8 sequences. This package
159 // only considers the positions between runes,
160 // so it disagrees. Skip those cases.
161 continue
162 }
163 res := strings.Split(line, ";")
Russ Cox7df43222011-09-08 10:09:25 -0400164 if len(res) != len(run) {
Russ Cox21e671d2011-09-08 14:49:51 -0400165 t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
Russ Cox08ae1a52011-09-07 15:48:06 -0400166 }
Russ Cox7df43222011-09-08 10:09:25 -0400167 for i := range res {
168 have, suffix := run[i](re, refull, text)
Russ Cox21e671d2011-09-08 14:49:51 -0400169 want := parseResult(t, file, lineno, res[i])
Russ Cox7df43222011-09-08 10:09:25 -0400170 if !same(have, want) {
Russ Cox21e671d2011-09-08 14:49:51 -0400171 t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
Russ Cox7df43222011-09-08 10:09:25 -0400172 if nfail++; nfail >= 100 {
173 t.Fatalf("stopping after %d errors", nfail)
174 }
175 continue
Russ Cox08ae1a52011-09-07 15:48:06 -0400176 }
Russ Cox7df43222011-09-08 10:09:25 -0400177 b, suffix := match[i](re, refull, text)
178 if b != (want != nil) {
Russ Cox21e671d2011-09-08 14:49:51 -0400179 t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
Russ Cox7df43222011-09-08 10:09:25 -0400180 if nfail++; nfail >= 100 {
181 t.Fatalf("stopping after %d errors", nfail)
182 }
183 continue
Russ Cox08ae1a52011-09-07 15:48:06 -0400184 }
185 }
Russ Cox7df43222011-09-08 10:09:25 -0400186
Russ Cox08ae1a52011-09-07 15:48:06 -0400187 default:
Russ Cox21e671d2011-09-08 14:49:51 -0400188 t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
Russ Cox08ae1a52011-09-07 15:48:06 -0400189 }
190 }
Rob Pikef9138302013-02-20 13:37:45 -0800191 if err := scanner.Err(); err != nil {
192 t.Fatalf("%s:%d: %v", file, lineno, err)
193 }
Russ Cox08ae1a52011-09-07 15:48:06 -0400194 if len(input) != 0 {
Russ Cox21e671d2011-09-08 14:49:51 -0400195 t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
Russ Cox08ae1a52011-09-07 15:48:06 -0400196 }
197 t.Logf("%d cases tested", ncase)
198}
199
Russ Cox7df43222011-09-08 10:09:25 -0400200var run = []func(*Regexp, *Regexp, string) ([]int, string){
201 runFull,
202 runPartial,
203 runFullLongest,
204 runPartialLongest,
205}
206
207func runFull(re, refull *Regexp, text string) ([]int, string) {
208 refull.longest = false
209 return refull.FindStringSubmatchIndex(text), "[full]"
210}
211
212func runPartial(re, refull *Regexp, text string) ([]int, string) {
213 re.longest = false
214 return re.FindStringSubmatchIndex(text), ""
215}
216
217func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
218 refull.longest = true
219 return refull.FindStringSubmatchIndex(text), "[full,longest]"
220}
221
222func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
223 re.longest = true
224 return re.FindStringSubmatchIndex(text), "[longest]"
225}
226
227var match = []func(*Regexp, *Regexp, string) (bool, string){
228 matchFull,
229 matchPartial,
230 matchFullLongest,
231 matchPartialLongest,
232}
233
234func matchFull(re, refull *Regexp, text string) (bool, string) {
235 refull.longest = false
236 return refull.MatchString(text), "[full]"
237}
238
239func matchPartial(re, refull *Regexp, text string) (bool, string) {
240 re.longest = false
241 return re.MatchString(text), ""
242}
243
244func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
245 refull.longest = true
246 return refull.MatchString(text), "[full,longest]"
247}
248
249func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
250 re.longest = true
251 return re.MatchString(text), "[longest]"
252}
253
Russ Cox08ae1a52011-09-07 15:48:06 -0400254func isSingleBytes(s string) bool {
255 for _, c := range s {
256 if c >= utf8.RuneSelf {
257 return false
258 }
259 }
260 return true
261}
262
Russ Coxeb692922011-11-01 22:05:34 -0400263func tryCompile(s string) (re *Regexp, err error) {
Russ Cox08ae1a52011-09-07 15:48:06 -0400264 // Protect against panic during Compile.
265 defer func() {
266 if r := recover(); r != nil {
267 err = fmt.Errorf("panic: %v", r)
268 }
269 }()
270 return Compile(s)
271}
272
Russ Cox21e671d2011-09-08 14:49:51 -0400273func parseResult(t *testing.T, file string, lineno int, res string) []int {
Russ Cox08ae1a52011-09-07 15:48:06 -0400274 // A single - indicates no match.
275 if res == "-" {
276 return nil
277 }
278 // Otherwise, a space-separated list of pairs.
279 n := 1
280 for j := 0; j < len(res); j++ {
281 if res[j] == ' ' {
282 n++
283 }
284 }
285 out := make([]int, 2*n)
286 i := 0
287 n = 0
288 for j := 0; j <= len(res); j++ {
289 if j == len(res) || res[j] == ' ' {
290 // Process a single pair. - means no submatch.
291 pair := res[i:j]
292 if pair == "-" {
293 out[n] = -1
294 out[n+1] = -1
295 } else {
Brad Fitzpatrickd8e27db2013-08-05 16:27:24 -0700296 k := strings.Index(pair, "-")
Russ Cox08ae1a52011-09-07 15:48:06 -0400297 if k < 0 {
Russ Cox21e671d2011-09-08 14:49:51 -0400298 t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
Russ Cox08ae1a52011-09-07 15:48:06 -0400299 }
300 lo, err1 := strconv.Atoi(pair[:k])
301 hi, err2 := strconv.Atoi(pair[k+1:])
302 if err1 != nil || err2 != nil || lo > hi {
Russ Cox21e671d2011-09-08 14:49:51 -0400303 t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
Russ Cox08ae1a52011-09-07 15:48:06 -0400304 }
305 out[n] = lo
306 out[n+1] = hi
307 }
308 n += 2
309 i = j + 1
310 }
311 }
312 return out
313}
314
315func same(x, y []int) bool {
316 if len(x) != len(y) {
317 return false
318 }
319 for i, xi := range x {
320 if xi != y[i] {
321 return false
322 }
323 }
324 return true
325}
Russ Cox21e671d2011-09-08 14:49:51 -0400326
327// TestFowler runs this package's regexp API against the
328// POSIX regular expression tests collected by Glenn Fowler
Shenghou Mae24e2992015-01-25 19:08:59 -0500329// at http://www2.research.att.com/~astopen/testregex/testregex.html.
Russ Cox21e671d2011-09-08 14:49:51 -0400330func TestFowler(t *testing.T) {
331 files, err := filepath.Glob("testdata/*.dat")
332 if err != nil {
333 t.Fatal(err)
334 }
335 for _, file := range files {
336 t.Log(file)
337 testFowler(t, file)
338 }
339}
340
Russ Cox66b3fab2011-09-08 15:00:49 -0400341var notab = MustCompilePOSIX(`[^\t]+`)
Russ Cox21e671d2011-09-08 14:49:51 -0400342
343func testFowler(t *testing.T, file string) {
344 f, err := os.Open(file)
345 if err != nil {
346 t.Error(err)
347 return
348 }
349 defer f.Close()
350 b := bufio.NewReader(f)
351 lineno := 0
352 lastRegexp := ""
353Reading:
354 for {
355 lineno++
356 line, err := b.ReadString('\n')
357 if err != nil {
Russ Coxeb692922011-11-01 22:05:34 -0400358 if err != io.EOF {
Russ Cox21e671d2011-09-08 14:49:51 -0400359 t.Errorf("%s:%d: %v", file, lineno, err)
360 }
361 break Reading
362 }
363
Shenghou Mae24e2992015-01-25 19:08:59 -0500364 // http://www2.research.att.com/~astopen/man/man1/testregex.html
Russ Cox21e671d2011-09-08 14:49:51 -0400365 //
366 // INPUT FORMAT
367 // Input lines may be blank, a comment beginning with #, or a test
368 // specification. A specification is five fields separated by one
369 // or more tabs. NULL denotes the empty string and NIL denotes the
370 // 0 pointer.
371 if line[0] == '#' || line[0] == '\n' {
372 continue Reading
373 }
374 line = line[:len(line)-1]
375 field := notab.FindAllString(line, -1)
376 for i, f := range field {
377 if f == "NULL" {
378 field[i] = ""
379 }
380 if f == "NIL" {
381 t.Logf("%s:%d: skip: %s", file, lineno, line)
382 continue Reading
383 }
384 }
385 if len(field) == 0 {
386 continue Reading
387 }
388
389 // Field 1: the regex(3) flags to apply, one character per REG_feature
390 // flag. The test is skipped if REG_feature is not supported by the
391 // implementation. If the first character is not [BEASKLP] then the
392 // specification is a global control line. One or more of [BEASKLP] may be
393 // specified; the test will be repeated for each mode.
Robert Griesemer465b9c32012-10-30 13:38:01 -0700394 //
Russ Cox21e671d2011-09-08 14:49:51 -0400395 // B basic BRE (grep, ed, sed)
396 // E REG_EXTENDED ERE (egrep)
397 // A REG_AUGMENTED ARE (egrep with negation)
398 // S REG_SHELL SRE (sh glob)
399 // K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
400 // L REG_LITERAL LRE (fgrep)
Robert Griesemer465b9c32012-10-30 13:38:01 -0700401 //
Russ Cox21e671d2011-09-08 14:49:51 -0400402 // a REG_LEFT|REG_RIGHT implicit ^...$
403 // b REG_NOTBOL lhs does not match ^
404 // c REG_COMMENT ignore space and #...\n
405 // d REG_SHELL_DOT explicit leading . match
406 // e REG_NOTEOL rhs does not match $
407 // f REG_MULTIPLE multiple \n separated patterns
408 // g FNM_LEADING_DIR testfnmatch only -- match until /
409 // h REG_MULTIREF multiple digit backref
410 // i REG_ICASE ignore case
411 // j REG_SPAN . matches \n
412 // k REG_ESCAPE \ to ecape [...] delimiter
413 // l REG_LEFT implicit ^...
414 // m REG_MINIMAL minimal match
415 // n REG_NEWLINE explicit \n match
416 // o REG_ENCLOSED (|&) magic inside [@|&](...)
417 // p REG_SHELL_PATH explicit / match
418 // q REG_DELIMITED delimited pattern
419 // r REG_RIGHT implicit ...$
420 // s REG_SHELL_ESCAPED \ not special
421 // t REG_MUSTDELIM all delimiters must be specified
422 // u standard unspecified behavior -- errors not counted
423 // v REG_CLASS_ESCAPE \ special inside [...]
424 // w REG_NOSUB no subexpression match array
425 // x REG_LENIENT let some errors slide
426 // y REG_LEFT regexec() implicit ^...
427 // z REG_NULL NULL subexpressions ok
428 // $ expand C \c escapes in fields 2 and 3
429 // / field 2 is a regsubcomp() expression
430 // = field 3 is a regdecomp() expression
Robert Griesemer465b9c32012-10-30 13:38:01 -0700431 //
Russ Cox21e671d2011-09-08 14:49:51 -0400432 // Field 1 control lines:
Robert Griesemer465b9c32012-10-30 13:38:01 -0700433 //
Russ Cox21e671d2011-09-08 14:49:51 -0400434 // C set LC_COLLATE and LC_CTYPE to locale in field 2
Robert Griesemer465b9c32012-10-30 13:38:01 -0700435 //
Russ Cox21e671d2011-09-08 14:49:51 -0400436 // ?test ... output field 5 if passed and != EXPECTED, silent otherwise
437 // &test ... output field 5 if current and previous passed
438 // |test ... output field 5 if current passed and previous failed
439 // ; ... output field 2 if previous failed
440 // {test ... skip if failed until }
441 // } end of skip
Robert Griesemer465b9c32012-10-30 13:38:01 -0700442 //
Russ Cox21e671d2011-09-08 14:49:51 -0400443 // : comment comment copied as output NOTE
444 // :comment:test :comment: ignored
445 // N[OTE] comment comment copied as output NOTE
446 // T[EST] comment comment
Robert Griesemer465b9c32012-10-30 13:38:01 -0700447 //
Russ Cox21e671d2011-09-08 14:49:51 -0400448 // number use number for nmatch (20 by default)
449 flag := field[0]
450 switch flag[0] {
451 case '?', '&', '|', ';', '{', '}':
452 // Ignore all the control operators.
453 // Just run everything.
454 flag = flag[1:]
455 if flag == "" {
456 continue Reading
457 }
458 case ':':
Brad Fitzpatrickd8e27db2013-08-05 16:27:24 -0700459 i := strings.Index(flag[1:], ":")
Russ Cox21e671d2011-09-08 14:49:51 -0400460 if i < 0 {
461 t.Logf("skip: %s", line)
462 continue Reading
463 }
464 flag = flag[1+i+1:]
465 case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
466 t.Logf("skip: %s", line)
467 continue Reading
468 }
469
470 // Can check field count now that we've handled the myriad comment formats.
471 if len(field) < 4 {
472 t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
473 continue Reading
474 }
475
476 // Expand C escapes (a.k.a. Go escapes).
477 if strings.Contains(flag, "$") {
478 f := `"` + field[1] + `"`
479 if field[1], err = strconv.Unquote(f); err != nil {
480 t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
481 }
482 f = `"` + field[2] + `"`
483 if field[2], err = strconv.Unquote(f); err != nil {
484 t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
485 }
486 }
487
488 // Field 2: the regular expression pattern; SAME uses the pattern from
489 // the previous specification.
Robert Griesemer465b9c32012-10-30 13:38:01 -0700490 //
Russ Cox21e671d2011-09-08 14:49:51 -0400491 if field[1] == "SAME" {
492 field[1] = lastRegexp
493 }
494 lastRegexp = field[1]
495
496 // Field 3: the string to match.
497 text := field[2]
498
499 // Field 4: the test outcome...
500 ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
501 if !ok {
502 t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
503 continue Reading
504 }
505
506 // Field 5: optional comment appended to the report.
507
508 Testing:
509 // Run test once for each specified capital letter mode that we support.
510 for _, c := range flag {
511 pattern := field[1]
512 syn := syntax.POSIX | syntax.ClassNL
513 switch c {
514 default:
515 continue Testing
516 case 'E':
517 // extended regexp (what we support)
518 case 'L':
519 // literal
520 pattern = QuoteMeta(pattern)
521 }
522
523 for _, c := range flag {
524 switch c {
525 case 'i':
526 syn |= syntax.FoldCase
527 }
528 }
529
530 re, err := compile(pattern, syn, true)
531 if err != nil {
532 if shouldCompile {
533 t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
534 }
535 continue Testing
536 }
537 if !shouldCompile {
538 t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
539 continue Testing
540 }
541 match := re.MatchString(text)
542 if match != shouldMatch {
543 t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
544 continue Testing
545 }
546 have := re.FindStringSubmatchIndex(text)
547 if (len(have) > 0) != match {
Rob Pikeb47bbec2011-09-14 13:29:31 -0700548 t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
Russ Cox21e671d2011-09-08 14:49:51 -0400549 continue Testing
550 }
551 if len(have) > len(pos) {
552 have = have[:len(pos)]
553 }
554 if !same(have, pos) {
555 t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
556 }
557 }
558 }
559}
560
561func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
562 // Field 4: the test outcome. This is either one of the posix error
563 // codes (with REG_ omitted) or the match array, a list of (m,n)
564 // entries with m and n being first and last+1 positions in the
565 // field 3 string, or NULL if REG_NOSUB is in effect and success
566 // is expected. BADPAT is acceptable in place of any regcomp(3)
567 // error code. The match[] array is initialized to (-2,-2) before
568 // each test. All array elements from 0 to nmatch-1 must be specified
569 // in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
570 // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
571 // matched (?{...}) expression, where x is the text enclosed by {...},
572 // o is the expression ordinal counting from 1, and n is the length of
573 // the unmatched portion of the subject string. If x starts with a
574 // number then that is the return value of re_execf(), otherwise 0 is
575 // returned.
576 switch {
577 case s == "":
578 // Match with no position information.
579 ok = true
580 compiled = true
581 matched = true
582 return
583 case s == "NOMATCH":
584 // Match failure.
585 ok = true
586 compiled = true
587 matched = false
588 return
589 case 'A' <= s[0] && s[0] <= 'Z':
590 // All the other error codes are compile errors.
591 ok = true
592 compiled = false
593 return
594 }
595 compiled = true
596
597 var x []int
598 for s != "" {
599 var end byte = ')'
600 if len(x)%2 == 0 {
601 if s[0] != '(' {
602 ok = false
603 return
604 }
605 s = s[1:]
606 end = ','
607 }
608 i := 0
609 for i < len(s) && s[i] != end {
610 i++
611 }
612 if i == 0 || i == len(s) {
613 ok = false
614 return
615 }
616 var v = -1
Russ Coxeb692922011-11-01 22:05:34 -0400617 var err error
Russ Cox21e671d2011-09-08 14:49:51 -0400618 if s[:i] != "?" {
619 v, err = strconv.Atoi(s[:i])
620 if err != nil {
621 ok = false
622 return
623 }
624 }
625 x = append(x, v)
626 s = s[i+1:]
627 }
628 if len(x)%2 != 0 {
629 ok = false
630 return
631 }
632 ok = true
633 matched = true
634 pos = x
635 return
636}
Russ Cox8f699a32011-09-28 12:00:31 -0400637
638var text []byte
639
640func makeText(n int) []byte {
641 if len(text) >= n {
642 return text[:n]
643 }
644 text = make([]byte, n)
Rémy Oudompheng21b9d142013-07-20 23:31:51 +0200645 x := ^uint32(0)
Russ Cox8f699a32011-09-28 12:00:31 -0400646 for i := range text {
Rémy Oudompheng21b9d142013-07-20 23:31:51 +0200647 x += x
648 x ^= 1
649 if int32(x) < 0 {
650 x ^= 0x88888eef
651 }
652 if x%31 == 0 {
Russ Cox8f699a32011-09-28 12:00:31 -0400653 text[i] = '\n'
654 } else {
Rémy Oudompheng21b9d142013-07-20 23:31:51 +0200655 text[i] = byte(x%(0x7E+1-0x20) + 0x20)
Russ Cox8f699a32011-09-28 12:00:31 -0400656 }
657 }
658 return text
659}
660
661func benchmark(b *testing.B, re string, n int) {
662 r := MustCompile(re)
663 t := makeText(n)
664 b.ResetTimer()
665 b.SetBytes(int64(n))
666 for i := 0; i < b.N; i++ {
667 if r.Match(t) {
Rob Pike6b772462011-12-20 10:36:25 -0800668 b.Fatal("match!")
Russ Cox8f699a32011-09-28 12:00:31 -0400669 }
670 }
671}
672
Russ Cox8f699a32011-09-28 12:00:31 -0400673const (
674 easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
675 easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
676 medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
677 hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
678 parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
679 "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
680)
681
Russ Cox2f2cc242011-12-07 15:03:05 -0500682func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) }
683func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) }
684func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
685func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
686func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
687func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
688func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
689func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
690func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) }
691func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) }
Brad Fitzpatrickc4aa9c52013-08-29 13:55:30 -0700692func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 32<<0) }
Russ Cox2f2cc242011-12-07 15:03:05 -0500693func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) }
694func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) }
695func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) }
696func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) }
697func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) }
698func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) }
699func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) }
700func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) }
701func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) }
Andrew Gerrandf41ffc22013-02-04 15:28:55 +1100702
703func TestLongest(t *testing.T) {
704 re, err := Compile(`a(|b)`)
705 if err != nil {
706 t.Fatal(err)
707 }
708 if g, w := re.FindString("ab"), "a"; g != w {
709 t.Errorf("first match was %q, want %q", g, w)
710 }
711 re.Longest()
712 if g, w := re.FindString("ab"), "ab"; g != w {
713 t.Errorf("longest match was %q, want %q", g, w)
714 }
715}
Matthew Brennana5130882015-04-03 20:09:53 -0400716
Michael Matloob71e83b82015-06-14 09:57:46 -0700717// TestProgramTooLongForBacktrack tests that a regex which is too long
Matthew Brennana5130882015-04-03 20:09:53 -0400718// for the backtracker still executes properly.
719func TestProgramTooLongForBacktrack(t *testing.T) {
720 longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`)
721 if !longRegex.MatchString("two") {
722 t.Errorf("longRegex.MatchString(\"two\") was false, want true")
723 }
724 if longRegex.MatchString("xxx") {
725 t.Errorf("longRegex.MatchString(\"xxx\") was true, want false")
726 }
727}