blob: 669b80e0bc2757d7503a3394b9dcc12722d36143 [file] [log] [blame]
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"bufio"
"compress/bzip2"
"fmt"
"io"
"os"
"path/filepath"
"regexp/syntax"
"strconv"
"strings"
"testing"
"unicode/utf8"
)
// TestRE2 tests this package's regexp API against test cases
// considered during RE2's exhaustive tests, which run all possible
// regexps over a given set of atoms and operators, up to a given
// complexity, over all possible strings over a given alphabet,
// up to a given size. Rather than try to link with RE2, we read a
// log file containing the test cases and the expected matches.
// The log file, re2-exhaustive.txt, is generated by running 'make log'
// in the open source RE2 distribution https://github.com/google/re2/.
//
// The test file format is a sequence of stanzas like:
//
// strings
// "abc"
// "123x"
// regexps
// "[a-z]+"
// 0-3;0-3
// -;-
// "([0-9])([0-9])([0-9])"
// -;-
// -;0-3 0-1 1-2 2-3
//
// The stanza begins by defining a set of strings, quoted
// using Go double-quote syntax, one per line. Then the
// regexps section gives a sequence of regexps to run on
// the strings. In the block that follows a regexp, each line
// gives the semicolon-separated match results of running
// the regexp on the corresponding string.
// Each match result is either a single -, meaning no match, or a
// space-separated sequence of pairs giving the match and
// submatch indices. An unmatched subexpression formats
// its pair as a single - (not illustrated above). For now
// each regexp run produces two match results, one for a
// ``full match'' that restricts the regexp to matching the entire
// string or nothing, and one for a ``partial match'' that gives
// the leftmost first match found in the string.
//
// Lines beginning with # are comments. Lines beginning with
// a capital letter are test names printed during RE2's test suite
// and are echoed into t but otherwise ignored.
//
// At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB,
// so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
//
func TestRE2Search(t *testing.T) {
testRE2(t, "testdata/re2-search.txt")
}
func testRE2(t *testing.T, file string) {
f, err := os.Open(file)
if err != nil {
t.Fatal(err)
}
defer f.Close()
var txt io.Reader
if strings.HasSuffix(file, ".bz2") {
z := bzip2.NewReader(f)
txt = z
file = file[:len(file)-len(".bz2")] // for error messages
} else {
txt = f
}
lineno := 0
scanner := bufio.NewScanner(txt)
var (
str []string
input []string
inStrings bool
re *Regexp
refull *Regexp
nfail int
ncase int
)
for lineno := 1; scanner.Scan(); lineno++ {
line := scanner.Text()
switch {
case line == "":
t.Fatalf("%s:%d: unexpected blank line", file, lineno)
case line[0] == '#':
continue
case 'A' <= line[0] && line[0] <= 'Z':
// Test name.
t.Logf("%s\n", line)
continue
case line == "strings":
str = str[:0]
inStrings = true
case line == "regexps":
inStrings = false
case line[0] == '"':
q, err := strconv.Unquote(line)
if err != nil {
// Fatal because we'll get out of sync.
t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
}
if inStrings {
str = append(str, q)
continue
}
// Is a regexp.
if len(input) != 0 {
t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
}
re, err = tryCompile(q)
if err != nil {
if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
// We don't and likely never will support \C; keep going.
continue
}
t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
full := `\A(?:` + q + `)\z`
refull, err = tryCompile(full)
if err != nil {
// Fatal because q worked, so this should always work.
t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
}
input = str
case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
// A sequence of match results.
ncase++
if re == nil {
// Failed to compile: skip results.
continue
}
if len(input) == 0 {
t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
}
var text string
text, input = input[0], input[1:]
if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
// RE2's \B considers every byte position,
// so it sees 'not word boundary' in the
// middle of UTF-8 sequences. This package
// only considers the positions between runes,
// so it disagrees. Skip those cases.
continue
}
res := strings.Split(line, ";")
if len(res) != len(run) {
t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
}
for i := range res {
have, suffix := run[i](re, refull, text)
want := parseResult(t, file, lineno, res[i])
if !same(have, want) {
t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
b, suffix := match[i](re, refull, text)
if b != (want != nil) {
t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
}
default:
t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
}
}
if err := scanner.Err(); err != nil {
t.Fatalf("%s:%d: %v", file, lineno, err)
}
if len(input) != 0 {
t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
}
t.Logf("%d cases tested", ncase)
}
var run = []func(*Regexp, *Regexp, string) ([]int, string){
runFull,
runPartial,
runFullLongest,
runPartialLongest,
}
func runFull(re, refull *Regexp, text string) ([]int, string) {
refull.longest = false
return refull.FindStringSubmatchIndex(text), "[full]"
}
func runPartial(re, refull *Regexp, text string) ([]int, string) {
re.longest = false
return re.FindStringSubmatchIndex(text), ""
}
func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
refull.longest = true
return refull.FindStringSubmatchIndex(text), "[full,longest]"
}
func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
re.longest = true
return re.FindStringSubmatchIndex(text), "[longest]"
}
var match = []func(*Regexp, *Regexp, string) (bool, string){
matchFull,
matchPartial,
matchFullLongest,
matchPartialLongest,
}
func matchFull(re, refull *Regexp, text string) (bool, string) {
refull.longest = false
return refull.MatchString(text), "[full]"
}
func matchPartial(re, refull *Regexp, text string) (bool, string) {
re.longest = false
return re.MatchString(text), ""
}
func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
refull.longest = true
return refull.MatchString(text), "[full,longest]"
}
func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
re.longest = true
return re.MatchString(text), "[longest]"
}
func isSingleBytes(s string) bool {
for _, c := range s {
if c >= utf8.RuneSelf {
return false
}
}
return true
}
func tryCompile(s string) (re *Regexp, err error) {
// Protect against panic during Compile.
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic: %v", r)
}
}()
return Compile(s)
}
func parseResult(t *testing.T, file string, lineno int, res string) []int {
// A single - indicates no match.
if res == "-" {
return nil
}
// Otherwise, a space-separated list of pairs.
n := 1
for j := 0; j < len(res); j++ {
if res[j] == ' ' {
n++
}
}
out := make([]int, 2*n)
i := 0
n = 0
for j := 0; j <= len(res); j++ {
if j == len(res) || res[j] == ' ' {
// Process a single pair. - means no submatch.
pair := res[i:j]
if pair == "-" {
out[n] = -1
out[n+1] = -1
} else {
k := strings.Index(pair, "-")
if k < 0 {
t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
}
lo, err1 := strconv.Atoi(pair[:k])
hi, err2 := strconv.Atoi(pair[k+1:])
if err1 != nil || err2 != nil || lo > hi {
t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
}
out[n] = lo
out[n+1] = hi
}
n += 2
i = j + 1
}
}
return out
}
func same(x, y []int) bool {
if len(x) != len(y) {
return false
}
for i, xi := range x {
if xi != y[i] {
return false
}
}
return true
}
// TestFowler runs this package's regexp API against the
// POSIX regular expression tests collected by Glenn Fowler
// at http://www2.research.att.com/~astopen/testregex/testregex.html.
func TestFowler(t *testing.T) {
files, err := filepath.Glob("testdata/*.dat")
if err != nil {
t.Fatal(err)
}
for _, file := range files {
t.Log(file)
testFowler(t, file)
}
}
var notab = MustCompilePOSIX(`[^\t]+`)
func testFowler(t *testing.T, file string) {
f, err := os.Open(file)
if err != nil {
t.Error(err)
return
}
defer f.Close()
b := bufio.NewReader(f)
lineno := 0
lastRegexp := ""
Reading:
for {
lineno++
line, err := b.ReadString('\n')
if err != nil {
if err != io.EOF {
t.Errorf("%s:%d: %v", file, lineno, err)
}
break Reading
}
// http://www2.research.att.com/~astopen/man/man1/testregex.html
//
// INPUT FORMAT
// Input lines may be blank, a comment beginning with #, or a test
// specification. A specification is five fields separated by one
// or more tabs. NULL denotes the empty string and NIL denotes the
// 0 pointer.
if line[0] == '#' || line[0] == '\n' {
continue Reading
}
line = line[:len(line)-1]
field := notab.FindAllString(line, -1)
for i, f := range field {
if f == "NULL" {
field[i] = ""
}
if f == "NIL" {
t.Logf("%s:%d: skip: %s", file, lineno, line)
continue Reading
}
}
if len(field) == 0 {
continue Reading
}
// Field 1: the regex(3) flags to apply, one character per REG_feature
// flag. The test is skipped if REG_feature is not supported by the
// implementation. If the first character is not [BEASKLP] then the
// specification is a global control line. One or more of [BEASKLP] may be
// specified; the test will be repeated for each mode.
//
// B basic BRE (grep, ed, sed)
// E REG_EXTENDED ERE (egrep)
// A REG_AUGMENTED ARE (egrep with negation)
// S REG_SHELL SRE (sh glob)
// K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
// L REG_LITERAL LRE (fgrep)
//
// a REG_LEFT|REG_RIGHT implicit ^...$
// b REG_NOTBOL lhs does not match ^
// c REG_COMMENT ignore space and #...\n
// d REG_SHELL_DOT explicit leading . match
// e REG_NOTEOL rhs does not match $
// f REG_MULTIPLE multiple \n separated patterns
// g FNM_LEADING_DIR testfnmatch only -- match until /
// h REG_MULTIREF multiple digit backref
// i REG_ICASE ignore case
// j REG_SPAN . matches \n
// k REG_ESCAPE \ to ecape [...] delimiter
// l REG_LEFT implicit ^...
// m REG_MINIMAL minimal match
// n REG_NEWLINE explicit \n match
// o REG_ENCLOSED (|&) magic inside [@|&](...)
// p REG_SHELL_PATH explicit / match
// q REG_DELIMITED delimited pattern
// r REG_RIGHT implicit ...$
// s REG_SHELL_ESCAPED \ not special
// t REG_MUSTDELIM all delimiters must be specified
// u standard unspecified behavior -- errors not counted
// v REG_CLASS_ESCAPE \ special inside [...]
// w REG_NOSUB no subexpression match array
// x REG_LENIENT let some errors slide
// y REG_LEFT regexec() implicit ^...
// z REG_NULL NULL subexpressions ok
// $ expand C \c escapes in fields 2 and 3
// / field 2 is a regsubcomp() expression
// = field 3 is a regdecomp() expression
//
// Field 1 control lines:
//
// C set LC_COLLATE and LC_CTYPE to locale in field 2
//
// ?test ... output field 5 if passed and != EXPECTED, silent otherwise
// &test ... output field 5 if current and previous passed
// |test ... output field 5 if current passed and previous failed
// ; ... output field 2 if previous failed
// {test ... skip if failed until }
// } end of skip
//
// : comment comment copied as output NOTE
// :comment:test :comment: ignored
// N[OTE] comment comment copied as output NOTE
// T[EST] comment comment
//
// number use number for nmatch (20 by default)
flag := field[0]
switch flag[0] {
case '?', '&', '|', ';', '{', '}':
// Ignore all the control operators.
// Just run everything.
flag = flag[1:]
if flag == "" {
continue Reading
}
case ':':
i := strings.Index(flag[1:], ":")
if i < 0 {
t.Logf("skip: %s", line)
continue Reading
}
flag = flag[1+i+1:]
case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
t.Logf("skip: %s", line)
continue Reading
}
// Can check field count now that we've handled the myriad comment formats.
if len(field) < 4 {
t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
continue Reading
}
// Expand C escapes (a.k.a. Go escapes).
if strings.Contains(flag, "$") {
f := `"` + field[1] + `"`
if field[1], err = strconv.Unquote(f); err != nil {
t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
}
f = `"` + field[2] + `"`
if field[2], err = strconv.Unquote(f); err != nil {
t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
}
}
// Field 2: the regular expression pattern; SAME uses the pattern from
// the previous specification.
//
if field[1] == "SAME" {
field[1] = lastRegexp
}
lastRegexp = field[1]
// Field 3: the string to match.
text := field[2]
// Field 4: the test outcome...
ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
if !ok {
t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
continue Reading
}
// Field 5: optional comment appended to the report.
Testing:
// Run test once for each specified capital letter mode that we support.
for _, c := range flag {
pattern := field[1]
syn := syntax.POSIX | syntax.ClassNL
switch c {
default:
continue Testing
case 'E':
// extended regexp (what we support)
case 'L':
// literal
pattern = QuoteMeta(pattern)
}
for _, c := range flag {
switch c {
case 'i':
syn |= syntax.FoldCase
}
}
re, err := compile(pattern, syn, true)
if err != nil {
if shouldCompile {
t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
}
continue Testing
}
if !shouldCompile {
t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
continue Testing
}
match := re.MatchString(text)
if match != shouldMatch {
t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
continue Testing
}
have := re.FindStringSubmatchIndex(text)
if (len(have) > 0) != match {
t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
continue Testing
}
if len(have) > len(pos) {
have = have[:len(pos)]
}
if !same(have, pos) {
t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
}
}
}
}
func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
// Field 4: the test outcome. This is either one of the posix error
// codes (with REG_ omitted) or the match array, a list of (m,n)
// entries with m and n being first and last+1 positions in the
// field 3 string, or NULL if REG_NOSUB is in effect and success
// is expected. BADPAT is acceptable in place of any regcomp(3)
// error code. The match[] array is initialized to (-2,-2) before
// each test. All array elements from 0 to nmatch-1 must be specified
// in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
// Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
// matched (?{...}) expression, where x is the text enclosed by {...},
// o is the expression ordinal counting from 1, and n is the length of
// the unmatched portion of the subject string. If x starts with a
// number then that is the return value of re_execf(), otherwise 0 is
// returned.
switch {
case s == "":
// Match with no position information.
ok = true
compiled = true
matched = true
return
case s == "NOMATCH":
// Match failure.
ok = true
compiled = true
matched = false
return
case 'A' <= s[0] && s[0] <= 'Z':
// All the other error codes are compile errors.
ok = true
compiled = false
return
}
compiled = true
var x []int
for s != "" {
var end byte = ')'
if len(x)%2 == 0 {
if s[0] != '(' {
ok = false
return
}
s = s[1:]
end = ','
}
i := 0
for i < len(s) && s[i] != end {
i++
}
if i == 0 || i == len(s) {
ok = false
return
}
var v = -1
var err error
if s[:i] != "?" {
v, err = strconv.Atoi(s[:i])
if err != nil {
ok = false
return
}
}
x = append(x, v)
s = s[i+1:]
}
if len(x)%2 != 0 {
ok = false
return
}
ok = true
matched = true
pos = x
return
}
var text []byte
func makeText(n int) []byte {
if len(text) >= n {
return text[:n]
}
text = make([]byte, n)
x := ^uint32(0)
for i := range text {
x += x
x ^= 1
if int32(x) < 0 {
x ^= 0x88888eef
}
if x%31 == 0 {
text[i] = '\n'
} else {
text[i] = byte(x%(0x7E+1-0x20) + 0x20)
}
}
return text
}
func benchmark(b *testing.B, re string, n int) {
r := MustCompile(re)
t := makeText(n)
b.ResetTimer()
b.SetBytes(int64(n))
for i := 0; i < b.N; i++ {
if r.Match(t) {
b.Fatal("match!")
}
}
}
const (
easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
"(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
)
func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) }
func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) }
func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) }
func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) }
func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 32<<0) }
func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) }
func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) }
func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) }
func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) }
func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) }
func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) }
func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) }
func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) }
func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) }
func TestLongest(t *testing.T) {
re, err := Compile(`a(|b)`)
if err != nil {
t.Fatal(err)
}
if g, w := re.FindString("ab"), "a"; g != w {
t.Errorf("first match was %q, want %q", g, w)
}
re.Longest()
if g, w := re.FindString("ab"), "ab"; g != w {
t.Errorf("longest match was %q, want %q", g, w)
}
}
// TestProgramTooLongForBacktrace tests that a regex which is too long
// for the backtracker still executes properly.
func TestProgramTooLongForBacktrack(t *testing.T) {
longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`)
if !longRegex.MatchString("two") {
t.Errorf("longRegex.MatchString(\"two\") was false, want true")
}
if longRegex.MatchString("xxx") {
t.Errorf("longRegex.MatchString(\"xxx\") was true, want false")
}
}