| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package regexp |
| |
| import ( |
| "bufio" |
| "compress/gzip" |
| "fmt" |
| "os" |
| "strconv" |
| "strings" |
| "testing" |
| "utf8" |
| ) |
| |
| // TestRE2 tests this package's regexp API against test cases |
| // considered during RE2's exhaustive tests, which run all possible |
| // regexps over a given set of atoms and operators, up to a given |
| // complexity, over all possible strings over a given alphabet, |
| // up to a given size. Rather than try to link with RE2, we read a |
| // log file containing the test cases and the expected matches. |
| // The log file, re2.txt, is generated by running 'make exhaustive-log' |
| // in the open source RE2 distribution. http://code.google.com/p/re2/ |
| // |
| // The test file format is a sequence of stanzas like: |
| // |
| // strings |
| // "abc" |
| // "123x" |
| // regexps |
| // "[a-z]+" |
| // 0-3;0-3 |
| // -;- |
| // "([0-9])([0-9])([0-9])" |
| // -;- |
| // -;0-3 0-1 1-2 2-3 |
| // |
| // The stanza begins by defining a set of strings, quoted |
| // using Go double-quote syntax, one per line. Then the |
| // regexps section gives a sequence of regexps to run on |
| // the strings. In the block that follows a regexp, each line |
| // gives the semicolon-separated match results of running |
| // the regexp on the corresponding string. |
| // Each match result is either a single -, meaning no match, or a |
| // space-separated sequence of pairs giving the match and |
| // submatch indices. An unmatched subexpression formats |
| // its pair as a single - (not illustrated above). For now |
| // each regexp run produces two match results, one for a |
| // ``full match'' that restricts the regexp to matching the entire |
| // string or nothing, and one for a ``partial match'' that gives |
| // the leftmost first match found in the string. |
| // |
| // Lines beginning with # are comments. Lines beginning with |
| // a capital letter are test names printed during RE2's test suite |
| // and are echoed into t but otherwise ignored. |
| // |
| // At time of writing, re2.txt is 32 MB but compresses to 760 kB, |
| // so we store re2.txt.gz in the repository and decompress it on the fly. |
| // |
| func TestRE2(t *testing.T) { |
| if testing.Short() { |
| t.Log("skipping TestRE2 during short test") |
| return |
| } |
| |
| f, err := os.Open("re2.txt.gz") |
| if err != nil { |
| t.Fatal(err) |
| } |
| defer f.Close() |
| gz, err := gzip.NewReader(f) |
| if err != nil { |
| t.Fatalf("decompress re2.txt.gz: %v", err) |
| } |
| defer gz.Close() |
| lineno := 0 |
| r := bufio.NewReader(gz) |
| var ( |
| str []string |
| input []string |
| inStrings bool |
| re *Regexp |
| refull *Regexp |
| nfail int |
| ncase int |
| ) |
| for { |
| line, err := r.ReadString('\n') |
| if err != nil { |
| if err == os.EOF { |
| break |
| } |
| t.Fatalf("re2.txt:%d: %v", lineno, err) |
| } |
| line = line[:len(line)-1] // chop \n |
| lineno++ |
| switch { |
| case line == "": |
| t.Fatalf("re2.txt:%d: unexpected blank line", lineno) |
| case line[0] == '#': |
| continue |
| case 'A' <= line[0] && line[0] <= 'Z': |
| // Test name. |
| t.Logf("%s\n", line) |
| continue |
| case line == "strings": |
| str = str[:0] |
| inStrings = true |
| case line == "regexps": |
| inStrings = false |
| case line[0] == '"': |
| q, err := strconv.Unquote(line) |
| if err != nil { |
| // Fatal because we'll get out of sync. |
| t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err) |
| } |
| if inStrings { |
| str = append(str, q) |
| continue |
| } |
| // Is a regexp. |
| if len(input) != 0 { |
| t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q) |
| } |
| re, err = tryCompile(q) |
| if err != nil { |
| if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" { |
| // We don't and likely never will support \C; keep going. |
| continue |
| } |
| t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err) |
| if nfail++; nfail >= 100 { |
| t.Fatalf("stopping after %d errors", nfail) |
| } |
| continue |
| } |
| full := `\A(?:` + q + `)\z` |
| refull, err = tryCompile(full) |
| if err != nil { |
| // Fatal because q worked, so this should always work. |
| t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err) |
| } |
| input = str |
| case line[0] == '-' || '0' <= line[0] && line[0] <= '9': |
| // A sequence of match results. |
| ncase++ |
| if re == nil { |
| // Failed to compile: skip results. |
| continue |
| } |
| if len(input) == 0 { |
| t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno) |
| } |
| var text string |
| text, input = input[0], input[1:] |
| if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { |
| // RE2's \B considers every byte position, |
| // so it sees 'not word boundary' in the |
| // middle of UTF-8 sequences. This package |
| // only considers the positions between runes, |
| // so it disagrees. Skip those cases. |
| continue |
| } |
| res := strings.Split(line, ";") |
| if len(res) != 2 { |
| t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res)) |
| } |
| // res[0] is full match |
| // res[1] is partial match |
| // Run partial match first; don't bother with full if partial fails. |
| have := re.FindStringSubmatchIndex(text) |
| want := parseResult(t, lineno, res[1]) |
| if !same(have, want) { |
| t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want) |
| if nfail++; nfail >= 100 { |
| t.Fatalf("stopping after %d errors", nfail) |
| } |
| continue |
| } |
| have = refull.FindStringSubmatchIndex(text) |
| want = parseResult(t, lineno, res[0]) |
| if !same(have, want) { |
| t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want) |
| if nfail++; nfail >= 100 { |
| t.Fatalf("stopping after %d errors", nfail) |
| } |
| } |
| default: |
| t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line) |
| } |
| } |
| if len(input) != 0 { |
| t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input)) |
| } |
| t.Logf("%d cases tested", ncase) |
| } |
| |
| func isSingleBytes(s string) bool { |
| for _, c := range s { |
| if c >= utf8.RuneSelf { |
| return false |
| } |
| } |
| return true |
| } |
| |
| func tryCompile(s string) (re *Regexp, err os.Error) { |
| // Protect against panic during Compile. |
| defer func() { |
| if r := recover(); r != nil { |
| err = fmt.Errorf("panic: %v", r) |
| } |
| }() |
| return Compile(s) |
| } |
| |
| func parseResult(t *testing.T, lineno int, res string) []int { |
| // A single - indicates no match. |
| if res == "-" { |
| return nil |
| } |
| // Otherwise, a space-separated list of pairs. |
| n := 1 |
| for j := 0; j < len(res); j++ { |
| if res[j] == ' ' { |
| n++ |
| } |
| } |
| out := make([]int, 2*n) |
| i := 0 |
| n = 0 |
| for j := 0; j <= len(res); j++ { |
| if j == len(res) || res[j] == ' ' { |
| // Process a single pair. - means no submatch. |
| pair := res[i:j] |
| if pair == "-" { |
| out[n] = -1 |
| out[n+1] = -1 |
| } else { |
| k := strings.Index(pair, "-") |
| if k < 0 { |
| t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) |
| } |
| lo, err1 := strconv.Atoi(pair[:k]) |
| hi, err2 := strconv.Atoi(pair[k+1:]) |
| if err1 != nil || err2 != nil || lo > hi { |
| t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) |
| } |
| out[n] = lo |
| out[n+1] = hi |
| } |
| n += 2 |
| i = j + 1 |
| } |
| } |
| return out |
| } |
| |
| func same(x, y []int) bool { |
| if len(x) != len(y) { |
| return false |
| } |
| for i, xi := range x { |
| if xi != y[i] { |
| return false |
| } |
| } |
| return true |
| } |