exp/regexp: add CompilePOSIX, more tests R=r CC=golang-dev https://golang.org/cl/4967060

commit: 21e671dee6a56622649edfd33377505e9669ea8f [log] [tgz]
author: Russ Cox <rsc@golang.org> Thu Sep 08 14:49:51 2011 -0400
committer: Russ Cox <rsc@golang.org> Thu Sep 08 14:49:51 2011 -0400
tree: 7006b4f2ff0aa0ee3f917d2ded35c613849a62ba
parent: 177dca77e12e0c4add014801dd8e993ef56a2c66 [diff] [blame]
diff --git a/src/pkg/exp/regexp/exec_test.go b/src/pkg/exp/regexp/exec_test.go
index 69f673a..b6d9ece 100644
--- a/src/pkg/exp/regexp/exec_test.go
+++ b/src/pkg/exp/regexp/exec_test.go

@@ -6,9 +6,12 @@
 
 import (
 	"bufio"
-	"compress/gzip"
+	"compress/bzip2"
+	"exp/regexp/syntax"
 	"fmt"
+	"io"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
@@ -59,24 +62,34 @@
 // At time of writing, re2.txt is 32 MB but compresses to 760 kB,
 // so we store re2.txt.gz in the repository and decompress it on the fly.
 //
-func TestRE2(t *testing.T) {
+func TestRE2Search(t *testing.T) {
+	testRE2(t, "testdata/re2-search.txt")
+}
+
+func TestRE2Exhaustive(t *testing.T) {
 	if testing.Short() {
-		t.Log("skipping TestRE2 during short test")
+		t.Log("skipping TestRE2Exhaustive during short test")
 		return
 	}
+	testRE2(t, "testdata/re2-exhaustive.txt.bz2")
+}
 
-	f, err := os.Open("re2.txt.gz")
+func testRE2(t *testing.T, file string) {
+	f, err := os.Open(file)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
-	gz, err := gzip.NewReader(f)
-	if err != nil {
-		t.Fatalf("decompress re2.txt.gz: %v", err)
+	var txt io.Reader
+	if strings.HasSuffix(file, ".bz2") {
+		z := bzip2.NewReader(f)
+		txt = z
+		file = file[:len(file)-len(".bz2")] // for error messages
+	} else {
+		txt = f
 	}
-	defer gz.Close()
 	lineno := 0
-	r := bufio.NewReader(gz)
+	r := bufio.NewReader(txt)
 	var (
 		str       []string
 		input     []string
@@ -92,13 +105,13 @@
 			if err == os.EOF {
 				break
 			}
-			t.Fatalf("re2.txt:%d: %v", lineno, err)
+			t.Fatalf("%s:%d: %v", file, lineno, err)
 		}
 		line = line[:len(line)-1] // chop \n
 		lineno++
 		switch {
 		case line == "":
-			t.Fatalf("re2.txt:%d: unexpected blank line", lineno)
+			t.Fatalf("%s:%d: unexpected blank line", file, lineno)
 		case line[0] == '#':
 			continue
 		case 'A' <= line[0] && line[0] <= 'Z':
@@ -114,7 +127,7 @@
 			q, err := strconv.Unquote(line)
 			if err != nil {
 				// Fatal because we'll get out of sync.
-				t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err)
+				t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
 			}
 			if inStrings {
 				str = append(str, q)
@@ -122,7 +135,7 @@
 			}
 			// Is a regexp.
 			if len(input) != 0 {
-				t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q)
+				t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
 			}
 			re, err = tryCompile(q)
 			if err != nil {
@@ -130,7 +143,7 @@
 					// We don't and likely never will support \C; keep going.
 					continue
 				}
-				t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err)
+				t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
 				if nfail++; nfail >= 100 {
 					t.Fatalf("stopping after %d errors", nfail)
 				}
@@ -140,7 +153,7 @@
 			refull, err = tryCompile(full)
 			if err != nil {
 				// Fatal because q worked, so this should always work.
-				t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err)
+				t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
 			}
 			input = str
 		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
@@ -151,7 +164,7 @@
 				continue
 			}
 			if len(input) == 0 {
-				t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno)
+				t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
 			}
 			var text string
 			text, input = input[0], input[1:]
@@ -165,13 +178,13 @@
 			}
 			res := strings.Split(line, ";")
 			if len(res) != len(run) {
-				t.Fatalf("re2.txt:%d: have %d test results, want %d", lineno, len(res), len(run))
+				t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
 			}
 			for i := range res {
 				have, suffix := run[i](re, refull, text)
-				want := parseResult(t, lineno, res[i])
+				want := parseResult(t, file, lineno, res[i])
 				if !same(have, want) {
-					t.Errorf("re2.txt:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, suffix, text, have, want)
+					t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
 					if nfail++; nfail >= 100 {
 						t.Fatalf("stopping after %d errors", nfail)
 					}
@@ -179,7 +192,7 @@
 				}
 				b, suffix := match[i](re, refull, text)
 				if b != (want != nil) {
-					t.Errorf("re2.txt:%d: %#q%s.MatchString(%#q) = %v, want %v", lineno, re, suffix, text, b, !b)
+					t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
 					if nfail++; nfail >= 100 {
 						t.Fatalf("stopping after %d errors", nfail)
 					}
@@ -188,11 +201,11 @@
 			}
 
 		default:
-			t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line)
+			t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
 		}
 	}
 	if len(input) != 0 {
-		t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input))
+		t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
 	}
 	t.Logf("%d cases tested", ncase)
 }
@@ -270,7 +283,7 @@
 	return Compile(s)
 }
 
-func parseResult(t *testing.T, lineno int, res string) []int {
+func parseResult(t *testing.T, file string, lineno int, res string) []int {
 	// A single - indicates no match.
 	if res == "-" {
 		return nil
@@ -295,12 +308,12 @@
 			} else {
 				k := strings.Index(pair, "-")
 				if k < 0 {
-					t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
+					t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
 				}
 				lo, err1 := strconv.Atoi(pair[:k])
 				hi, err2 := strconv.Atoi(pair[k+1:])
 				if err1 != nil || err2 != nil || lo > hi {
-					t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
+					t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
 				}
 				out[n] = lo
 				out[n+1] = hi
@@ -323,3 +336,314 @@
 	}
 	return true
 }
+
+// TestFowler runs this package's regexp API against the
+// POSIX regular expression tests collected by Glenn Fowler
+// at http://www2.research.att.com/~gsf/testregex/.
+func TestFowler(t *testing.T) {
+	files, err := filepath.Glob("testdata/*.dat")
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, file := range files {
+		t.Log(file)
+		testFowler(t, file)
+	}
+}
+
+var notab = MustCompile(`[^\t]+`)
+
+func testFowler(t *testing.T, file string) {
+	f, err := os.Open(file)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	defer f.Close()
+	b := bufio.NewReader(f)
+	lineno := 0
+	lastRegexp := ""
+Reading:
+	for {
+		lineno++
+		line, err := b.ReadString('\n')
+		if err != nil {
+			if err != os.EOF {
+				t.Errorf("%s:%d: %v", file, lineno, err)
+			}
+			break Reading
+		}
+
+		// http://www2.research.att.com/~gsf/man/man1/testregex.html
+		//
+		// INPUT FORMAT
+		//   Input lines may be blank, a comment beginning with #, or a test
+		//   specification. A specification is five fields separated by one
+		//   or more tabs. NULL denotes the empty string and NIL denotes the
+		//   0 pointer.
+		if line[0] == '#' || line[0] == '\n' {
+			continue Reading
+		}
+		line = line[:len(line)-1]
+		field := notab.FindAllString(line, -1)
+		for i, f := range field {
+			if f == "NULL" {
+				field[i] = ""
+			}
+			if f == "NIL" {
+				t.Logf("%s:%d: skip: %s", file, lineno, line)
+				continue Reading
+			}
+		}
+		if len(field) == 0 {
+			continue Reading
+		}
+
+		//   Field 1: the regex(3) flags to apply, one character per REG_feature
+		//   flag. The test is skipped if REG_feature is not supported by the
+		//   implementation. If the first character is not [BEASKLP] then the
+		//   specification is a global control line. One or more of [BEASKLP] may be
+		//   specified; the test will be repeated for each mode.
+		// 
+		//     B 	basic			BRE	(grep, ed, sed)
+		//     E 	REG_EXTENDED		ERE	(egrep)
+		//     A	REG_AUGMENTED		ARE	(egrep with negation)
+		//     S	REG_SHELL		SRE	(sh glob)
+		//     K	REG_SHELL|REG_AUGMENTED	KRE	(ksh glob)
+		//     L	REG_LITERAL		LRE	(fgrep)
+		// 
+		//     a	REG_LEFT|REG_RIGHT	implicit ^...$
+		//     b	REG_NOTBOL		lhs does not match ^
+		//     c	REG_COMMENT		ignore space and #...\n
+		//     d	REG_SHELL_DOT		explicit leading . match
+		//     e	REG_NOTEOL		rhs does not match $
+		//     f	REG_MULTIPLE		multiple \n separated patterns
+		//     g	FNM_LEADING_DIR		testfnmatch only -- match until /
+		//     h	REG_MULTIREF		multiple digit backref
+		//     i	REG_ICASE		ignore case
+		//     j	REG_SPAN		. matches \n
+		//     k	REG_ESCAPE		\ to ecape [...] delimiter
+		//     l	REG_LEFT		implicit ^...
+		//     m	REG_MINIMAL		minimal match
+		//     n	REG_NEWLINE		explicit \n match
+		//     o	REG_ENCLOSED		(|&) magic inside [@|&](...)
+		//     p	REG_SHELL_PATH		explicit / match
+		//     q	REG_DELIMITED		delimited pattern
+		//     r	REG_RIGHT		implicit ...$
+		//     s	REG_SHELL_ESCAPED	\ not special
+		//     t	REG_MUSTDELIM		all delimiters must be specified
+		//     u	standard unspecified behavior -- errors not counted
+		//     v	REG_CLASS_ESCAPE	\ special inside [...]
+		//     w	REG_NOSUB		no subexpression match array
+		//     x	REG_LENIENT		let some errors slide
+		//     y	REG_LEFT		regexec() implicit ^...
+		//     z	REG_NULL		NULL subexpressions ok
+		//     $	                        expand C \c escapes in fields 2 and 3
+		//     /	                        field 2 is a regsubcomp() expression
+		//     =	                        field 3 is a regdecomp() expression
+		// 
+		//   Field 1 control lines:
+		// 
+		//     C		set LC_COLLATE and LC_CTYPE to locale in field 2
+		// 
+		//     ?test ...	output field 5 if passed and != EXPECTED, silent otherwise
+		//     &test ...	output field 5 if current and previous passed
+		//     |test ...	output field 5 if current passed and previous failed
+		//     ; ...	output field 2 if previous failed
+		//     {test ...	skip if failed until }
+		//     }		end of skip
+		// 
+		//     : comment		comment copied as output NOTE
+		//     :comment:test	:comment: ignored
+		//     N[OTE] comment	comment copied as output NOTE
+		//     T[EST] comment	comment
+		// 
+		//     number		use number for nmatch (20 by default)
+		flag := field[0]
+		switch flag[0] {
+		case '?', '&', '|', ';', '{', '}':
+			// Ignore all the control operators.
+			// Just run everything.
+			flag = flag[1:]
+			if flag == "" {
+				continue Reading
+			}
+		case ':':
+			i := strings.Index(flag[1:], ":")
+			if i < 0 {
+				t.Logf("skip: %s", line)
+				continue Reading
+			}
+			flag = flag[1+i+1:]
+		case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+			t.Logf("skip: %s", line)
+			continue Reading
+		}
+
+		// Can check field count now that we've handled the myriad comment formats.
+		if len(field) < 4 {
+			t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
+			continue Reading
+		}
+
+		// Expand C escapes (a.k.a. Go escapes).
+		if strings.Contains(flag, "$") {
+			f := `"` + field[1] + `"`
+			if field[1], err = strconv.Unquote(f); err != nil {
+				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
+			}
+			f = `"` + field[2] + `"`
+			if field[2], err = strconv.Unquote(f); err != nil {
+				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
+			}
+		}
+
+		//   Field 2: the regular expression pattern; SAME uses the pattern from
+		//     the previous specification.
+		// 
+		if field[1] == "SAME" {
+			field[1] = lastRegexp
+		}
+		lastRegexp = field[1]
+
+		//   Field 3: the string to match.
+		text := field[2]
+
+		//   Field 4: the test outcome...
+		ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
+		if !ok {
+			t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
+			continue Reading
+		}
+
+		//   Field 5: optional comment appended to the report.
+
+	Testing:
+		// Run test once for each specified capital letter mode that we support.
+		for _, c := range flag {
+			pattern := field[1]
+			syn := syntax.POSIX | syntax.ClassNL
+			switch c {
+			default:
+				continue Testing
+			case 'E':
+				// extended regexp (what we support)
+			case 'L':
+				// literal
+				pattern = QuoteMeta(pattern)
+			}
+
+			for _, c := range flag {
+				switch c {
+				case 'i':
+					syn |= syntax.FoldCase
+				}
+			}
+
+			re, err := compile(pattern, syn, true)
+			if err != nil {
+				if shouldCompile {
+					t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
+				}
+				continue Testing
+			}
+			if !shouldCompile {
+				t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
+				continue Testing
+			}
+			match := re.MatchString(text)
+			if match != shouldMatch {
+				t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
+				continue Testing
+			}
+			have := re.FindStringSubmatchIndex(text)
+			if (len(have) > 0) != match {
+				t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, text, have)
+				continue Testing
+			}
+			if len(have) > len(pos) {
+				have = have[:len(pos)]
+			}
+			if !same(have, pos) {
+				t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
+			}
+		}
+	}
+}
+
+func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
+	//   Field 4: the test outcome. This is either one of the posix error
+	//     codes (with REG_ omitted) or the match array, a list of (m,n)
+	//     entries with m and n being first and last+1 positions in the
+	//     field 3 string, or NULL if REG_NOSUB is in effect and success
+	//     is expected. BADPAT is acceptable in place of any regcomp(3)
+	//     error code. The match[] array is initialized to (-2,-2) before
+	//     each test. All array elements from 0 to nmatch-1 must be specified
+	//     in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
+	//     Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
+	//     matched (?{...}) expression, where x is the text enclosed by {...},
+	//     o is the expression ordinal counting from 1, and n is the length of
+	//     the unmatched portion of the subject string. If x starts with a
+	//     number then that is the return value of re_execf(), otherwise 0 is
+	//     returned.
+	switch {
+	case s == "":
+		// Match with no position information.
+		ok = true
+		compiled = true
+		matched = true
+		return
+	case s == "NOMATCH":
+		// Match failure.
+		ok = true
+		compiled = true
+		matched = false
+		return
+	case 'A' <= s[0] && s[0] <= 'Z':
+		// All the other error codes are compile errors.
+		ok = true
+		compiled = false
+		return
+	}
+	compiled = true
+
+	var x []int
+	for s != "" {
+		var end byte = ')'
+		if len(x)%2 == 0 {
+			if s[0] != '(' {
+				ok = false
+				return
+			}
+			s = s[1:]
+			end = ','
+		}
+		i := 0
+		for i < len(s) && s[i] != end {
+			i++
+		}
+		if i == 0 || i == len(s) {
+			ok = false
+			return
+		}
+		var v = -1
+		var err os.Error
+		if s[:i] != "?" {
+			v, err = strconv.Atoi(s[:i])
+			if err != nil {
+				ok = false
+				return
+			}
+		}
+		x = append(x, v)
+		s = s[i+1:]
+	}
+	if len(x)%2 != 0 {
+		ok = false
+		return
+	}
+	ok = true
+	matched = true
+	pos = x
+	return
+}
commit	21e671dee6a56622649edfd33377505e9669ea8f	[log] [tgz]
author	Russ Cox <rsc@golang.org>	Thu Sep 08 14:49:51 2011 -0400
committer	Russ Cox <rsc@golang.org>	Thu Sep 08 14:49:51 2011 -0400
tree	7006b4f2ff0aa0ee3f917d2ded35c613849a62ba
parent	177dca77e12e0c4add014801dd8e993ef56a2c66 [diff] [blame]