exp/regexp: bug fixes and RE2 tests

Also add exp/regexp to build (forgot before).

At this point I am very confident in exp/regexp's
behavior.  It should be usable as a drop-in
replacement for regexp now.

Later CLs could introduce a CompilePOSIX
to get at traditional POSIX ``extended regular expressions''
as in egrep and also an re.MatchLongest method to
change the matching mode to leftmost longest
instead of leftmost first.  On the other hand, I expect
very few people to use either.

R=r, r, gustavo
CC=golang-dev
https://golang.org/cl/4990041
diff --git a/src/pkg/exp/regexp/exec_test.go b/src/pkg/exp/regexp/exec_test.go
new file mode 100644
index 0000000..15c4c53
--- /dev/null
+++ b/src/pkg/exp/regexp/exec_test.go
@@ -0,0 +1,271 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package regexp
+
+import (
+	"bufio"
+	"compress/gzip"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"utf8"
+)
+
+// TestRE2 tests this package's regexp API against test cases
+// considered during RE2's exhaustive tests, which run all possible
+// regexps over a given set of atoms and operators, up to a given
+// complexity, over all possible strings over a given alphabet,
+// up to a given size.  Rather than try to link with RE2, we read a
+// log file containing the test cases and the expected matches.
+// The log file, re2.txt, is generated by running 'make exhaustive-log'
+// in the open source RE2 distribution.  http://code.google.com/p/re2/
+//
+// The test file format is a sequence of stanzas like:
+//
+//	strings
+//	"abc"
+//	"123x"
+//	regexps
+//	"[a-z]+"
+//	0-3;0-3
+//	-;-
+//	"([0-9])([0-9])([0-9])"
+//	-;-
+//	-;0-3 0-1 1-2 2-3
+//
+// The stanza begins by defining a set of strings, quoted
+// using Go double-quote syntax, one per line.  Then the
+// regexps section gives a sequence of regexps to run on
+// the strings.  In the block that follows a regexp, each line
+// gives the semicolon-separated match results of running
+// the regexp on the corresponding string.
+// Each match result is either a single -, meaning no match, or a
+// space-separated sequence of pairs giving the match and
+// submatch indices.  An unmatched subexpression formats
+// its pair as a single - (not illustrated above).  For now
+// each regexp run produces two match results, one for a
+// ``full match'' that restricts the regexp to matching the entire
+// string or nothing, and one for a ``partial match'' that gives
+// the leftmost first match found in the string.
+//
+// Lines beginning with # are comments.  Lines beginning with
+// a capital letter are test names printed during RE2's test suite
+// and are echoed into t but otherwise ignored.
+//
+// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
+// so we store re2.txt.gz in the repository and decompress it on the fly.
+//
+func TestRE2(t *testing.T) {
+	if testing.Short() {
+		t.Log("skipping TestRE2 during short test")
+		return
+	}
+
+	f, err := os.Open("re2.txt.gz")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+	gz, err := gzip.NewReader(f)
+	if err != nil {
+		t.Fatalf("decompress re2.txt.gz: %v", err)
+	}
+	defer gz.Close()
+	lineno := 0
+	r := bufio.NewReader(gz)
+	var (
+		str       []string
+		input     []string
+		inStrings bool
+		re        *Regexp
+		refull    *Regexp
+		nfail     int
+		ncase     int
+	)
+	for {
+		line, err := r.ReadString('\n')
+		if err != nil {
+			if err == os.EOF {
+				break
+			}
+			t.Fatalf("re2.txt:%d: %v", lineno, err)
+		}
+		line = line[:len(line)-1] // chop \n
+		lineno++
+		switch {
+		case line == "":
+			t.Fatalf("re2.txt:%d: unexpected blank line", lineno)
+		case line[0] == '#':
+			continue
+		case 'A' <= line[0] && line[0] <= 'Z':
+			// Test name.
+			t.Logf("%s\n", line)
+			continue
+		case line == "strings":
+			str = str[:0]
+			inStrings = true
+		case line == "regexps":
+			inStrings = false
+		case line[0] == '"':
+			q, err := strconv.Unquote(line)
+			if err != nil {
+				// Fatal because we'll get out of sync.
+				t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err)
+			}
+			if inStrings {
+				str = append(str, q)
+				continue
+			}
+			// Is a regexp.
+			if len(input) != 0 {
+				t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q)
+			}
+			re, err = tryCompile(q)
+			if err != nil {
+				if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" {
+					// We don't and likely never will support \C; keep going.
+					continue
+				}
+				t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err)
+				if nfail++; nfail >= 100 {
+					t.Fatalf("stopping after %d errors", nfail)
+				}
+				continue
+			}
+			full := `\A(?:` + q + `)\z`
+			refull, err = tryCompile(full)
+			if err != nil {
+				// Fatal because q worked, so this should always work.
+				t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err)
+			}
+			input = str
+		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
+			// A sequence of match results.
+			ncase++
+			if re == nil {
+				// Failed to compile: skip results.
+				continue
+			}
+			if len(input) == 0 {
+				t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno)
+			}
+			var text string
+			text, input = input[0], input[1:]
+			if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
+				// RE2's \B considers every byte position,
+				// so it sees 'not word boundary' in the
+				// middle of UTF-8 sequences.  This package
+				// only considers the positions between runes,
+				// so it disagrees.  Skip those cases.
+				continue
+			}
+			res := strings.Split(line, ";")
+			if len(res) != 2 {
+				t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res))
+			}
+			// res[0] is full match
+			// res[1] is partial match
+			// Run partial match first; don't bother with full if partial fails.
+			have := re.FindStringSubmatchIndex(text)
+			want := parseResult(t, lineno, res[1])
+			if !same(have, want) {
+				t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want)
+				if nfail++; nfail >= 100 {
+					t.Fatalf("stopping after %d errors", nfail)
+				}
+				continue
+			}
+			have = refull.FindStringSubmatchIndex(text)
+			want = parseResult(t, lineno, res[0])
+			if !same(have, want) {
+				t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want)
+				if nfail++; nfail >= 100 {
+					t.Fatalf("stopping after %d errors", nfail)
+				}
+			}
+		default:
+			t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line)
+		}
+	}
+	if len(input) != 0 {
+		t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input))
+	}
+	t.Logf("%d cases tested", ncase)
+}
+
+func isSingleBytes(s string) bool {
+	for _, c := range s {
+		if c >= utf8.RuneSelf {
+			return false
+		}
+	}
+	return true
+}
+
+func tryCompile(s string) (re *Regexp, err os.Error) {
+	// Protect against panic during Compile.
+	defer func() {
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic: %v", r)
+		}
+	}()
+	return Compile(s)
+}
+
+func parseResult(t *testing.T, lineno int, res string) []int {
+	// A single - indicates no match.
+	if res == "-" {
+		return nil
+	}
+	// Otherwise, a space-separated list of pairs.
+	n := 1
+	for j := 0; j < len(res); j++ {
+		if res[j] == ' ' {
+			n++
+		}
+	}
+	out := make([]int, 2*n)
+	i := 0
+	n = 0
+	for j := 0; j <= len(res); j++ {
+		if j == len(res) || res[j] == ' ' {
+			// Process a single pair.  - means no submatch.
+			pair := res[i:j]
+			if pair == "-" {
+				out[n] = -1
+				out[n+1] = -1
+			} else {
+				k := strings.Index(pair, "-")
+				if k < 0 {
+					t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
+				}
+				lo, err1 := strconv.Atoi(pair[:k])
+				hi, err2 := strconv.Atoi(pair[k+1:])
+				if err1 != nil || err2 != nil || lo > hi {
+					t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair)
+				}
+				out[n] = lo
+				out[n+1] = hi
+			}
+			n += 2
+			i = j + 1
+		}
+	}
+	return out
+}
+
+func same(x, y []int) bool {
+	if len(x) != len(y) {
+		return false
+	}
+	for i, xi := range x {
+		if xi != y[i] {
+			return false
+		}
+	}
+	return true
+}