implement matching
clean up interface equality hack

still needs more tests; checking in for gccgo testing

R=rsc
DELTA=304  (261 added, 14 deleted, 29 changed)
OCL=17128
CL=17135
diff --git a/usr/r/regexp/main.go b/usr/r/regexp/main.go
index d723d87..25ec07a 100644
--- a/usr/r/regexp/main.go
+++ b/usr/r/regexp/main.go
@@ -9,14 +9,131 @@
 	"regexp";
 )
 
-func main() {
-	str := "a*b*c*";
-	if sys.argc() > 1 {
-		str = sys.argv(1);
-	}
-	re, err := regexp.Compile(str);
-	if err != nil {
-		print("error: ", err.String(), "\n");
+var good_re = []string{
+	``
+,	`.`
+,	`^.$`
+,	`a`
+,	`a*`
+,	`a+`
+,	`a?`
+,	`a|b`
+,	`a*|b*`
+,	`(a*|b)(c*|d)`
+,	`[a-z]`
+,	`[a-abc-c\-\]\[]`
+,	`[a-z]+`
+,	`[]`
+,	`[abc]`
+,	`[^1234]`
+}
+
+// TODO: nice to do this with a map but we don't have an iterator
+type StringError struct {
+	re	string;
+	err	*os.Error;
+}
+var bad_re = []StringError{
+	StringError{ `*`,	 	regexp.ErrBareClosure },	
+	StringError{ `(abc`,	regexp.ErrUnmatchedLpar },	
+	StringError{ `abc)`,	regexp.ErrUnmatchedRpar },	
+	StringError{ `x[a-z`,	regexp.ErrUnmatchedLbkt },	
+	StringError{ `abc]`,	regexp.ErrUnmatchedRbkt },	
+	StringError{ `[z-a]`,	regexp.ErrBadRange },	
+	StringError{ `abc\`,	regexp.ErrExtraneousBackslash },	
+	StringError{ `a**`,	regexp.ErrBadClosure },	
+	StringError{ `a*+`,	regexp.ErrBadClosure },	
+	StringError{ `a??`,	regexp.ErrBadClosure },	
+	StringError{ `*`,	 	regexp.ErrBareClosure },	
+	StringError{ `\x`,	regexp.ErrBadBackslash }
+}
+
+type Vec [20]int;
+
+type Tester struct {
+	re	string;
+	text	string;
+	match	Vec;
+}
+
+var matches = []Tester {
+	Tester{ ``,	"",	Vec{0,0, -1,-1} },
+	Tester{ `a`,	"a",	Vec{0,1, -1,-1} },
+	Tester{ `b`,	"abc",	Vec{1,2, -1,-1} },
+	Tester{ `.`,	"a",	Vec{0,1, -1,-1} },
+	Tester{ `.*`,	"abcdef",	Vec{0,6, -1,-1} },
+	Tester{ `^abcd$`,	"abcd",	Vec{0,4, -1,-1} },
+	Tester{ `^bcd'`,	"abcdef",	Vec{-1,-1} },
+	Tester{ `^abcd$`,	"abcde",	Vec{-1,-1} },
+	Tester{ `a+`,	"baaab",	Vec{1, 4, -1,-1} },
+	Tester{ `a*`,	"baaab",	Vec{0, 0, -1,-1} }
+}
+
+func Compile(expr string, error *os.Error) regexp.Regexp {
+	re, err := regexp.Compile(expr);
+	if err != error {
+		print("compiling `", expr, "`; unexpected error: ", err.String(), "\n");
 		sys.exit(1);
 	}
+	return re
+}
+
+func MarkedLen(m *[] int) int {
+	if m == nil {
+		return 0
+	}
+	var i int;
+	for i = 0; i < len(m) && m[i] >= 0; i = i+2 {
+	}
+	return i
+}
+
+func PrintVec(m *[] int) {
+	l := MarkedLen(m);
+	for i := 0; i < l && m[i] >= 0; i = i+2 {
+		print(m[i], ",", m[i+1], " ")
+	}
+}
+
+func Equal(m1, m2 *[]int) bool {
+	l := MarkedLen(m1);
+	if l != MarkedLen(m2) {
+		return false
+	}
+	for i := 0; i < l; i++ {
+		if m1[i] != m2[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func Match(expr string, str string, match *[]int) {
+	re := Compile(expr, nil);
+	m := re.Execute(str);
+	if !Equal(m, match) {
+		print("failure on `", expr, "` matching `", str, "`:\n");
+		PrintVec(m);
+		print("\nshould be:\n");
+		PrintVec(match);
+		print("\n");
+		sys.exit(1);
+	}
+}
+
+func main() {
+	if sys.argc() > 1 {
+		Compile(sys.argv(1), nil);
+		sys.exit(0);
+	}
+	for i := 0; i < len(good_re); i++ {
+		Compile(good_re[i], nil);
+	}
+	for i := 0; i < len(bad_re); i++ {
+		Compile(bad_re[i].re, bad_re[i].err)
+	}
+	for i := 0; i < len(matches); i++ {
+		t := &matches[i];
+		Match(t.re, t.text, &t.match)
+	}
 }
diff --git a/usr/r/regexp/regexp.go b/usr/r/regexp/regexp.go
index c491b26..0a6fd31 100644
--- a/usr/r/regexp/regexp.go
+++ b/usr/r/regexp/regexp.go
@@ -11,6 +11,8 @@
 	"vector";
 )
 
+export var debug = false;
+
 
 export var ErrUnimplemented = os.NewError("unimplemented");
 export var ErrInternal = os.NewError("internal error");
@@ -20,7 +22,6 @@
 export var ErrUnmatchedRbkt = os.NewError("unmatched ']'");
 export var ErrBadRange = os.NewError("bad range in character class");
 export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
-export var ErrEmpty = os.NewError("empty subexpression or alternation");
 export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
 export var ErrBareClosure = os.NewError("closure applies to nothing");
 export var ErrBadBackslash = os.NewError("illegal backslash escape");
@@ -41,10 +42,11 @@
 	error	*os.Error;	// compile- or run-time error; nil if OK
 	inst	*vector.Vector;
 	start	Inst;
+	nbra	int;	// number of brackets in expression, for subexpressions
 }
 
 const (
-	START	// beginning of program: indexer to start
+	START	// beginning of program
 		= iota;
 	END;		// end of program: success
 	BOT;		// '^' beginning of text
@@ -113,8 +115,8 @@
 // --- CHAR a regular character
 type Char struct {
 	next	Inst;
-	char	int;
 	index	int;
+	char	int;
 }
 
 func (char *Char) Type() int { return CHAR }
@@ -143,7 +145,7 @@
 	ranges	*vector.Vector;
 }
 
-func (cclass *CharClass) Type() int { return CHAR }
+func (cclass *CharClass) Type() int { return CHARCLASS }
 func (cclass *CharClass) Next() Inst { return cclass.next }
 func (cclass *CharClass) SetNext(i Inst) { cclass.next = i }
 func (cclass *CharClass) Index() int { return cclass.index }
@@ -170,6 +172,17 @@
 	cclass.ranges.Append(b);
 }
 
+func (cclass *CharClass) Matches(c int) bool {
+	for i := 0; i < cclass.ranges.Len(); i = i+2 {
+		min := cclass.ranges.At(i).(CClassChar);
+		max := cclass.ranges.At(i+1).(CClassChar);
+		if min <= c && c <= max {
+			return !cclass.negate
+		}
+	}
+	return cclass.negate
+}
+
 func NewCharClass() *CharClass {
 	c := new(CharClass);
 	c.ranges = vector.New();
@@ -210,7 +223,7 @@
 	n	int;	// subexpression number
 }
 
-func (ebra *Ebra) Type() int { return BRA }
+func (ebra *Ebra) Type() int { return EBRA }
 func (ebra *Ebra) Next() Inst { return ebra.next }
 func (ebra *Ebra) SetNext(i Inst) { ebra.next = i }
 func (ebra *Ebra) Index() int { return ebra.index }
@@ -259,7 +272,6 @@
 
 type Parser struct {
 	re	*RE;
-	nbra	int;	// number of brackets in expression, for subexpressions
 	nlpar	int;	// number of unclosed lpars
 	pos	int;
 	ch	int;
@@ -314,16 +326,6 @@
 var NULL Inst
 type BUGinter interface{}
 
-// same as i == NULL.  TODO: remove when 6g lets me do i == NULL
-func isNULL(i Inst) bool {
-	return sys.BUG_intereq(i.(BUGinter), NULL.(BUGinter))
-}
-
-// same as i == j.  TODO: remove when 6g lets me do i == j
-func isEQ(i,j Inst) bool {
-	return sys.BUG_intereq(i.(BUGinter), j.(BUGinter))
-}
-
 func special(c int) bool {
 	s := `\.+*?()|[]`;
 	for i := 0; i < len(s); i++ {
@@ -437,15 +439,15 @@
 		}
 		p.nlpar--;
 		p.nextc();
-		p.nbra++;
 		bra := new(Bra);
 		p.re.Add(bra);
 		ebra := new(Ebra);
 		p.re.Add(ebra);
-		bra.n = p.nbra;
-		ebra.n = p.nbra;
-		if isNULL(start) {
-			if !isNULL(end) { p.re.Error(ErrInternal) }
+		p.re.nbra++;	// increment first so first subexpr is \1
+		bra.n = p.re.nbra;
+		ebra.n = p.re.nbra;
+		if start == NULL {
+			if end == NULL { p.re.Error(ErrInternal) }
 			start = ebra
 		} else {
 			end.SetNext(ebra);
@@ -476,7 +478,7 @@
 
 func (p *Parser) Closure() (start, end Inst) {
 	start, end = p.Term();
-	if isNULL(start) {
+	if start == NULL {
 		return start, end
 	}
 	switch p.c() {
@@ -521,13 +523,13 @@
 	for {
 		nstart, nend := p.Closure();
 		switch {
-		case isNULL(nstart):	// end of this concatenation
-			if isNULL(start) {	// this is the empty string
+		case nstart == NULL:	// end of this concatenation
+			if start == NULL {	// this is the empty string
 				nop := p.re.Add(new(Nop));
 				return nop, nop;
 			}
 			return start, end;
-		case isNULL(start):	// this is first element of concatenation
+		case start == NULL:	// this is first element of concatenation
 			start, end = nstart, nend;
 		default:
 			end.SetNext(nstart);
@@ -602,17 +604,20 @@
 	re.start = start;
 	e.SetNext(re.Add(new(End)));
 
-	re.Dump();
-	println();
+	if debug {
+		re.Dump();
+		println();
+	}
 
 	re.EliminateNops();
 
-	re.Dump();
-	println();
-
-	re.Error(ErrUnimplemented);
+	if debug {
+		re.Dump();
+		println();
+	}
 }
 
+
 func Compiler(str string, ch *chan *RE) {
 	re := new(RE);
 	re.expr = str;
@@ -624,13 +629,138 @@
 
 // Public interface has only execute functionality (not yet implemented)
 export type Regexp interface {
-	// Execute() bool
+	Execute(s string) *[]int
 }
 
-// compile in separate goroutine; wait for result
+// Compile in separate goroutine; wait for result
 export func Compile(str string) (regexp Regexp, error *os.Error) {
 	ch := new(chan *RE);
 	go Compiler(str, ch);
 	re := <-ch;
 	return re, re.error
 }
+
+type State struct {
+	inst	Inst;	// next instruction to execute
+	match	*[]int;	// pairs of bracketing submatches. 0th is start,end
+}
+
+// Append new state to to-do list.  Leftmost-longest wins so avoid
+// adding a state that's already active.
+func AddState(s *[]State, inst Inst, match *[]int) *[]State {
+	index := inst.Index();
+	l := len(s);
+	pos := match[0];
+	// TODO: Once the state is a vector and we can do insert, have inputs always
+	// go in order correctly and this "earlier" test is never necessary,
+	for i := 0; i < l; i++ {
+		if s[i].inst.Index() == index && // same instruction
+		   s[i].match[0] < pos {	// earlier match already going; lefmost wins
+		   	return s
+		 }
+	}
+	if l == cap(s) {
+		s1 := new([]State, 2*l)[0:l];
+		for i := 0; i < l; i++ {
+			s1[i] = s[i];
+		}
+		s = s1;
+	}
+	s = s[0:l+1];
+	s[l].inst = inst;
+	s[l].match = match;
+	return s;
+}
+
+func (re *RE) DoExecute(str string, pos int) *[]int {
+	var s [2]*[]State;	// TODO: use a vector when State values (not ptrs) can be vector elements
+	s[0] = new([]State, 10)[0:0];
+	s[1] = new([]State, 10)[0:0];
+	in, out := 0, 1;
+	var final State;
+	found := false;
+	for pos <= len(str) {
+		if !found {
+			// prime the pump if we haven't seen a match yet
+			match := new([]int, 2*(re.nbra+1));
+			match[0]  = pos;
+			s[out] = AddState(s[out], re.start.Next(), match);
+		}
+		in, out = out, in;	// old out state is new in state
+		s[out] = s[out][0:0];	// clear out state
+		if len(s[in]) == 0 {
+			// machine has completed
+			break;
+		}
+		c := EOF;
+		if pos < len(str) {
+			c = int(str[pos])
+		}
+//println("position ", pos, "char", string(c), "in", in, "out", out, "len in", len(s[in]));
+		for i := 0; i < len(s[in]); i++ {
+			state := s[in][i];
+//state.inst.Print(); print("\n");
+			switch s[in][i].inst.Type() {
+			case BOT:
+				if pos == 0 {
+					s[in] = AddState(s[in], state.inst.Next(), state.match)
+				}
+			case EOT:
+				if pos == len(str) {
+					s[in] = AddState(s[in], state.inst.Next(), state.match)
+				}
+			case CHAR:
+				if c == state.inst.(*Char).char {
+					s[out] = AddState(s[out], state.inst.Next(), state.match)
+				}
+			case CHARCLASS:
+				if state.inst.(*CharClass).Matches(c) {
+					s[out] = AddState(s[out], state.inst.Next(), state.match)
+				}
+			case ANY:
+				if c != EOF {
+					s[out] = AddState(s[out], state.inst.Next(), state.match)
+				}
+			case BRA:
+				n := state.inst.(*Bra).n;
+				state.match[2*n] = pos;
+				s[in] = AddState(s[in], state.inst.Next(), state.match);
+			case EBRA:
+				n := state.inst.(*Ebra).n;
+				state.match[2*n+1] = pos;
+				s[in] = AddState(s[in], state.inst.Next(), state.match);
+			case ALT:
+				s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
+				// give other branch a copy of this match vector
+				s1 := new([]int, 2*(re.nbra+1));
+				for i := 0; i < len(s1); i++ {
+					s1[i] = state.match[i]
+				}
+				s[in] = AddState(s[in], state.inst.Next(), s1);
+			case END:
+				// choose leftmost longest
+				if !found ||	// first
+				   state.match[0] < final.match[0] ||	// leftmost
+				   (state.match[0] == final.match[0] && pos > final.match[1])  {	// longest
+					final = state;
+					final.match[1] = pos;
+				}
+				found = true;
+			default:
+				state.inst.Print();
+				panic("unknown instruction in execute");
+			}
+		}
+		pos++;
+	}
+	if !found {
+		return nil
+	}
+//if found { println("found: from ", final.match[0], "to", final.match[1] )}
+	return final.match;
+}
+
+
+func (re *RE) Execute(s string) *[]int {
+	return re.DoExecute(s, 0)
+}