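add tests for character classes, multi-byte runes, and parenthesized subexpressions
fix bugs in the numbering order of () subexpressions and in rune processing during matching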

R=rsc
DELTA=72  (27 added, 5 deleted, 40 changed)
OCL=17147
CL=17147
diff --git a/usr/r/regexp/main.go b/usr/r/regexp/main.go
index 25ec07a..c89f9b5 100644
--- a/usr/r/regexp/main.go
+++ b/usr/r/regexp/main.go
@@ -10,22 +10,22 @@
 )
 
 var good_re = []string{
-	``
-,	`.`
-,	`^.$`
-,	`a`
-,	`a*`
-,	`a+`
-,	`a?`
-,	`a|b`
-,	`a*|b*`
-,	`(a*|b)(c*|d)`
-,	`[a-z]`
-,	`[a-abc-c\-\]\[]`
-,	`[a-z]+`
-,	`[]`
-,	`[abc]`
-,	`[^1234]`
+	``,
+	`.`,
+	`^.$`,
+	`a`,
+	`a*`,
+	`a+`,
+	`a?`,
+	`a|b`,
+	`a*|b*`,
+	`(a*|b)(c*|d)`,
+	`[a-z]`,
+	`[a-abc-c\-\]\[]`,
+	`[a-z]+`,
+	`[]`,
+	`[abc]`,
+	`[^1234]`,
 }
 
 // TODO: nice to do this with a map but we don't have an iterator
@@ -45,7 +45,7 @@
 	StringError{ `a*+`,	regexp.ErrBadClosure },	
 	StringError{ `a??`,	regexp.ErrBadClosure },	
 	StringError{ `*`,	 	regexp.ErrBareClosure },	
-	StringError{ `\x`,	regexp.ErrBadBackslash }
+	StringError{ `\x`,	regexp.ErrBadBackslash },
 }
 
 type Vec [20]int;
@@ -56,17 +56,33 @@
 	match	Vec;
 }
 
+const END = -1000
+
 var matches = []Tester {
-	Tester{ ``,	"",	Vec{0,0, -1,-1} },
-	Tester{ `a`,	"a",	Vec{0,1, -1,-1} },
-	Tester{ `b`,	"abc",	Vec{1,2, -1,-1} },
-	Tester{ `.`,	"a",	Vec{0,1, -1,-1} },
-	Tester{ `.*`,	"abcdef",	Vec{0,6, -1,-1} },
-	Tester{ `^abcd$`,	"abcd",	Vec{0,4, -1,-1} },
-	Tester{ `^bcd'`,	"abcdef",	Vec{-1,-1} },
-	Tester{ `^abcd$`,	"abcde",	Vec{-1,-1} },
-	Tester{ `a+`,	"baaab",	Vec{1, 4, -1,-1} },
-	Tester{ `a*`,	"baaab",	Vec{0, 0, -1,-1} }
+	Tester{ ``,	"",	Vec{0,0, END} },
+	Tester{ `a`,	"a",	Vec{0,1, END} },
+	Tester{ `b`,	"abc",	Vec{1,2, END} },
+	Tester{ `.`,	"a",	Vec{0,1, END} },
+	Tester{ `.*`,	"abcdef",	Vec{0,6, END} },
+	Tester{ `^abcd$`,	"abcd",	Vec{0,4, END} },
+	Tester{ `^bcd'`,	"abcdef",	Vec{END} },
+	Tester{ `^abcd$`,	"abcde",	Vec{END} },
+	Tester{ `a+`,	"baaab",	Vec{1,4, END} },
+	Tester{ `a*`,	"baaab",	Vec{0,0, END} },
+	Tester{ `[a-z]+`,	"abcd",	Vec{0,4, END} },
+	Tester{ `[^a-z]+`,	"ab1234cd",	Vec{2,6, END} },
+	Tester{ `[a\-\]z]+`,	"az]-bcz",	Vec{0,4, END} },
+	Tester{ `[日本語]+`,	"日本語日本語",	Vec{0,18, END} },
+	Tester{ `()`,	"",	Vec{0,0, 0,0, END} },
+	Tester{ `(a)`,	"a",	Vec{0,1, 0,1, END} },
+	Tester{ `(.)(.)`,	"日a",	Vec{0,4, 0,3, 3,4, END} },
+	Tester{ `(.*)`,	"",	Vec{0,0, 0,0, END} },
+	Tester{ `(.*)`,	"abcd",	Vec{0,4, 0,4, END} },
+	Tester{ `(..)(..)`,	"abcd",	Vec{0,4, 0,2, 2,4, END} },
+	Tester{ `(([^xyz]*)(d))`,	"abcd",	Vec{0,4, 0,4, 0,3, 3,4, END} },
+	Tester{ `((a|b|c)*(d))`,	"abcd",	Vec{0,4, 0,4, 2,3, 3,4, END} },
+	Tester{ `(((a|b|c)*)(d))`,	"abcd",	Vec{0,4, 0,4, 0,3, 2,3, 3,4, END} },
+	Tester{ `a*(|(b))c*`,	"aacc",	Vec{0,4, 2,2, -1,-1, END} },
 }
 
 func Compile(expr string, error *os.Error) regexp.Regexp {
@@ -83,15 +99,19 @@
 		return 0
 	}
 	var i int;
-	for i = 0; i < len(m) && m[i] >= 0; i = i+2 {
+	for i = 0; i < len(m) && m[i] != END; i = i+2 {
 	}
 	return i
 }
 
 func PrintVec(m *[] int) {
 	l := MarkedLen(m);
-	for i := 0; i < l && m[i] >= 0; i = i+2 {
-		print(m[i], ",", m[i+1], " ")
+	if l == 0 {
+		print("<no match>");
+	} else {
+		for i := 0; i < l && m[i] != END; i = i+2 {
+			print(m[i], ",", m[i+1], " ")
+		}
 	}
 }
 
@@ -122,6 +142,7 @@
 }
 
 func main() {
+	//regexp.debug = true;
 	if sys.argc() > 1 {
 		Compile(sys.argv(1), nil);
 		sys.exit(0);
diff --git a/usr/r/regexp/regexp.go b/usr/r/regexp/regexp.go
index 0a6fd31..6535e6e 100644
--- a/usr/r/regexp/regexp.go
+++ b/usr/r/regexp/regexp.go
@@ -287,7 +287,6 @@
 	if p.pos >= len(p.re.expr) {
 		p.ch = EOF
 	} else {
-		// TODO: stringotorune should take a string*
 		c, w := sys.stringtorune(p.re.expr, p.pos);
 		p.ch = c;
 		p.pos += w;
@@ -433,6 +432,8 @@
 	case '(':
 		p.nextc();
 		p.nlpar++;
+		p.re.nbra++;	// increment first so first subexpr is \1
+		nbra := p.re.nbra;
 		start, end = p.Regexp();
 		if p.c() != ')' {
 			p.re.Error(ErrUnmatchedLpar);
@@ -443,9 +444,8 @@
 		p.re.Add(bra);
 		ebra := new(Ebra);
 		p.re.Add(ebra);
-		p.re.nbra++;	// increment first so first subexpr is \1
-		bra.n = p.re.nbra;
-		ebra.n = p.re.nbra;
+		bra.n = nbra;
+		ebra.n = nbra;
 		if start == NULL {
 			if end == NULL { p.re.Error(ErrInternal) }
 			start = ebra
@@ -479,7 +479,7 @@
 func (p *Parser) Closure() (start, end Inst) {
 	start, end = p.Term();
 	if start == NULL {
-		return start, end
+		return
 	}
 	switch p.c() {
 	case '*':
@@ -509,13 +509,13 @@
 		start = alt;	// start is now alt
 		end = nop;	// end is nop pointed to by both branches
 	default:
-		return start, end;
+		return
 	}
 	switch p.nextc() {
 	case '*', '+', '?':
 		p.re.Error(ErrBadClosure);
 	}
-	return start, end;
+	return
 }
 
 func (p *Parser) Concatenation() (start, end Inst) {
@@ -528,7 +528,7 @@
 				nop := p.re.Add(new(Nop));
 				return nop, nop;
 			}
-			return start, end;
+			return;
 		case start == NULL:	// this is first element of concatenation
 			start, end = nstart, nend;
 		default:
@@ -544,7 +544,7 @@
 	for {
 		switch p.c() {
 		default:
-			return start, end;
+			return;
 		case '|':
 			p.nextc();
 			nstart, nend := p.Concatenation();
@@ -683,6 +683,9 @@
 		if !found {
 			// prime the pump if we haven't seen a match yet
 			match := new([]int, 2*(re.nbra+1));
+			for i := 0; i < len(match); i++ {
+				match[i] = -1;	// no match seen; catches cases like "a(b)?c" on "ac"
+			}
 			match[0]  = pos;
 			s[out] = AddState(s[out], re.start.Next(), match);
 		}
@@ -692,14 +695,13 @@
 			// machine has completed
 			break;
 		}
+		charwidth := 1;
 		c := EOF;
 		if pos < len(str) {
-			c = int(str[pos])
+			c, charwidth = sys.stringtorune(str, pos);
 		}
-//println("position ", pos, "char", string(c), "in", in, "out", out, "len in", len(s[in]));
 		for i := 0; i < len(s[in]); i++ {
 			state := s[in][i];
-//state.inst.Print(); print("\n");
 			switch s[in][i].inst.Type() {
 			case BOT:
 				if pos == 0 {
@@ -751,12 +753,11 @@
 				panic("unknown instruction in execute");
 			}
 		}
-		pos++;
+		pos += charwidth;
 	}
 	if !found {
 		return nil
 	}
-//if found { println("found: from ", final.match[0], "to", final.match[1] )}
 	return final.match;
 }