cmd/6g: fix crash in cgen_bmul.

Used to print:
../test/torture.go:116: internal compiler error: bad width: 0463 (../test/torture.go:116) MOVB    ,BX (0, 8)

R=nigeltao, rsc
CC=golang-dev
https://golang.org/cl/6736068
diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c
index 85415b5..74fd0f7 100644
--- a/src/cmd/6g/ggen.c
+++ b/src/cmd/6g/ggen.c
@@ -984,15 +984,10 @@
 void
 cgen_bmul(int op, Node *nl, Node *nr, Node *res)
 {
-	Node n1, n2, *tmp;
+	Node n1, n2, n1b, n2b, *tmp;
 	Type *t;
 	int a;
 
-	// copy from byte to full registers
-	t = types[TUINT64];
-	if(issigned[nl->type->etype])
-		t = types[TINT64];
-
 	// largest ullman on left.
 	if(nl->ullman < nr->ullman) {
 		tmp = nl;
@@ -1000,15 +995,25 @@
 		nr = tmp;
 	}
 
-	regalloc(&n1, t, res);
-	cgen(nl, &n1);
-	regalloc(&n2, t, N);
-	cgen(nr, &n2);
+	// generate operands in "8-bit" registers.
+	regalloc(&n1b, nl->type, res);
+	cgen(nl, &n1b);
+	regalloc(&n2b, nr->type, N);
+	cgen(nr, &n2b);
+
+	// perform full-width multiplication.
+	t = types[TUINT64];
+	if(issigned[nl->type->etype])
+		t = types[TINT64];
+	nodreg(&n1, t, n1b.val.u.reg);
+	nodreg(&n2, t, n2b.val.u.reg);
 	a = optoas(op, t);
 	gins(a, &n2, &n1);
-	regfree(&n2);
+
+	// truncate.
 	gmove(&n1, res);
-	regfree(&n1);
+	regfree(&n1b);
+	regfree(&n2b);
 }
 
 void
diff --git a/test/torture.go b/test/torture.go
index 60870c3..4bce3a1 100644
--- a/test/torture.go
+++ b/test/torture.go
@@ -58,6 +58,64 @@
 		m[0][3]*m[1][2]*m[2][1]*m[3][0]
 }
 
+// Compute the determinant of a 4x4-matrix by the sum
+// over all index permutations.
+func determinantInt(m [4][4]int) int {
+	return m[0][0]*m[1][1]*m[2][2]*m[3][3] -
+		m[0][0]*m[1][1]*m[2][3]*m[3][2] -
+		m[0][0]*m[1][2]*m[2][1]*m[3][3] +
+		m[0][0]*m[1][2]*m[2][3]*m[3][1] +
+		m[0][0]*m[1][3]*m[2][1]*m[3][2] -
+		m[0][0]*m[1][3]*m[2][2]*m[3][1] -
+		m[0][1]*m[1][0]*m[2][2]*m[3][3] +
+		m[0][1]*m[1][0]*m[2][3]*m[3][2] +
+		m[0][1]*m[1][2]*m[2][0]*m[3][3] -
+		m[0][1]*m[1][2]*m[2][3]*m[3][0] -
+		m[0][1]*m[1][3]*m[2][0]*m[3][2] +
+		m[0][1]*m[1][3]*m[2][2]*m[3][0] +
+		m[0][2]*m[1][0]*m[2][1]*m[3][3] -
+		m[0][2]*m[1][0]*m[2][3]*m[3][1] -
+		m[0][2]*m[1][1]*m[2][0]*m[3][3] +
+		m[0][2]*m[1][1]*m[2][3]*m[3][0] +
+		m[0][2]*m[1][3]*m[2][0]*m[3][1] -
+		m[0][2]*m[1][3]*m[2][1]*m[3][0] -
+		m[0][3]*m[1][0]*m[2][1]*m[3][2] +
+		m[0][3]*m[1][0]*m[2][2]*m[3][1] +
+		m[0][3]*m[1][1]*m[2][0]*m[3][2] -
+		m[0][3]*m[1][1]*m[2][2]*m[3][0] -
+		m[0][3]*m[1][2]*m[2][0]*m[3][1] +
+		m[0][3]*m[1][2]*m[2][1]*m[3][0]
+}
+
+// Compute the determinant of a 4x4-matrix by the sum
+// over all index permutations.
+func determinantByte(m [4][4]byte) byte {
+	return m[0][0]*m[1][1]*m[2][2]*m[3][3] -
+		m[0][0]*m[1][1]*m[2][3]*m[3][2] -
+		m[0][0]*m[1][2]*m[2][1]*m[3][3] +
+		m[0][0]*m[1][2]*m[2][3]*m[3][1] +
+		m[0][0]*m[1][3]*m[2][1]*m[3][2] -
+		m[0][0]*m[1][3]*m[2][2]*m[3][1] -
+		m[0][1]*m[1][0]*m[2][2]*m[3][3] +
+		m[0][1]*m[1][0]*m[2][3]*m[3][2] +
+		m[0][1]*m[1][2]*m[2][0]*m[3][3] -
+		m[0][1]*m[1][2]*m[2][3]*m[3][0] -
+		m[0][1]*m[1][3]*m[2][0]*m[3][2] +
+		m[0][1]*m[1][3]*m[2][2]*m[3][0] +
+		m[0][2]*m[1][0]*m[2][1]*m[3][3] -
+		m[0][2]*m[1][0]*m[2][3]*m[3][1] -
+		m[0][2]*m[1][1]*m[2][0]*m[3][3] +
+		m[0][2]*m[1][1]*m[2][3]*m[3][0] +
+		m[0][2]*m[1][3]*m[2][0]*m[3][1] -
+		m[0][2]*m[1][3]*m[2][1]*m[3][0] -
+		m[0][3]*m[1][0]*m[2][1]*m[3][2] +
+		m[0][3]*m[1][0]*m[2][2]*m[3][1] +
+		m[0][3]*m[1][1]*m[2][0]*m[3][2] -
+		m[0][3]*m[1][1]*m[2][2]*m[3][0] -
+		m[0][3]*m[1][2]*m[2][0]*m[3][1] +
+		m[0][3]*m[1][2]*m[2][1]*m[3][0]
+}
+
 // A right-leaning tree of byte multiplications.
 func righttree(a, b, c, d uint8) uint8 {
 	return a * (b * (c * (d *