[dev.ssa] cmd/compile: Use movups for xmm->xmm mov

Movups is 1 byte smaller than movapd that we currently use.

Change-Id: I22f771f066529352722a28543535ec43497cb9c5
Reviewed-on: https://go-review.googlesource.com/19938
Reviewed-by: David Chase <drchase@google.com>
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 8e68c20..b46016f 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -4999,7 +4999,9 @@
 	if t.IsFloat() {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
-		return x86.AMOVAPD
+		// There is no xmm->xmm move with 1 byte opcode,
+		// so use movups, which has 2 byte opcode.
+		return x86.AMOVUPS
 	} else {
 		switch t.Size() {
 		case 1: