[dev.ssa] cmd/compile: add FP comparison ops

Basic ops only; no particular optimizations in the pattern
matching yet (e.g. recognizing x != x for NaN detection,
comparisons against constants, etc.).
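
An illustrative snippet (not part of this change; the function names
are made up) of ordinary Go code that now compiles through the new
ops:

	package main

	import "fmt"

	// The x != x idiom goes through the new OpNeq64F and should end
	// up as a UCOMISD comparison on amd64; it is not pattern-matched
	// specially yet.
	func isNaN64(f float64) bool {
		return f != f
	}

	// Ordered float32 comparison, via OpLess32F / UCOMISS.
	func less32(a, b float32) bool {
		return a < b
	}

	func main() {
		fmt.Println(isNaN64(0.0), less32(1.5, 2.5))
	}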

Change-Id: I0043564081d6dc0eede876c4a9eb3c33cbd1521c
Reviewed-on: https://go-review.googlesource.com/13704
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 8e44ede..676de23 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -848,6 +848,8 @@
 	opAndType{OEQ, TCHAN}:      ssa.OpEqPtr,
 	opAndType{OEQ, TUINTPTR}:   ssa.OpEqPtr,
 	opAndType{OEQ, TUNSAFEPTR}: ssa.OpEqPtr,
+	opAndType{OEQ, TFLOAT64}:   ssa.OpEq64F,
+	opAndType{OEQ, TFLOAT32}:   ssa.OpEq32F,
 
 	opAndType{ONE, TBOOL}:      ssa.OpNeq8,
 	opAndType{ONE, TINT8}:      ssa.OpNeq8,
@@ -866,42 +868,52 @@
 	opAndType{ONE, TCHAN}:      ssa.OpNeqPtr,
 	opAndType{ONE, TUINTPTR}:   ssa.OpNeqPtr,
 	opAndType{ONE, TUNSAFEPTR}: ssa.OpNeqPtr,
+	opAndType{ONE, TFLOAT64}:   ssa.OpNeq64F,
+	opAndType{ONE, TFLOAT32}:   ssa.OpNeq32F,
 
-	opAndType{OLT, TINT8}:   ssa.OpLess8,
-	opAndType{OLT, TUINT8}:  ssa.OpLess8U,
-	opAndType{OLT, TINT16}:  ssa.OpLess16,
-	opAndType{OLT, TUINT16}: ssa.OpLess16U,
-	opAndType{OLT, TINT32}:  ssa.OpLess32,
-	opAndType{OLT, TUINT32}: ssa.OpLess32U,
-	opAndType{OLT, TINT64}:  ssa.OpLess64,
-	opAndType{OLT, TUINT64}: ssa.OpLess64U,
+	opAndType{OLT, TINT8}:    ssa.OpLess8,
+	opAndType{OLT, TUINT8}:   ssa.OpLess8U,
+	opAndType{OLT, TINT16}:   ssa.OpLess16,
+	opAndType{OLT, TUINT16}:  ssa.OpLess16U,
+	opAndType{OLT, TINT32}:   ssa.OpLess32,
+	opAndType{OLT, TUINT32}:  ssa.OpLess32U,
+	opAndType{OLT, TINT64}:   ssa.OpLess64,
+	opAndType{OLT, TUINT64}:  ssa.OpLess64U,
+	opAndType{OLT, TFLOAT64}: ssa.OpLess64F,
+	opAndType{OLT, TFLOAT32}: ssa.OpLess32F,
 
-	opAndType{OGT, TINT8}:   ssa.OpGreater8,
-	opAndType{OGT, TUINT8}:  ssa.OpGreater8U,
-	opAndType{OGT, TINT16}:  ssa.OpGreater16,
-	opAndType{OGT, TUINT16}: ssa.OpGreater16U,
-	opAndType{OGT, TINT32}:  ssa.OpGreater32,
-	opAndType{OGT, TUINT32}: ssa.OpGreater32U,
-	opAndType{OGT, TINT64}:  ssa.OpGreater64,
-	opAndType{OGT, TUINT64}: ssa.OpGreater64U,
+	opAndType{OGT, TINT8}:    ssa.OpGreater8,
+	opAndType{OGT, TUINT8}:   ssa.OpGreater8U,
+	opAndType{OGT, TINT16}:   ssa.OpGreater16,
+	opAndType{OGT, TUINT16}:  ssa.OpGreater16U,
+	opAndType{OGT, TINT32}:   ssa.OpGreater32,
+	opAndType{OGT, TUINT32}:  ssa.OpGreater32U,
+	opAndType{OGT, TINT64}:   ssa.OpGreater64,
+	opAndType{OGT, TUINT64}:  ssa.OpGreater64U,
+	opAndType{OGT, TFLOAT64}: ssa.OpGreater64F,
+	opAndType{OGT, TFLOAT32}: ssa.OpGreater32F,
 
-	opAndType{OLE, TINT8}:   ssa.OpLeq8,
-	opAndType{OLE, TUINT8}:  ssa.OpLeq8U,
-	opAndType{OLE, TINT16}:  ssa.OpLeq16,
-	opAndType{OLE, TUINT16}: ssa.OpLeq16U,
-	opAndType{OLE, TINT32}:  ssa.OpLeq32,
-	opAndType{OLE, TUINT32}: ssa.OpLeq32U,
-	opAndType{OLE, TINT64}:  ssa.OpLeq64,
-	opAndType{OLE, TUINT64}: ssa.OpLeq64U,
+	opAndType{OLE, TINT8}:    ssa.OpLeq8,
+	opAndType{OLE, TUINT8}:   ssa.OpLeq8U,
+	opAndType{OLE, TINT16}:   ssa.OpLeq16,
+	opAndType{OLE, TUINT16}:  ssa.OpLeq16U,
+	opAndType{OLE, TINT32}:   ssa.OpLeq32,
+	opAndType{OLE, TUINT32}:  ssa.OpLeq32U,
+	opAndType{OLE, TINT64}:   ssa.OpLeq64,
+	opAndType{OLE, TUINT64}:  ssa.OpLeq64U,
+	opAndType{OLE, TFLOAT64}: ssa.OpLeq64F,
+	opAndType{OLE, TFLOAT32}: ssa.OpLeq32F,
 
-	opAndType{OGE, TINT8}:   ssa.OpGeq8,
-	opAndType{OGE, TUINT8}:  ssa.OpGeq8U,
-	opAndType{OGE, TINT16}:  ssa.OpGeq16,
-	opAndType{OGE, TUINT16}: ssa.OpGeq16U,
-	opAndType{OGE, TINT32}:  ssa.OpGeq32,
-	opAndType{OGE, TUINT32}: ssa.OpGeq32U,
-	opAndType{OGE, TINT64}:  ssa.OpGeq64,
-	opAndType{OGE, TUINT64}: ssa.OpGeq64U,
+	opAndType{OGE, TINT8}:    ssa.OpGeq8,
+	opAndType{OGE, TUINT8}:   ssa.OpGeq8U,
+	opAndType{OGE, TINT16}:   ssa.OpGeq16,
+	opAndType{OGE, TUINT16}:  ssa.OpGeq16U,
+	opAndType{OGE, TINT32}:   ssa.OpGeq32,
+	opAndType{OGE, TUINT32}:  ssa.OpGeq32U,
+	opAndType{OGE, TINT64}:   ssa.OpGeq64,
+	opAndType{OGE, TUINT64}:  ssa.OpGeq64U,
+	opAndType{OGE, TFLOAT64}: ssa.OpGeq64F,
+	opAndType{OGE, TFLOAT32}: ssa.OpGeq32F,
 
 	opAndType{OLROT, TUINT8}:  ssa.OpLrot8,
 	opAndType{OLROT, TUINT16}: ssa.OpLrot16,
@@ -2198,7 +2210,7 @@
 }
 
 // opregreg emits instructions for
-//     dest := dest op src
+//     dest := dest(To) op src(From)
 // and also returns the created obj.Prog so it
 // may be further adjusted (offset, scale, etc).
 func opregreg(op int, dest, src int16) *obj.Prog {
@@ -2522,11 +2534,11 @@
 		p.To.Reg = regnum(v)
 	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
 		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB:
-		p := Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_REG
-		p.From.Reg = regnum(v.Args[0])
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = regnum(v.Args[1])
+		opregreg(v.Op.Asm(), regnum(v.Args[1]), regnum(v.Args[0]))
+	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
+		// The Go assembler's operand order for UCOMISx is swapped
+		// relative to CMP; account for that here.
+		opregreg(v.Op.Asm(), regnum(v.Args[0]), regnum(v.Args[1]))
 	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst,
 		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
 		p := Prog(v.Op.Asm())
@@ -2763,11 +2775,34 @@
 	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
 		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
 		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
+		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
 		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
+		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
 		ssa.OpAMD64SETA, ssa.OpAMD64SETAE:
 		p := Prog(v.Op.Asm())
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = regnum(v)
+
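+	// UCOMISS/UCOMISD leave the result in ZF and PF: x == y only when
+	// ZF is set and PF is clear, and x != y when ZF is clear or PF is
+	// set (unordered operand, i.e. NaN). SETEQF and SETNEF therefore
+	// combine two SETcc results.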
+	case ssa.OpAMD64SETNEF:
+		p := Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = regnum(v)
+		q := Prog(x86.ASETPS)
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = x86.REG_AX
+		// TODO AORQ copied from old code generator, why not AORB?
+		opregreg(x86.AORQ, regnum(v), x86.REG_AX)
+
+	case ssa.OpAMD64SETEQF:
+		p := Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = regnum(v)
+		q := Prog(x86.ASETPC)
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = x86.REG_AX
+		// TODO AANDQ copied from old code generator, why not AANDB?
+		opregreg(x86.AANDQ, regnum(v), x86.REG_AX)
+
 	case ssa.OpAMD64InvertFlags:
 		v.Fatalf("InvertFlags should never make it to codegen %v", v)
 	case ssa.OpAMD64REPSTOSQ:
@@ -2808,7 +2843,9 @@
 	return nleft, offset
 }
 
-var blockJump = [...]struct{ asm, invasm int }{
+var blockJump = [...]struct {
+	asm, invasm int
+}{
 	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
 	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
 	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
@@ -2819,6 +2856,63 @@
 	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
 	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
 	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
+	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
+	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
+}
+
+type floatingEQNEJump struct {
+	jump, index int
+}
+
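+// EQF and NEF cannot be decided by a single conditional jump, because a
+// NaN operand leaves the comparison unordered (PF set); each needs a
+// pair of jumps. The tables below give that pair, indexed by which
+// successor follows the block in the layout.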
+var eqfJumps = [2][2]floatingEQNEJump{
+	{{x86.AJNE, 1}, {x86.AJPS, 1}}, // next == b.Succs[0]
+	{{x86.AJNE, 1}, {x86.AJPC, 0}}, // next == b.Succs[1]
+}
+var nefJumps = [2][2]floatingEQNEJump{
+	{{x86.AJNE, 0}, {x86.AJPC, 1}}, // next == b.Succs[0]
+	{{x86.AJNE, 0}, {x86.AJPS, 0}}, // next == b.Succs[1]
+}
+
+func oneFPJump(b *ssa.Block, jumps *floatingEQNEJump, likely ssa.BranchPrediction, branches []branch) []branch {
+	p := Prog(jumps.jump)
+	p.To.Type = obj.TYPE_BRANCH
+	to := jumps.index
+	branches = append(branches, branch{p, b.Succs[to]})
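+	// likely describes b.Succs[0]; if this jump targets b.Succs[1],
+	// the prediction for the jump itself is the opposite.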
+	if to == 1 {
+		likely = -likely
+	}
+	// liblink reorders the instruction stream as it sees fit.
+	// Pass along what we know so liblink can make use of it.
+	// TODO: Once we've fully switched to SSA,
+	// make liblink leave our output alone.
+	switch likely {
+	case ssa.BranchUnlikely:
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 0
+	case ssa.BranchLikely:
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+	}
+	return branches
+}
+
+func genFPJump(b, next *ssa.Block, jumps *[2][2]floatingEQNEJump, branches []branch) []branch {
+	likely := b.Likely
+	switch next {
+	case b.Succs[0]:
+		branches = oneFPJump(b, &jumps[0][0], likely, branches)
+		branches = oneFPJump(b, &jumps[0][1], likely, branches)
+	case b.Succs[1]:
+		branches = oneFPJump(b, &jumps[1][0], likely, branches)
+		branches = oneFPJump(b, &jumps[1][1], likely, branches)
+	default:
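+		// Neither successor follows this block in the layout; emit the
+		// same pair of jumps as the Succs[1]-is-next case, then jump
+		// to Succs[1] unconditionally.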
+		branches = oneFPJump(b, &jumps[1][0], likely, branches)
+		branches = oneFPJump(b, &jumps[1][1], likely, branches)
+		q := Prog(obj.AJMP)
+		q.To.Type = obj.TYPE_BRANCH
+		branches = append(branches, branch{q, b.Succs[1]})
+	}
+	return branches
 }
 
 func genBlock(b, next *ssa.Block, branches []branch) []branch {
@@ -2849,12 +2943,18 @@
 			p.To.Type = obj.TYPE_BRANCH
 			branches = append(branches, branch{p, b.Succs[0]})
 		}
+
+	case ssa.BlockAMD64EQF:
+		branches = genFPJump(b, next, &eqfJumps, branches)
+
+	case ssa.BlockAMD64NEF:
+		branches = genFPJump(b, next, &nefJumps, branches)
+
 	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
 		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
 		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
 		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
 		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
-
 		jmp := blockJump[b.Kind]
 		likely := b.Likely
 		var p *obj.Prog