cmd/compile: add s390x intrinsics for Ceil, Floor, Round and Trunc

Ceil, Floor and Trunc are pre-existing intrinsics. Round is a new
function in the math package and is added as an intrinsic in this CL.
On s390x each of the four functions can be implemented as a single
'LOAD FP INTEGER' instruction, FIDBR, with the rounding mode selected
by the instruction's immediate operand.

name   old time/op  new time/op  delta
Ceil   2.34ns ± 0%  0.85ns ± 0%  -63.74%  (p=0.000 n=5+4)
Floor  2.33ns ± 0%  0.85ns ± 1%  -63.35%  (p=0.008 n=5+5)
Round  4.23ns ± 0%  0.85ns ± 0%  -79.89%  (p=0.000 n=5+4)
Trunc  2.35ns ± 0%  0.85ns ± 0%  -63.83%  (p=0.029 n=4+4)

Change-Id: Idee7ba24a2899d12bf9afee4eedd6b4aaad3c510
Reviewed-on: https://go-review.googlesource.com/63890
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
index 8a627e7..d03ca32 100644
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -107,7 +107,12 @@
 (Bswap64 x) -> (MOVDBR x)
 (Bswap32 x) -> (MOVWBR x)
 
-(Sqrt x) -> (FSQRT x)
+// math package intrinsics
+(Sqrt  x) -> (FSQRT x)
+(Floor x) -> (FIDBR [7] x)
+(Ceil  x) -> (FIDBR [6] x)
+(Trunc x) -> (FIDBR [5] x)
+(Round x) -> (FIDBR [1] x)
 
 // Atomic loads.
 (AtomicLoad32 ptr mem) -> (MOVWZatomicload ptr mem)
diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go
index 2a08a27..b330398 100644
--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -206,6 +206,17 @@
 		{name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", resultInArg0: true},                                             // fp32 arg1 * arg2 - arg0
 		{name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB", resultInArg0: true},                                               // fp64 arg1 * arg2 - arg0
 
+		// Round arg0 to an integral value (result is still a float64), float64 only; aux selects the rounding mode.
+		//
+		// aux | rounding mode
+		// ----+-----------------------------------
+		//   1 | round to nearest, ties away from 0
+		//   4 | round to nearest, ties to even
+		//   5 | round toward 0
+		//   6 | round toward +∞
+		//   7 | round toward -∞
+		{name: "FIDBR", argLength: 1, reg: fp11, asm: "FIDBR", aux: "Int8"},
+
 		{name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
 		{name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
 		{name: "FMOVSconst", reg: fp01, asm: "FMOVS", aux: "Float32", rematerializeable: true},                               // fp32 constant
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
index 6f8d10a..2967d29 100644
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -255,10 +255,23 @@
 	{name: "PopCount32", argLength: 1}, // Count bits in arg[0]
 	{name: "PopCount64", argLength: 1}, // Count bits in arg[0]
 
-	{name: "Sqrt", argLength: 1},  // sqrt(arg0), float64 only
-	{name: "Floor", argLength: 1}, // floor(arg0), float64 only
-	{name: "Ceil", argLength: 1},  // ceil(arg0), float64 only
-	{name: "Trunc", argLength: 1}, // trunc(arg0), float64 only
+	// Square root, float64 only.
+	// Special cases:
+	//   +∞  → +∞
+	//   ±0  → ±0 (sign preserved)
+	//   x<0 → NaN
+	//   NaN → NaN
+	{name: "Sqrt", argLength: 1}, // √arg0
+
+	// Round to an integral value (the result is still a float64), float64 only.
+	// Special cases:
+	//   ±∞  → ±∞ (sign preserved)
+	//   ±0  → ±0 (sign preserved)
+	//   NaN → NaN
+	{name: "Floor", argLength: 1}, // round arg0 toward -∞
+	{name: "Ceil", argLength: 1},  // round arg0 toward +∞
+	{name: "Trunc", argLength: 1}, // round arg0 toward 0
+	{name: "Round", argLength: 1}, // round arg0 to nearest, ties away from 0
 
 	// Data movement, max argument length for Phi is indefinite so just pick
 	// a really large number