cmd/compile: optimize ARM's math.Abs

This CL optimizes math.Abs to an inline ABSD instruction on ARM.

The benchmark results of src/math/ show big improvements.
name                   old time/op  new time/op  delta
Acos-4                  181ns ± 0%   182ns ± 0%   +0.30%  (p=0.000 n=40+40)
Acosh-4                 202ns ± 0%   202ns ± 0%     ~     (all equal)
Asin-4                  163ns ± 0%   163ns ± 0%     ~     (all equal)
Asinh-4                 242ns ± 0%   242ns ± 0%     ~     (all equal)
Atan-4                  120ns ± 0%   121ns ± 0%   +0.83%  (p=0.000 n=40+40)
Atanh-4                 202ns ± 0%   202ns ± 0%     ~     (all equal)
Atan2-4                 173ns ± 0%   173ns ± 0%     ~     (all equal)
Cbrt-4                 1.06µs ± 0%  1.06µs ± 0%   +0.09%  (p=0.000 n=39+37)
Ceil-4                 72.9ns ± 0%  72.8ns ± 0%     ~     (p=0.237 n=40+40)
Copysign-4             13.2ns ± 0%  13.2ns ± 0%     ~     (all equal)
Cos-4                   193ns ± 0%   183ns ± 0%   -5.18%  (p=0.000 n=40+40)
Cosh-4                  254ns ± 0%   239ns ± 0%   -5.91%  (p=0.000 n=40+40)
Erf-4                   112ns ± 0%   112ns ± 0%     ~     (all equal)
Erfc-4                  117ns ± 0%   117ns ± 0%     ~     (all equal)
Erfinv-4                127ns ± 0%   127ns ± 1%     ~     (p=0.492 n=40+40)
Erfcinv-4               128ns ± 0%   128ns ± 0%     ~     (all equal)
Exp-4                   212ns ± 0%   206ns ± 0%   -3.05%  (p=0.000 n=40+40)
ExpGo-4                 216ns ± 0%   209ns ± 0%   -3.24%  (p=0.000 n=40+40)
Expm1-4                 142ns ± 0%   142ns ± 0%     ~     (all equal)
Exp2-4                  191ns ± 0%   184ns ± 0%   -3.45%  (p=0.000 n=40+40)
Exp2Go-4                194ns ± 0%   187ns ± 0%   -3.61%  (p=0.000 n=40+40)
Abs-4                  14.4ns ± 0%   6.3ns ± 0%  -56.39%  (p=0.000 n=38+39)
Dim-4                  12.6ns ± 0%  12.6ns ± 0%     ~     (all equal)
Floor-4                49.6ns ± 0%  49.6ns ± 0%     ~     (all equal)
Max-4                  27.6ns ± 0%  27.6ns ± 0%     ~     (all equal)
Min-4                  27.0ns ± 0%  27.0ns ± 0%     ~     (all equal)
Mod-4                   349ns ± 0%   305ns ± 1%  -12.55%  (p=0.000 n=33+40)
Frexp-4                54.0ns ± 0%  47.1ns ± 0%  -12.78%  (p=0.000 n=38+38)
Gamma-4                 242ns ± 0%   234ns ± 0%   -3.16%  (p=0.000 n=36+40)
Hypot-4                84.8ns ± 0%  67.8ns ± 0%  -20.05%  (p=0.000 n=31+35)
HypotGo-4              88.5ns ± 0%  71.6ns ± 0%  -19.12%  (p=0.000 n=40+38)
Ilogb-4                45.8ns ± 0%  38.9ns ± 0%  -15.12%  (p=0.000 n=40+32)
J0-4                    821ns ± 0%   802ns ± 0%   -2.33%  (p=0.000 n=33+40)
J1-4                    816ns ± 0%   807ns ± 0%   -1.05%  (p=0.000 n=40+29)
Jn-4                   1.67µs ± 0%  1.65µs ± 0%   -1.45%  (p=0.000 n=40+39)
Ldexp-4                61.5ns ± 0%  54.6ns ± 0%  -11.27%  (p=0.000 n=40+32)
Lgamma-4                188ns ± 0%   188ns ± 0%     ~     (all equal)
Log-4                   154ns ± 0%   147ns ± 0%   -4.78%  (p=0.000 n=40+40)
Logb-4                 50.9ns ± 0%  42.7ns ± 0%  -16.11%  (p=0.000 n=34+39)
Log1p-4                 160ns ± 0%   159ns ± 0%     ~     (p=0.828 n=40+40)
Log10-4                 173ns ± 0%   166ns ± 0%   -4.05%  (p=0.000 n=40+40)
Log2-4                 65.3ns ± 0%  58.4ns ± 0%  -10.57%  (p=0.000 n=37+37)
Modf-4                 36.4ns ± 0%  36.4ns ± 0%     ~     (all equal)
Nextafter32-4          36.4ns ± 0%  36.4ns ± 0%     ~     (all equal)
Nextafter64-4          32.7ns ± 0%  32.6ns ± 0%     ~     (p=0.375 n=40+40)
PowInt-4                300ns ± 0%   277ns ± 0%   -7.78%  (p=0.000 n=40+40)
PowFrac-4               676ns ± 0%   635ns ± 0%   -6.00%  (p=0.000 n=40+35)
Pow10Pos-4             17.6ns ± 0%  17.6ns ± 0%     ~     (all equal)
Pow10Neg-4             22.0ns ± 0%  22.0ns ± 0%     ~     (all equal)
Round-4                30.1ns ± 0%  30.1ns ± 0%     ~     (all equal)
RoundToEven-4          38.9ns ± 0%  38.9ns ± 0%     ~     (all equal)
Remainder-4             291ns ± 0%   263ns ± 0%   -9.62%  (p=0.000 n=40+40)
Signbit-4              11.3ns ± 0%  11.3ns ± 0%     ~     (all equal)
Sin-4                   185ns ± 0%   185ns ± 0%     ~     (all equal)
Sincos-4                230ns ± 0%   230ns ± 0%     ~     (all equal)
Sinh-4                  253ns ± 0%   246ns ± 0%   -2.77%  (p=0.000 n=39+39)
SqrtIndirect-4         41.4ns ± 0%  41.4ns ± 0%     ~     (all equal)
SqrtLatency-4          13.8ns ± 0%  13.8ns ± 0%     ~     (all equal)
SqrtIndirectLatency-4  37.0ns ± 0%  37.0ns ± 0%     ~     (p=0.632 n=40+40)
SqrtGoLatency-4         911ns ± 0%   911ns ± 0%   +0.08%  (p=0.000 n=40+40)
SqrtPrime-4            13.2µs ± 0%  13.2µs ± 0%   +0.01%  (p=0.038 n=38+40)
Tan-4                   205ns ± 0%   205ns ± 0%     ~     (all equal)
Tanh-4                  264ns ± 0%   247ns ± 0%   -6.44%  (p=0.000 n=39+32)
Trunc-4                45.2ns ± 0%  45.2ns ± 0%     ~     (all equal)
Y0-4                    796ns ± 0%   792ns ± 0%   -0.55%  (p=0.000 n=35+40)
Y1-4                    804ns ± 0%   797ns ± 0%   -0.82%  (p=0.000 n=24+40)
Yn-4                   1.64µs ± 0%  1.62µs ± 0%   -1.27%  (p=0.000 n=40+39)
Float64bits-4          8.16ns ± 0%  8.16ns ± 0%   +0.04%  (p=0.000 n=35+40)
Float64frombits-4      10.7ns ± 0%  10.7ns ± 0%     ~     (all equal)
Float32bits-4          7.53ns ± 0%  7.53ns ± 0%     ~     (p=0.760 n=40+40)
Float32frombits-4      6.91ns ± 0%  6.91ns ± 0%   -0.04%  (p=0.002 n=32+38)
[Geo mean]              111ns        106ns        -3.98%

Change-Id: I54f4fd7f5160db020b430b556bde59cc0fdb996d
Reviewed-on: https://go-review.googlesource.com/c/go/+/188678
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/arm/ssa.go b/src/cmd/compile/internal/arm/ssa.go
index 1675297..0b798a5 100644
--- a/src/cmd/compile/internal/arm/ssa.go
+++ b/src/cmd/compile/internal/arm/ssa.go
@@ -655,6 +655,7 @@
 		ssa.OpARMSQRTD,
 		ssa.OpARMNEGF,
 		ssa.OpARMNEGD,
+		ssa.OpARMABSD,
 		ssa.OpARMMOVWF,
 		ssa.OpARMMOVWD,
 		ssa.OpARMMOVFW,
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 9871e11..e1c464b 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3297,7 +3297,7 @@
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpAbs, types.Types[TFLOAT64], args[0])
 		},
-		sys.ARM64, sys.PPC64, sys.Wasm)
+		sys.ARM64, sys.ARM, sys.PPC64, sys.Wasm)
 	addF("math", "Copysign",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue2(ssa.OpCopysign, types.Types[TFLOAT64], args[0], args[1])
diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules
index 7986114..87a91b1 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
@@ -56,6 +56,7 @@
 (Com(32|16|8) x) -> (MVN x)
 
 (Sqrt x) -> (SQRTD x)
+(Abs x) -> (ABSD x)
 
 // TODO: optimize this for ARMv5 and ARMv6
 (Ctz32NonZero x) -> (Ctz32 x)
diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go
index d8bdfeb..484f6cf 100644
--- a/src/cmd/compile/internal/ssa/gen/ARMOps.go
+++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go
@@ -211,6 +211,7 @@
 		{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"},   // -arg0, float32
 		{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"},   // -arg0, float64
 		{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+		{name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"},   // abs(arg0), float64
 
 		{name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"},     // count leading zero
 		{name: "REV", argLength: 1, reg: gp11, asm: "REV"},     // reverse byte order
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 00e49c9..f316ea1 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -926,6 +926,7 @@
 	OpARMNEGF
 	OpARMNEGD
 	OpARMSQRTD
+	OpARMABSD
 	OpARMCLZ
 	OpARMREV
 	OpARMREV16
@@ -12299,6 +12300,19 @@
 		},
 	},
 	{
+		name:   "ABSD",
+		argLen: 1,
+		asm:    arm.AABSD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+		},
+	},
+	{
 		name:   "CLZ",
 		argLen: 1,
 		asm:    arm.ACLZ,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go
index 6a32374..8b56978 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM.go
@@ -420,6 +420,8 @@
 		return rewriteValueARM_OpARMXORshiftRLreg_0(v)
 	case OpARMXORshiftRR:
 		return rewriteValueARM_OpARMXORshiftRR_0(v)
+	case OpAbs:
+		return rewriteValueARM_OpAbs_0(v)
 	case OpAdd16:
 		return rewriteValueARM_OpAdd16_0(v)
 	case OpAdd32:
@@ -17179,6 +17181,17 @@
 	}
 	return false
 }
+func rewriteValueARM_OpAbs_0(v *Value) bool {
+	// match: (Abs x)
+	// cond:
+	// result: (ABSD x)
+	for {
+		x := v.Args[0]
+		v.reset(OpARMABSD)
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueARM_OpAdd16_0(v *Value) bool {
 	// match: (Add16 x y)
 	// cond:
diff --git a/test/codegen/math.go b/test/codegen/math.go
index 597271c..3625271 100644
--- a/test/codegen/math.go
+++ b/test/codegen/math.go
@@ -63,6 +63,7 @@
 	// ppc64:"FABS\t"
 	// ppc64le:"FABS\t"
 	// wasm:"F64Abs"
+	// arm/6:"ABSD\t"
 	sink64[0] = math.Abs(x)
 
 	// amd64:"BTRQ\t[$]63","PXOR"    (TODO: this should be BTSQ)