cmd/compile: optimize ARM's math.Abs
This CL optimizes math.Abs to an inline ABSD instruction on ARM.
The benchmark results of src/math/ show big improvements.
name old time/op new time/op delta
Acos-4 181ns ± 0% 182ns ± 0% +0.30% (p=0.000 n=40+40)
Acosh-4 202ns ± 0% 202ns ± 0% ~ (all equal)
Asin-4 163ns ± 0% 163ns ± 0% ~ (all equal)
Asinh-4 242ns ± 0% 242ns ± 0% ~ (all equal)
Atan-4 120ns ± 0% 121ns ± 0% +0.83% (p=0.000 n=40+40)
Atanh-4 202ns ± 0% 202ns ± 0% ~ (all equal)
Atan2-4 173ns ± 0% 173ns ± 0% ~ (all equal)
Cbrt-4 1.06µs ± 0% 1.06µs ± 0% +0.09% (p=0.000 n=39+37)
Ceil-4 72.9ns ± 0% 72.8ns ± 0% ~ (p=0.237 n=40+40)
Copysign-4 13.2ns ± 0% 13.2ns ± 0% ~ (all equal)
Cos-4 193ns ± 0% 183ns ± 0% -5.18% (p=0.000 n=40+40)
Cosh-4 254ns ± 0% 239ns ± 0% -5.91% (p=0.000 n=40+40)
Erf-4 112ns ± 0% 112ns ± 0% ~ (all equal)
Erfc-4 117ns ± 0% 117ns ± 0% ~ (all equal)
Erfinv-4 127ns ± 0% 127ns ± 1% ~ (p=0.492 n=40+40)
Erfcinv-4 128ns ± 0% 128ns ± 0% ~ (all equal)
Exp-4 212ns ± 0% 206ns ± 0% -3.05% (p=0.000 n=40+40)
ExpGo-4 216ns ± 0% 209ns ± 0% -3.24% (p=0.000 n=40+40)
Expm1-4 142ns ± 0% 142ns ± 0% ~ (all equal)
Exp2-4 191ns ± 0% 184ns ± 0% -3.45% (p=0.000 n=40+40)
Exp2Go-4 194ns ± 0% 187ns ± 0% -3.61% (p=0.000 n=40+40)
Abs-4 14.4ns ± 0% 6.3ns ± 0% -56.39% (p=0.000 n=38+39)
Dim-4 12.6ns ± 0% 12.6ns ± 0% ~ (all equal)
Floor-4 49.6ns ± 0% 49.6ns ± 0% ~ (all equal)
Max-4 27.6ns ± 0% 27.6ns ± 0% ~ (all equal)
Min-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal)
Mod-4 349ns ± 0% 305ns ± 1% -12.55% (p=0.000 n=33+40)
Frexp-4 54.0ns ± 0% 47.1ns ± 0% -12.78% (p=0.000 n=38+38)
Gamma-4 242ns ± 0% 234ns ± 0% -3.16% (p=0.000 n=36+40)
Hypot-4 84.8ns ± 0% 67.8ns ± 0% -20.05% (p=0.000 n=31+35)
HypotGo-4 88.5ns ± 0% 71.6ns ± 0% -19.12% (p=0.000 n=40+38)
Ilogb-4 45.8ns ± 0% 38.9ns ± 0% -15.12% (p=0.000 n=40+32)
J0-4 821ns ± 0% 802ns ± 0% -2.33% (p=0.000 n=33+40)
J1-4 816ns ± 0% 807ns ± 0% -1.05% (p=0.000 n=40+29)
Jn-4 1.67µs ± 0% 1.65µs ± 0% -1.45% (p=0.000 n=40+39)
Ldexp-4 61.5ns ± 0% 54.6ns ± 0% -11.27% (p=0.000 n=40+32)
Lgamma-4 188ns ± 0% 188ns ± 0% ~ (all equal)
Log-4 154ns ± 0% 147ns ± 0% -4.78% (p=0.000 n=40+40)
Logb-4 50.9ns ± 0% 42.7ns ± 0% -16.11% (p=0.000 n=34+39)
Log1p-4 160ns ± 0% 159ns ± 0% ~ (p=0.828 n=40+40)
Log10-4 173ns ± 0% 166ns ± 0% -4.05% (p=0.000 n=40+40)
Log2-4 65.3ns ± 0% 58.4ns ± 0% -10.57% (p=0.000 n=37+37)
Modf-4 36.4ns ± 0% 36.4ns ± 0% ~ (all equal)
Nextafter32-4 36.4ns ± 0% 36.4ns ± 0% ~ (all equal)
Nextafter64-4 32.7ns ± 0% 32.6ns ± 0% ~ (p=0.375 n=40+40)
PowInt-4 300ns ± 0% 277ns ± 0% -7.78% (p=0.000 n=40+40)
PowFrac-4 676ns ± 0% 635ns ± 0% -6.00% (p=0.000 n=40+35)
Pow10Pos-4 17.6ns ± 0% 17.6ns ± 0% ~ (all equal)
Pow10Neg-4 22.0ns ± 0% 22.0ns ± 0% ~ (all equal)
Round-4 30.1ns ± 0% 30.1ns ± 0% ~ (all equal)
RoundToEven-4 38.9ns ± 0% 38.9ns ± 0% ~ (all equal)
Remainder-4 291ns ± 0% 263ns ± 0% -9.62% (p=0.000 n=40+40)
Signbit-4 11.3ns ± 0% 11.3ns ± 0% ~ (all equal)
Sin-4 185ns ± 0% 185ns ± 0% ~ (all equal)
Sincos-4 230ns ± 0% 230ns ± 0% ~ (all equal)
Sinh-4 253ns ± 0% 246ns ± 0% -2.77% (p=0.000 n=39+39)
SqrtIndirect-4 41.4ns ± 0% 41.4ns ± 0% ~ (all equal)
SqrtLatency-4 13.8ns ± 0% 13.8ns ± 0% ~ (all equal)
SqrtIndirectLatency-4 37.0ns ± 0% 37.0ns ± 0% ~ (p=0.632 n=40+40)
SqrtGoLatency-4 911ns ± 0% 911ns ± 0% +0.08% (p=0.000 n=40+40)
SqrtPrime-4 13.2µs ± 0% 13.2µs ± 0% +0.01% (p=0.038 n=38+40)
Tan-4 205ns ± 0% 205ns ± 0% ~ (all equal)
Tanh-4 264ns ± 0% 247ns ± 0% -6.44% (p=0.000 n=39+32)
Trunc-4 45.2ns ± 0% 45.2ns ± 0% ~ (all equal)
Y0-4 796ns ± 0% 792ns ± 0% -0.55% (p=0.000 n=35+40)
Y1-4 804ns ± 0% 797ns ± 0% -0.82% (p=0.000 n=24+40)
Yn-4 1.64µs ± 0% 1.62µs ± 0% -1.27% (p=0.000 n=40+39)
Float64bits-4 8.16ns ± 0% 8.16ns ± 0% +0.04% (p=0.000 n=35+40)
Float64frombits-4 10.7ns ± 0% 10.7ns ± 0% ~ (all equal)
Float32bits-4 7.53ns ± 0% 7.53ns ± 0% ~ (p=0.760 n=40+40)
Float32frombits-4 6.91ns ± 0% 6.91ns ± 0% -0.04% (p=0.002 n=32+38)
[Geo mean] 111ns 106ns -3.98%
Change-Id: I54f4fd7f5160db020b430b556bde59cc0fdb996d
Reviewed-on: https://go-review.googlesource.com/c/go/+/188678
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/arm/ssa.go b/src/cmd/compile/internal/arm/ssa.go
index 1675297..0b798a5 100644
--- a/src/cmd/compile/internal/arm/ssa.go
+++ b/src/cmd/compile/internal/arm/ssa.go
@@ -655,6 +655,7 @@
ssa.OpARMSQRTD,
ssa.OpARMNEGF,
ssa.OpARMNEGD,
+ ssa.OpARMABSD,
ssa.OpARMMOVWF,
ssa.OpARMMOVWD,
ssa.OpARMMOVFW,
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 9871e11..e1c464b 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3297,7 +3297,7 @@
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpAbs, types.Types[TFLOAT64], args[0])
},
- sys.ARM64, sys.PPC64, sys.Wasm)
+ sys.ARM64, sys.ARM, sys.PPC64, sys.Wasm)
addF("math", "Copysign",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpCopysign, types.Types[TFLOAT64], args[0], args[1])
diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules
index 7986114..87a91b1 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
@@ -56,6 +56,7 @@
(Com(32|16|8) x) -> (MVN x)
(Sqrt x) -> (SQRTD x)
+(Abs x) -> (ABSD x)
// TODO: optimize this for ARMv5 and ARMv6
(Ctz32NonZero x) -> (Ctz32 x)
diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go
index d8bdfeb..484f6cf 100644
--- a/src/cmd/compile/internal/ssa/gen/ARMOps.go
+++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go
@@ -211,6 +211,7 @@
{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
{name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero
{name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // reverse byte order
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 00e49c9..f316ea1 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -926,6 +926,7 @@
OpARMNEGF
OpARMNEGD
OpARMSQRTD
+ OpARMABSD
OpARMCLZ
OpARMREV
OpARMREV16
@@ -12299,6 +12300,19 @@
},
},
{
+ name: "ABSD",
+ argLen: 1,
+ asm: arm.AABSD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+ },
+ outputs: []outputInfo{
+ {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+ },
+ },
+ },
+ {
name: "CLZ",
argLen: 1,
asm: arm.ACLZ,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go
index 6a32374..8b56978 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM.go
@@ -420,6 +420,8 @@
return rewriteValueARM_OpARMXORshiftRLreg_0(v)
case OpARMXORshiftRR:
return rewriteValueARM_OpARMXORshiftRR_0(v)
+ case OpAbs:
+ return rewriteValueARM_OpAbs_0(v)
case OpAdd16:
return rewriteValueARM_OpAdd16_0(v)
case OpAdd32:
@@ -17179,6 +17181,17 @@
}
return false
}
+func rewriteValueARM_OpAbs_0(v *Value) bool {
+ // match: (Abs x)
+ // cond:
+ // result: (ABSD x)
+ for {
+ x := v.Args[0]
+ v.reset(OpARMABSD)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM_OpAdd16_0(v *Value) bool {
// match: (Add16 x y)
// cond:
diff --git a/test/codegen/math.go b/test/codegen/math.go
index 597271c..3625271 100644
--- a/test/codegen/math.go
+++ b/test/codegen/math.go
@@ -63,6 +63,7 @@
// ppc64:"FABS\t"
// ppc64le:"FABS\t"
// wasm:"F64Abs"
+ // arm/6:"ABSD\t"
sink64[0] = math.Abs(x)
// amd64:"BTRQ\t[$]63","PXOR" (TODO: this should be BTSQ)