math/big: add PCALIGN to addMulVVW asm on ppc64x
Adding PCALIGN to addMulVVW assembler implementation
provides the following improvement on power10:
AddMulVVW/1 3.36ns ± 0% 3.37ns ± 0% +0.20%
AddMulVVW/2 4.45ns ± 0% 4.44ns ± 0% -0.25%
AddMulVVW/3 5.44ns ± 0% 5.49ns ± 0% +0.84%
AddMulVVW/4 6.43ns ± 0% 6.34ns ± 0% -1.33%
AddMulVVW/5 7.87ns ± 0% 7.73ns ± 0% -1.70%
AddMulVVW/10 13.4ns ± 3% 12.4ns ± 7% -7.07%
AddMulVVW/100 112ns ± 0% 102ns ± 0% -9.34%
AddMulVVW/1000 1.09µs ± 0% 0.95µs ± 0% -13.15%
AddMulVVW/10000 10.9µs ± 0% 9.6µs ± 0% -12.46%
AddMulVVW/100000 109µs ± 0% 95µs ± 0% -12.58%
Change-Id: Ic33d4f125c84d568f63e17cf99dc4df5ca9328d9
Reviewed-on: https://go-review.googlesource.com/c/go/+/447236
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Archana Ravindar <ravindararchana@gmail.com>
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index a83696a..5fdbf40 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -44,6 +44,8 @@
// for small values of z_len (0.90x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
+
+ PCALIGN $32
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
@@ -131,6 +133,8 @@
// for small values of z_len (0.92x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
+
+ PCALIGN $32
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
@@ -212,6 +216,7 @@
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
+ PCALIGN $32
loop:
MOVD 8(R8), R20 // R20 = x[i]
@@ -288,6 +293,8 @@
// The loop here is almost the same as the one used in s390x, but
// we don't need to capture CA every iteration because we've already
// done that above.
+
+ PCALIGN $32
loop:
MOVD 8(R8), R20
MOVD 16(R8), R21
@@ -358,6 +365,7 @@
CMP R5, R0 // iterate from i=len(z)-1 to 0
BEQ loopexit // Already at end?
MOVD 0(R15),R10 // x[i]
+ PCALIGN $32
shloop:
SLD R9, R10, R10 // x[i]<<s
MOVDU -8(R15), R14
@@ -520,6 +528,7 @@
CMP R0, R14
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
+ PCALIGN $32
loop:
MOVD 8(R8), R20 // R20 = x[i]
@@ -602,6 +611,7 @@
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
+ PCALIGN $32
loop:
MOVD (R8)(R3), R20 // Load x[i]