x/crypto/internal/poly1305: improve sum_ppc64le.s
This change makes a few minor improvements to sum_ppc64le.s
that yield up to a 10% performance improvement on
some of the benchmarks in this directory:
- Combine each ADDZE followed by ADD into a single ADDE
- Add PCALIGN to the loop
- Eliminate a few unnecessary register moves
goos: linux
goarch: ppc64le
pkg: golang.org/x/crypto/internal/poly1305
cpu: POWER10
│ poly.orig.out │ poly.out │
│ sec/op │ sec/op vs base │
64 40.34n ± 0% 38.13n ± 0% -5.47% (p=0.002 n=6)
1K 482.2n ± 0% 444.6n ± 0% -7.81% (p=0.002 n=6)
2M 978.4µ ± 0% 879.3µ ± 0% -10.12% (p=0.002 n=6)
64Unaligned 40.35n ± 0% 38.16n ± 0% -5.42% (p=0.002 n=6)
1KUnaligned 482.0n ± 0% 444.2n ± 0% -7.84% (p=0.002 n=6)
2MUnaligned 978.4µ ± 0% 879.4µ ± 0% -10.12% (p=0.002 n=6)
Write64 32.69n ± 0% 30.71n ± 0% -6.04% (p=0.002 n=6)
Write1K 472.4n ± 0% 436.5n ± 0% -7.60% (p=0.002 n=6)
Write2M 978.3µ ± 0% 879.4µ ± 0% -10.11% (p=0.002 n=6)
Write64Unaligned 32.67n ± 0% 30.71n ± 0% -6.00% (p=0.002 n=6)
Write1KUnaligned 472.6n ± 0% 436.4n ± 0% -7.66% (p=0.002 n=6)
Write2MUnaligned 978.5µ ± 0% 879.6µ ± 0% -10.10% (p=0.002 n=6)
geomean 2.569µ 2.367µ -7.87%
Change-Id: I63314e7252ef10fb2d157f623c4bc2e31a63ae32
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/558775
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64le.s
index d2ca5de..b3c1699 100644
--- a/internal/poly1305/sum_ppc64le.s
+++ b/internal/poly1305/sum_ppc64le.s
@@ -19,15 +19,14 @@
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
MULLD r0, h0, t0; \
- MULLD r0, h1, t4; \
MULHDU r0, h0, t1; \
+ MULLD r0, h1, t4; \
MULHDU r0, h1, t5; \
ADDC t4, t1, t1; \
MULLD r0, h2, t2; \
- ADDZE t5; \
MULHDU r1, h0, t4; \
MULLD r1, h0, h0; \
- ADD t5, t2, t2; \
+ ADDE t5, t2, t2; \
ADDC h0, t1, t1; \
MULLD h2, r1, t3; \
ADDZE t4, h0; \
@@ -37,13 +36,11 @@
ADDE t5, t3, t3; \
ADDC h0, t2, t2; \
MOVD $-4, t4; \
- MOVD t0, h0; \
- MOVD t1, h1; \
ADDZE t3; \
- ANDCC $3, t2, h2; \
- AND t2, t4, t0; \
+ RLDICL $0, t2, $62, h2; \
+ AND t2, t4, h0; \
ADDC t0, h0, h0; \
- ADDE t3, h1, h1; \
+ ADDE t3, t1, h1; \
SLD $62, t3, t4; \
SRD $2, t2; \
ADDZE h2; \
@@ -75,6 +72,7 @@
loop:
POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
+ PCALIGN $16
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
ADD $-16, R5