x/crypto/internal/poly1305: improve sum_ppc64le.s

This change makes a few minor improvements to sum_ppc64le.s
that yield up to a 10% speedup on some of the benchmarks
in this directory.

- Combine ADDZE followed by ADD into a single ADDE
- Add PCALIGN to the loop
- Eliminate a few unnecessary register moves

goos: linux
goarch: ppc64le
pkg: golang.org/x/crypto/internal/poly1305
cpu: POWER10
                 │ poly.orig.out │              poly.out              │
                 │    sec/op     │   sec/op     vs base               │
64                   40.34n ± 0%   38.13n ± 0%   -5.47% (p=0.002 n=6)
1K                   482.2n ± 0%   444.6n ± 0%   -7.81% (p=0.002 n=6)
2M                   978.4µ ± 0%   879.3µ ± 0%  -10.12% (p=0.002 n=6)
64Unaligned          40.35n ± 0%   38.16n ± 0%   -5.42% (p=0.002 n=6)
1KUnaligned          482.0n ± 0%   444.2n ± 0%   -7.84% (p=0.002 n=6)
2MUnaligned          978.4µ ± 0%   879.4µ ± 0%  -10.12% (p=0.002 n=6)
Write64              32.69n ± 0%   30.71n ± 0%   -6.04% (p=0.002 n=6)
Write1K              472.4n ± 0%   436.5n ± 0%   -7.60% (p=0.002 n=6)
Write2M              978.3µ ± 0%   879.4µ ± 0%  -10.11% (p=0.002 n=6)
Write64Unaligned     32.67n ± 0%   30.71n ± 0%   -6.00% (p=0.002 n=6)
Write1KUnaligned     472.6n ± 0%   436.4n ± 0%   -7.66% (p=0.002 n=6)
Write2MUnaligned     978.5µ ± 0%   879.6µ ± 0%  -10.10% (p=0.002 n=6)
geomean              2.569µ        2.367µ        -7.87%

Change-Id: I63314e7252ef10fb2d157f623c4bc2e31a63ae32
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/558775
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64le.s
index d2ca5de..b3c1699 100644
--- a/internal/poly1305/sum_ppc64le.s
+++ b/internal/poly1305/sum_ppc64le.s
@@ -19,15 +19,14 @@
 
 #define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
 	MULLD  r0, h0, t0;  \
-	MULLD  r0, h1, t4;  \
 	MULHDU r0, h0, t1;  \
+	MULLD  r0, h1, t4;  \
 	MULHDU r0, h1, t5;  \
 	ADDC   t4, t1, t1;  \
 	MULLD  r0, h2, t2;  \
-	ADDZE  t5;          \
 	MULHDU r1, h0, t4;  \
 	MULLD  r1, h0, h0;  \
-	ADD    t5, t2, t2;  \
+	ADDE   t5, t2, t2;  \
 	ADDC   h0, t1, t1;  \
 	MULLD  h2, r1, t3;  \
 	ADDZE  t4, h0;      \
@@ -37,13 +36,11 @@
 	ADDE   t5, t3, t3;  \
 	ADDC   h0, t2, t2;  \
 	MOVD   $-4, t4;     \
-	MOVD   t0, h0;      \
-	MOVD   t1, h1;      \
 	ADDZE  t3;          \
-	ANDCC  $3, t2, h2;  \
-	AND    t2, t4, t0;  \
+	RLDICL $0, t2, $62, h2; \
+	AND    t2, t4, h0;  \
 	ADDC   t0, h0, h0;  \
-	ADDE   t3, h1, h1;  \
+	ADDE   t3, t1, h1;  \
 	SLD    $62, t3, t4; \
 	SRD    $2, t2;      \
 	ADDZE  h2;          \
@@ -75,6 +72,7 @@
 loop:
 	POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
 
+	PCALIGN $16
 multiply:
 	POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
 	ADD $-16, R5