poly1305: improve performance with asm for ppc64le

This adds an asm implementation for poly1305 on ppc64le, based on
the amd64 asm implementation using the mac interface.

The improvements on a power8 based on the poly1305 benchmarks are:

    name              old time/op   new time/op    delta
    64                  172ns ± 0%      78ns ± 0%   -54.77%  (p=1.000 n=1+1)
    1K                 1.47µs ± 0%    0.59µs ± 0%   -59.69%  (p=1.000 n=1+1)
    2M                 2.84ms ± 0%    1.12ms ± 0%   -60.47%  (p=1.000 n=1+1)
    64Unaligned         172ns ± 0%      78ns ± 0%   -54.59%  (p=1.000 n=1+1)
    1KUnaligned        1.47µs ± 0%    0.59µs ± 0%   -59.69%  (p=1.000 n=1+1)
    2MUnaligned        2.84ms ± 0%    1.13ms ± 0%   -60.23%  (p=1.000 n=1+1)
    Write64             100ns ± 0%      46ns ± 0%   -53.80%  (p=1.000 n=1+1)
    Write1K            1.40µs ± 0%    0.56µs ± 0%   -59.90%  (p=1.000 n=1+1)
    Write2M            2.84ms ± 0%    1.12ms ± 0%   -60.46%  (p=1.000 n=1+1)
    Write64Unaligned    100ns ± 0%      46ns ± 0%   -53.60%  (p=1.000 n=1+1)
    Write1KUnaligned   1.40µs ± 0%    0.56µs ± 0%   -59.90%  (p=1.000 n=1+1)
    Write2MUnaligned   2.84ms ± 0%    1.13ms ± 0%   -60.22%  (p=1.000 n=1+1)

Change-Id: I77cc9bb3645a6b1a6edc414b5651dc37ae4a7410
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/173421
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
diff --git a/poly1305/mac_noasm.go b/poly1305/mac_noasm.go
index 8387d29..a8dd589 100644
--- a/poly1305/mac_noasm.go
+++ b/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 gccgo appengine
+// +build !amd64,!ppc64le gccgo appengine
 
 package poly1305
 
diff --git a/poly1305/sum_noasm.go b/poly1305/sum_noasm.go
index fcdef46..8a9c207 100644
--- a/poly1305/sum_noasm.go
+++ b/poly1305/sum_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl
+// +build s390x,!go1.11 !arm,!amd64,!s390x,!ppc64le gccgo appengine nacl
 
 package poly1305
 
diff --git a/poly1305/sum_ppc64le.go b/poly1305/sum_ppc64le.go
new file mode 100644
index 0000000..2402b63
--- /dev/null
+++ b/poly1305/sum_ppc64le.go
@@ -0,0 +1,68 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64le,!gccgo,!appengine
+
+package poly1305
+
+//go:noescape
+func initialize(state *[7]uint64, key *[32]byte)
+
+//go:noescape
+func update(state *[7]uint64, msg []byte)
+
+//go:noescape
+func finalize(tag *[TagSize]byte, state *[7]uint64)
+
+// Sum generates an authenticator for m using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[16]byte, m []byte, key *[32]byte) {
+	h := newMAC(key)
+	h.Write(m)
+	h.Sum(out)
+}
+
+func newMAC(key *[32]byte) (h mac) {
+	initialize(&h.state, key)
+	return
+}
+
+type mac struct {
+	state [7]uint64 // := uint64{ h0, h1, h2, r0, r1, pad0, pad1 }
+
+	buffer [TagSize]byte
+	offset int
+}
+
+func (h *mac) Write(p []byte) (n int, err error) {
+	n = len(p)
+	if h.offset > 0 {
+		remaining := TagSize - h.offset
+		if n < remaining {
+			h.offset += copy(h.buffer[h.offset:], p)
+			return n, nil
+		}
+		copy(h.buffer[h.offset:], p[:remaining])
+		p = p[remaining:]
+		h.offset = 0
+		update(&h.state, h.buffer[:])
+	}
+	if nn := len(p) - (len(p) % TagSize); nn > 0 {
+		update(&h.state, p[:nn])
+		p = p[nn:]
+	}
+	if len(p) > 0 {
+		h.offset += copy(h.buffer[h.offset:], p)
+	}
+	return n, nil
+}
+
+func (h *mac) Sum(out *[16]byte) {
+	state := h.state
+	if h.offset > 0 {
+		update(&state, h.buffer[:h.offset])
+	}
+	finalize(out, &state)
+}
diff --git a/poly1305/sum_ppc64le.s b/poly1305/sum_ppc64le.s
new file mode 100644
index 0000000..55c7167
--- /dev/null
+++ b/poly1305/sum_ppc64le.s
@@ -0,0 +1,247 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64le,!gccgo,!appengine
+
+#include "textflag.h"
+
+// This was ported from the amd64 implementation.
+
+#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
+	MOVD (msg), t0;  \
+	MOVD 8(msg), t1; \
+	MOVD $1, t2;     \
+	ADDC t0, h0, h0; \
+	ADDE t1, h1, h1; \
+	ADDE t2, h2;     \
+	ADD  $16, msg
+
+#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
+	MULLD  r0, h0, t0;  \
+	MULLD  r0, h1, t4;  \
+	MULHDU r0, h0, t1;  \
+	MULHDU r0, h1, t5;  \
+	ADDC   t4, t1, t1;  \
+	MULLD  r0, h2, t2;  \
+	ADDZE  t5;          \
+	MULHDU r1, h0, t4;  \
+	MULLD  r1, h0, h0;  \
+	ADD    t5, t2, t2;  \
+	ADDC   h0, t1, t1;  \
+	MULLD  h2, r1, t3;  \
+	ADDZE  t4, h0;      \
+	MULHDU r1, h1, t5;  \
+	MULLD  r1, h1, t4;  \
+	ADDC   t4, t2, t2;  \
+	ADDE   t5, t3, t3;  \
+	ADDC   h0, t2, t2;  \
+	MOVD   $-4, t4;     \
+	MOVD   t0, h0;      \
+	MOVD   t1, h1;      \
+	ADDZE  t3;          \
+	ANDCC  $3, t2, h2;  \
+	AND    t2, t4, t0;  \
+	ADDC   t0, h0, h0;  \
+	ADDE   t3, h1, h1;  \
+	SLD    $62, t3, t4; \
+	SRD    $2, t2;      \
+	ADDZE  h2;          \
+	OR     t4, t2, t2;  \
+	SRD    $2, t3;      \
+	ADDC   t2, h0, h0;  \
+	ADDE   t3, h1, h1;  \
+	ADDZE  h2
+
+DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+GLOBL ·poly1305Mask<>(SB), RODATA, $16
+
+// func update(state *[7]uint64, msg []byte)
+
+TEXT ·update(SB), $0-32
+	MOVD state+0(FP), R3
+	MOVD msg_base+8(FP), R4
+	MOVD msg_len+16(FP), R5
+
+	MOVD 0(R3), R8   // h0
+	MOVD 8(R3), R9   // h1
+	MOVD 16(R3), R10 // h2
+	MOVD 24(R3), R11 // r0
+	MOVD 32(R3), R12 // r1
+
+	CMP R5, $16
+	BLT bytes_between_0_and_15
+
+loop:
+	POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
+
+multiply:
+	POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
+	ADD $-16, R5
+	CMP R5, $16
+	BGE loop
+
+bytes_between_0_and_15:
+	CMP  $0, R5
+	BEQ  done
+	MOVD $0, R16 // h0
+	MOVD $0, R17 // h1
+
+flush_buffer:
+	CMP R5, $8
+	BLE just1
+
+	MOVD $8, R21
+	SUB  R21, R5, R21
+
+	// Greater than 8 -- load the rightmost remaining bytes in msg
+	// and put into R17 (h1)
+	MOVD (R4)(R21), R17
+	MOVD $16, R22
+
+	// Find the offset to those bytes
+	SUB R5, R22, R22
+	SLD $3, R22
+
+	// Shift to get only the bytes in msg
+	SRD R22, R17, R17
+
+	// Put 1 at high end
+	MOVD $1, R23
+	SLD  $3, R21
+	SLD  R21, R23, R23
+	OR   R23, R17, R17
+
+	// Remainder is 8
+	MOVD $8, R5
+
+just1:
+	CMP R5, $8
+	BLT less8
+
+	// Exactly 8
+	MOVD (R4), R16
+
+	CMP $0, R17
+
+	// Check if we've already set R17; if not
+	// set 1 to indicate end of msg.
+	BNE  carry
+	MOVD $1, R17
+	BR   carry
+
+less8:
+	MOVD  $0, R16   // h0
+	MOVD  $0, R22   // shift count
+	CMP   R5, $4
+	BLT   less4
+	MOVWZ (R4), R16
+	ADD   $4, R4
+	ADD   $-4, R5
+	MOVD  $32, R22
+
+less4:
+	CMP   R5, $2
+	BLT   less2
+	MOVHZ (R4), R21
+	SLD   R22, R21, R21
+	OR    R16, R21, R16
+	ADD   $16, R22
+	ADD   $-2, R5
+	ADD   $2, R4
+
+less2:
+	CMP   $0, R5
+	BEQ   insert1
+	MOVBZ (R4), R21
+	SLD   R22, R21, R21
+	OR    R16, R21, R16
+	ADD   $8, R22
+
+insert1:
+	// Insert 1 at end of msg
+	MOVD $1, R21
+	SLD  R22, R21, R21
+	OR   R16, R21, R16
+
+carry:
+	// Add new values to h0, h1, h2
+	ADDC R16, R8
+	ADDE R17, R9
+	ADDE $0, R10
+	MOVD $16, R5
+	ADD  R5, R4
+	BR   multiply
+
+done:
+	// Save h0, h1, h2 in state
+	MOVD R8, 0(R3)
+	MOVD R9, 8(R3)
+	MOVD R10, 16(R3)
+	RET
+
+// func initialize(state *[7]uint64, key *[32]byte)
+TEXT ·initialize(SB), $0-16
+	MOVD state+0(FP), R3
+	MOVD key+8(FP), R4
+
+	// state[0...7] is initialized with zero
+	// Load key
+	MOVD 0(R4), R5
+	MOVD 8(R4), R6
+	MOVD 16(R4), R7
+	MOVD 24(R4), R8
+
+	// Address of key mask
+	MOVD $·poly1305Mask<>(SB), R9
+
+	// Save original key in state
+	MOVD R7, 40(R3)
+	MOVD R8, 48(R3)
+
+	// Get mask
+	MOVD (R9), R7
+	MOVD 8(R9), R8
+
+	// And with key
+	AND R5, R7, R5
+	AND R6, R8, R6
+
+	// Save masked key in state
+	MOVD R5, 24(R3)
+	MOVD R6, 32(R3)
+	RET
+
+// func finalize(tag *[TagSize]byte, state *[7]uint64)
+TEXT ·finalize(SB), $0-16
+	MOVD tag+0(FP), R3
+	MOVD state+8(FP), R4
+
+	// Get h0, h1, h2 from state
+	MOVD 0(R4), R5
+	MOVD 8(R4), R6
+	MOVD 16(R4), R7
+
+	// Save h0, h1
+	MOVD  R5, R8
+	MOVD  R6, R9
+	MOVD  $3, R20
+	MOVD  $-1, R21
+	SUBC  $-5, R5
+	SUBE  R21, R6
+	SUBE  R20, R7
+	MOVD  $0, R21
+	SUBZE R21
+
+	// Check for carry
+	CMP  $0, R21
+	ISEL $2, R5, R8, R5
+	ISEL $2, R6, R9, R6
+	MOVD 40(R4), R8
+	MOVD 48(R4), R9
+	ADDC R8, R5
+	ADDE R9, R6
+	MOVD R5, 0(R3)
+	MOVD R6, 8(R3)
+	RET