crypto/cipher: use Neon for xor on arm64

cpu: HiSilicon(R) Kirin 970 2.4GHz

name                 old time/op    new time/op    delta
XORBytes/8Bytes        39.8ns ± 0%    17.3ns ± 0%    -56.53%  (p=0.000 n=10+10)
XORBytes/128Bytes       376ns ± 0%      28ns ± 0%    -92.63%  (p=0.000 n=10+8)
XORBytes/2048Bytes     5.67µs ± 0%    0.22µs ± 0%    -96.03%  (p=0.000 n=10+10)
XORBytes/32768Bytes    90.3µs ± 0%     3.5µs ± 0%    -96.12%  (p=0.000 n=10+10)
AESGCMSeal1K            853ns ± 0%     853ns ± 0%       ~     (all equal)
AESGCMOpen1K            876ns ± 0%     874ns ± 0%     -0.23%  (p=0.000 n=10+10)
AESGCMSign8K           3.09µs ± 0%    3.08µs ± 0%     -0.34%  (p=0.000 n=10+9)
AESGCMSeal8K           5.87µs ± 0%    5.87µs ± 0%     +0.01%  (p=0.008 n=10+8)
AESGCMOpen8K           5.82µs ± 0%    5.82µs ± 0%     +0.02%  (p=0.037 n=10+10)
AESCFBEncrypt1K        7.05µs ± 0%    4.27µs ± 0%    -39.38%  (p=0.000 n=10+10)
AESCFBDecrypt1K        7.12µs ± 0%    4.30µs ± 0%    -39.54%  (p=0.000 n=10+9)
AESCFBDecrypt8K        56.7µs ± 0%    34.1µs ± 0%    -39.82%  (p=0.000 n=10+10)
AESOFB1K               5.20µs ± 0%    2.54µs ± 0%    -51.07%  (p=0.000 n=10+10)
AESCTR1K               4.96µs ± 0%    2.30µs ± 0%    -53.62%  (p=0.000 n=9+10)
AESCTR8K               39.5µs ± 0%    18.2µs ± 0%    -53.98%  (p=0.000 n=8+10)
AESCBCEncrypt1K        5.81µs ± 0%    3.07µs ± 0%    -47.13%  (p=0.000 n=10+8)
AESCBCDecrypt1K        5.83µs ± 0%    3.10µs ± 0%    -46.84%  (p=0.000 n=10+8)

name                 old speed      new speed      delta
XORBytes/8Bytes       201MB/s ± 0%   461MB/s ± 0%   +129.80%  (p=0.000 n=6+10)
XORBytes/128Bytes     340MB/s ± 0%  4625MB/s ± 0%  +1259.91%  (p=0.000 n=8+10)
XORBytes/2048Bytes    361MB/s ± 0%  9088MB/s ± 0%  +2414.23%  (p=0.000 n=8+10)
XORBytes/32768Bytes   363MB/s ± 0%  9350MB/s ± 0%  +2477.44%  (p=0.000 n=10+10)
AESGCMSeal1K         1.20GB/s ± 0%  1.20GB/s ± 0%     -0.02%  (p=0.041 n=10+10)
AESGCMOpen1K         1.17GB/s ± 0%  1.17GB/s ± 0%     +0.20%  (p=0.000 n=10+10)
AESGCMSign8K         2.65GB/s ± 0%  2.66GB/s ± 0%     +0.35%  (p=0.000 n=10+9)
AESGCMSeal8K         1.40GB/s ± 0%  1.40GB/s ± 0%     -0.01%  (p=0.000 n=10+7)
AESGCMOpen8K         1.41GB/s ± 0%  1.41GB/s ± 0%     -0.03%  (p=0.022 n=10+10)
AESCFBEncrypt1K       145MB/s ± 0%   238MB/s ± 0%    +64.95%  (p=0.000 n=10+10)
AESCFBDecrypt1K       143MB/s ± 0%   237MB/s ± 0%    +65.39%  (p=0.000 n=10+9)
AESCFBDecrypt8K       144MB/s ± 0%   240MB/s ± 0%    +66.15%  (p=0.000 n=10+10)
AESOFB1K              196MB/s ± 0%   401MB/s ± 0%   +104.35%  (p=0.000 n=9+10)
AESCTR1K              205MB/s ± 0%   443MB/s ± 0%   +115.57%  (p=0.000 n=7+10)
AESCTR8K              207MB/s ± 0%   450MB/s ± 0%   +117.27%  (p=0.000 n=10+10)
AESCBCEncrypt1K       176MB/s ± 0%   334MB/s ± 0%    +89.15%  (p=0.000 n=10+8)
AESCBCDecrypt1K       176MB/s ± 0%   330MB/s ± 0%    +88.08%  (p=0.000 n=10+9)

Updates #42010

Change-Id: I75e6d66fd0070e184d93b020c55a7580c713647c
Reviewed-on: https://go-review.googlesource.com/c/go/+/142537
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Run-TryBot: Meng Zhuo <mzh@golangcn.org>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Meng Zhuo <mzh@golangcn.org>
diff --git a/src/crypto/cipher/xor_arm64.go b/src/crypto/cipher/xor_arm64.go
new file mode 100644
index 0000000..35a785a
--- /dev/null
+++ b/src/crypto/cipher/xor_arm64.go
@@ -0,0 +1,29 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cipher
+
+// xorBytes XORs a and b byte by byte into dst, stopping at the end of the
+// shorter input, and returns the count written. It panics if dst is shorter.
+func xorBytes(dst, a, b []byte) int {
+	count := len(b)
+	if len(a) < count {
+		count = len(a)
+	}
+	if count == 0 {
+		return 0
+	}
+	// Touch the last destination byte so a short dst panics before the assembly runs.
+	_ = dst[count-1]
+
+	xorBytesARM64(&dst[0], &a[0], &b[0], count)
+	return count
+}
+
+func xorWords(dst, a, b []byte) {
+	xorBytes(dst, a, b) // delegates to xorBytes; the returned byte count is discarded
+}
+
+//go:noescape
+func xorBytesARM64(dst, a, b *byte, n int) // no Go body: implemented by TEXT ·xorBytesARM64 in xor_arm64.s
diff --git a/src/crypto/cipher/xor_arm64.s b/src/crypto/cipher/xor_arm64.s
new file mode 100644
index 0000000..669852d
--- /dev/null
+++ b/src/crypto/cipher/xor_arm64.s
@@ -0,0 +1,67 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func xorBytesARM64(dst, a, b *byte, n int) -- stores a[i] XOR b[i] into dst[i] for the n bytes starting at each pointer.
+TEXT ·xorBytesARM64(SB), NOSPLIT|NOFRAME, $0
+	MOVD	dst+0(FP), R0	// R0 = destination cursor
+	MOVD	a+8(FP), R1	// R1 = cursor into a
+	MOVD	b+16(FP), R2	// R2 = cursor into b
+	MOVD	n+24(FP), R3	// R3 = bytes still to process
+	CMP	$64, R3
+	BLT	tail	// fewer than 64 bytes: skip the vector loop
+loop_64:	// each pass XORs one 64-byte chunk using V0-V7
+	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]	// load 64 bytes of a, post-increment R1
+	VLD1.P	64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]	// load 64 bytes of b, post-increment R2
+	VEOR	V0.B16, V4.B16, V4.B16
+	VEOR	V1.B16, V5.B16, V5.B16
+	VEOR	V2.B16, V6.B16, V6.B16
+	VEOR	V3.B16, V7.B16, V7.B16
+	VST1.P	[V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)	// store 64 result bytes, post-increment R0
+	SUBS	$64, R3	// remaining -= 64; the flags set here are replaced by the CMP below
+	CMP	$64, R3
+	BGE	loop_64	// keep going while at least 64 bytes remain
+tail:
+	// R3 < 64 here: leave at once if it is zero, otherwise bits 5..0 of R3 select the 32/16/8/4/2/1-byte steps below.
+	CBZ	R3, end
+	TBZ	$5, R3, less_than32	// bit 5 clear: no 32-byte step
+	VLD1.P	32(R1), [V0.B16, V1.B16]
+	VLD1.P	32(R2), [V2.B16, V3.B16]
+	VEOR	V0.B16, V2.B16, V2.B16
+	VEOR	V1.B16, V3.B16, V3.B16
+	VST1.P	[V2.B16, V3.B16], 32(R0)
+less_than32:
+	TBZ	$4, R3, less_than16	// bit 4 clear: no 16-byte step
+	LDP.P	16(R1), (R11, R12)	// 16 bytes of a as two 64-bit registers
+	LDP.P	16(R2), (R13, R14)	// 16 bytes of b as two 64-bit registers
+	EOR	R11, R13, R13
+	EOR	R12, R14, R14
+	STP.P	(R13, R14), 16(R0)
+less_than16:
+	TBZ	$3, R3, less_than8	// bit 3 clear: no 8-byte step
+	MOVD.P	8(R1), R11
+	MOVD.P	8(R2), R12
+	EOR	R11, R12, R12
+	MOVD.P	R12, 8(R0)
+less_than8:
+	TBZ	$2, R3, less_than4	// bit 2 clear: no 4-byte step
+	MOVWU.P	4(R1), R13
+	MOVWU.P	4(R2), R14
+	EORW	R13, R14, R14
+	MOVWU.P	R14, 4(R0)
+less_than4:
+	TBZ	$1, R3, less_than2	// bit 1 clear: no 2-byte step
+	MOVHU.P	2(R1), R15
+	MOVHU.P	2(R2), R16
+	EORW	R15, R16, R16
+	MOVHU.P	R16, 2(R0)
+less_than2:
+	TBZ	$0, R3, end	// bit 0 clear: no trailing single byte
+	MOVBU	(R1), R17	// final byte: plain addressing, the cursors are not used again
+	MOVBU	(R2), R19
+	EORW	R17, R19, R19
+	MOVBU	R19, (R0)
+end:
+	RET
diff --git a/src/crypto/cipher/xor_generic.go b/src/crypto/cipher/xor_generic.go
index b7de608..ca9c4bb 100644
--- a/src/crypto/cipher/xor_generic.go
+++ b/src/crypto/cipher/xor_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!ppc64,!ppc64le
+// +build !amd64,!ppc64,!ppc64le,!arm64
 
 package cipher