x/crypto/chacha20: cleanup chacha_ppc64le.s

- Adding PCALIGN before the loops
- Replacing WORD directives with the corresponding Vector Merge Even/Odd Word (VMRGEW/VMRGOW) instructions
- Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ
- Using the VPERMXOR instruction in place of VXOR instructions followed by
  VRLW (rotate left) for cases of rotating in multiples of 8. These
  replacements give a performance improvement of around 7%-8% in both time
  per operation and throughput, as listed below using the benchstat tool.

goos: linux
goarch: ppc64le
pkg: golang.org/x/crypto/chacha20
cpu: POWER10
                 | chacha20.prev.out |       chacha20.new.out            |
                 |      sec/op       |   sec/op     vs base              |
ChaCha20/64              171.9n ± 0%   156.6n ± 1%  -8.90% (p=0.002 n=6)
ChaCha20/256             165.5n ± 0%   152.4n ± 0%  -7.92% (p=0.002 n=6)
ChaCha20/10x25           505.8n ± 0%   504.3n ± 2%  -0.32% (p=0.589 n=6)
ChaCha20/4096            2.265µ ± 0%   2.052µ ± 0%  -9.40% (p=0.002 n=6)
ChaCha20/100x40          5.359µ ± 3%   5.018µ ± 2%  -6.37% (p=0.002 n=6)
ChaCha20/65536           35.71µ ± 0%   32.29µ ± 0%  -9.57% (p=0.002 n=6)
ChaCha20/1000x65         44.63µ ± 0%   41.05µ ± 0%  -8.02% (p=0.002 n=6)
geomean                  2.235µ        2.073µ       -7.26%

                 | chacha20.prev.out |          chacha20.new.out         |
                 |       B/s         |     B/s       vs base             |
ChaCha20/64             355.1Mi ± 0%   389.8Mi ± 1%   +9.78% (p=0.002 n=6)
ChaCha20/256            1.440Gi ± 0%   1.565Gi ± 0%   +8.62% (p=0.002 n=6)
ChaCha20/10x25          471.3Mi ± 0%   472.8Mi ± 2%   +0.31% (p=0.589 n=6)
ChaCha20/4096           1.684Gi ± 0%   1.859Gi ± 0%  +10.38% (p=0.002 n=6)
ChaCha20/100x40         711.8Mi ± 3%   760.3Mi ± 2%   +6.80% (p=0.002 n=6)
ChaCha20/65536          1.709Gi ± 0%   1.890Gi ± 0%  +10.59% (p=0.002 n=6)
ChaCha20/1000x65        1.356Gi ± 0%   1.475Gi ± 0%   +8.72% (p=0.002 n=6)
geomean                 957.3Mi        1.008Gi        +7.83%
Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s
index 66aebae..c672ccf 100644
--- a/chacha20/chacha_ppc64le.s
+++ b/chacha20/chacha_ppc64le.s
@@ -33,6 +33,9 @@
 #define CONSTBASE  R16
 #define BLOCKS R17
 
+// for VPERMXOR
+#define MASK  R18
+
 DATA consts<>+0x00(SB)/8, $0x3320646e61707865
 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
 DATA consts<>+0x10(SB)/8, $0x0000000000000001
@@ -53,7 +56,11 @@
 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
 DATA consts<>+0x90(SB)/8, $0x0000000100000000
 DATA consts<>+0x98(SB)/8, $0x0000000300000002
-GLOBL consts<>(SB), RODATA, $0xa0
+DATA consts<>+0xa0(SB)/8, $0x5566774411223300
+DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
+DATA consts<>+0xb0(SB)/8, $0x6677445522330011
+DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+GLOBL consts<>(SB), RODATA, $0xc0
 
 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
@@ -70,6 +77,9 @@
 	MOVD $48, R10
 	MOVD $64, R11
 	SRD $6, LEN, BLOCKS
+	// for VPERMXOR
+	MOVD $consts<>+0xa0(SB), MASK
+	MOVD $16, R20
 	// V16
 	LXVW4X (CONSTBASE)(R0), VS48
 	ADD $80,CONSTBASE
@@ -87,6 +97,10 @@
 	// V28
 	LXVW4X (CONSTBASE)(R11), VS60
 
+	// Load mask constants for VPERMXOR
+	LXVW4X (MASK)(R0), V20
+	LXVW4X (MASK)(R20), V21
+
 	// splat slot from V19 -> V26
 	VSPLTW $0, V19, V26
 
@@ -97,7 +111,7 @@
 
 	MOVD $10, R14
 	MOVD R14, CTR
-
+	PCALIGN $16
 loop_outer_vsx:
 	// V0, V1, V2, V3
 	LXVW4X (R0)(CONSTBASE), VS32
@@ -128,22 +142,17 @@
 	VSPLTISW $12, V28
 	VSPLTISW $8, V29
 	VSPLTISW $7, V30
-
+	PCALIGN $16
 loop_vsx:
 	VADDUWM V0, V4, V0
 	VADDUWM V1, V5, V1
 	VADDUWM V2, V6, V2
 	VADDUWM V3, V7, V3
 
-	VXOR V12, V0, V12
-	VXOR V13, V1, V13
-	VXOR V14, V2, V14
-	VXOR V15, V3, V15
-
-	VRLW V12, V27, V12
-	VRLW V13, V27, V13
-	VRLW V14, V27, V14
-	VRLW V15, V27, V15
+	VPERMXOR V12, V0, V21, V12
+	VPERMXOR V13, V1, V21, V13
+	VPERMXOR V14, V2, V21, V14
+	VPERMXOR V15, V3, V21, V15
 
 	VADDUWM V8, V12, V8
 	VADDUWM V9, V13, V9
@@ -165,15 +174,10 @@
 	VADDUWM V2, V6, V2
 	VADDUWM V3, V7, V3
 
-	VXOR V12, V0, V12
-	VXOR V13, V1, V13
-	VXOR V14, V2, V14
-	VXOR V15, V3, V15
-
-	VRLW V12, V29, V12
-	VRLW V13, V29, V13
-	VRLW V14, V29, V14
-	VRLW V15, V29, V15
+	VPERMXOR V12, V0, V20, V12
+	VPERMXOR V13, V1, V20, V13
+	VPERMXOR V14, V2, V20, V14
+	VPERMXOR V15, V3, V20, V15
 
 	VADDUWM V8, V12, V8
 	VADDUWM V9, V13, V9
@@ -195,15 +199,10 @@
 	VADDUWM V2, V7, V2
 	VADDUWM V3, V4, V3
 
-	VXOR V15, V0, V15
-	VXOR V12, V1, V12
-	VXOR V13, V2, V13
-	VXOR V14, V3, V14
-
-	VRLW V15, V27, V15
-	VRLW V12, V27, V12
-	VRLW V13, V27, V13
-	VRLW V14, V27, V14
+	VPERMXOR V15, V0, V21, V15
+	VPERMXOR V12, V1, V21, V12
+	VPERMXOR V13, V2, V21, V13
+	VPERMXOR V14, V3, V21, V14
 
 	VADDUWM V10, V15, V10
 	VADDUWM V11, V12, V11
@@ -225,15 +224,10 @@
 	VADDUWM V2, V7, V2
 	VADDUWM V3, V4, V3
 
-	VXOR V15, V0, V15
-	VXOR V12, V1, V12
-	VXOR V13, V2, V13
-	VXOR V14, V3, V14
-
-	VRLW V15, V29, V15
-	VRLW V12, V29, V12
-	VRLW V13, V29, V13
-	VRLW V14, V29, V14
+	VPERMXOR V15, V0, V20, V15
+	VPERMXOR V12, V1, V20, V12
+	VPERMXOR V13, V2, V20, V13
+	VPERMXOR V14, V3, V20, V14
 
 	VADDUWM V10, V15, V10
 	VADDUWM V11, V12, V11
@@ -249,48 +243,48 @@
 	VRLW V6, V30, V6
 	VRLW V7, V30, V7
 	VRLW V4, V30, V4
-	BC   16, LT, loop_vsx
+	BDNZ   loop_vsx
 
 	VADDUWM V12, V26, V12
 
-	WORD $0x13600F8C		// VMRGEW V0, V1, V27
-	WORD $0x13821F8C		// VMRGEW V2, V3, V28
+	VMRGEW V0, V1, V27
+	VMRGEW V2, V3, V28
 
-	WORD $0x10000E8C		// VMRGOW V0, V1, V0
-	WORD $0x10421E8C		// VMRGOW V2, V3, V2
+	VMRGOW V0, V1, V0
+	VMRGOW V2, V3, V2
 
-	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
-	WORD $0x13C63F8C		// VMRGEW V6, V7, V30
+	VMRGEW V4, V5, V29
+	VMRGEW V6, V7, V30
 
 	XXPERMDI VS32, VS34, $0, VS33
 	XXPERMDI VS32, VS34, $3, VS35
 	XXPERMDI VS59, VS60, $0, VS32
 	XXPERMDI VS59, VS60, $3, VS34
 
-	WORD $0x10842E8C		// VMRGOW V4, V5, V4
-	WORD $0x10C63E8C		// VMRGOW V6, V7, V6
+	VMRGOW V4, V5, V4
+	VMRGOW V6, V7, V6
 
-	WORD $0x13684F8C		// VMRGEW V8, V9, V27
-	WORD $0x138A5F8C		// VMRGEW V10, V11, V28
+	VMRGEW V8, V9, V27
+	VMRGEW V10, V11, V28
 
 	XXPERMDI VS36, VS38, $0, VS37
 	XXPERMDI VS36, VS38, $3, VS39
 	XXPERMDI VS61, VS62, $0, VS36
 	XXPERMDI VS61, VS62, $3, VS38
 
-	WORD $0x11084E8C		// VMRGOW V8, V9, V8
-	WORD $0x114A5E8C		// VMRGOW V10, V11, V10
+	VMRGOW V8, V9, V8
+	VMRGOW V10, V11, V10
 
-	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
-	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30
+	VMRGEW V12, V13, V29
+	VMRGEW V14, V15, V30
 
 	XXPERMDI VS40, VS42, $0, VS41
 	XXPERMDI VS40, VS42, $3, VS43
 	XXPERMDI VS59, VS60, $0, VS40
 	XXPERMDI VS59, VS60, $3, VS42
 
-	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
-	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14
+	VMRGOW V12, V13, V12
+	VMRGOW V14, V15, V14
 
 	VSPLTISW $4, V27
 	VADDUWM V26, V27, V26
@@ -431,7 +425,7 @@
 	ADD $-1, R11, R12
 	ADD $-1, INP
 	ADD $-1, OUT
-
+	PCALIGN $16
 looptail_vsx:
 	// Copying the result to OUT
 	// in bytes.
@@ -439,7 +433,7 @@
 	MOVBZU 1(INP), TMP
 	XOR    KEY, TMP, KEY
 	MOVBU  KEY, 1(OUT)
-	BC     16, LT, looptail_vsx
+	BDNZ   looptail_vsx
 
 	// Clear the stack values
 	STXVW4X VS48, (R11)(R0)