runtime: improve memmove for ppc64x

This improves the performance of memmove for larger moves by
unrolling the main loop from 32-byte to 64-byte moves.
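
As a rough sketch of the resulting shape (illustrative Go, not the
actual assembly; copyForward and its layout here are hypothetical),
the forward path now moves 64 bytes per iteration, with a single
32-byte step covering part of the remainder:

	// Sketch only: mirrors the structure of the assembly's forward path.
	func copyForward(dst, src []byte) {
		i, n := 0, len(src)
		for ; n-i >= 64; i += 64 {
			// forward64: four 16-byte VSX load/store pairs per iteration
			copy(dst[i:i+64], src[i:i+64])
		}
		if n-i >= 32 {
			// lt64gt8: one 32-byte move if at least 4 doublewords remain
			copy(dst[i:i+32], src[i:i+32])
			i += 32
		}
		// lt32gt8/checkbytes: remaining 16/8-byte moves and tail bytes
		copy(dst[i:], src[i:n])
	}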

The improvement for the relevant sizes on a power9:

name            old time/op    new time/op   delta
Memmove/64      5.11ns ± 0%    5.00ns ± 0%   -2.21%
Memmove/128     8.26ns ± 0%    5.88ns ± 0%  -28.83%
Memmove/256     12.7ns ± 0%     8.6ns ± 0%  -31.94%
Memmove/512     17.9ns ± 0%    14.3ns ± 0%  -19.87%
Memmove/1024    33.3ns ± 0%    27.0ns ± 0%  -18.92%
Memmove/2048    72.1ns ± 0%    51.8ns ± 0%  -28.25%
Memmove/4096     126ns ± 0%     110ns ± 0%  -12.63%
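
The table is in benchstat form; a typical way to regenerate such
numbers (file names here are illustrative) is to run the runtime
Memmove benchmarks before and after the change and compare:

	go test runtime -run '^$' -bench 'Memmove$' -count 10 > old.txt
	# apply the patch, then:
	go test runtime -run '^$' -bench 'Memmove$' -count 10 > new.txt
	benchstat old.txt new.txt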

Change-Id: I74162a9f152d7752a8281da1b89a66da99a3fdc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/399499
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 25101a2..5fa51c0 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -24,8 +24,13 @@
 #define IDX16 R8
 // temp used for copies, etc.
 #define TMP R9
 // number of 32 byte chunks
 #define QWORDS R10
+// index values
+#define IDX32 R14
+#define IDX48 R15
+// number of 64 byte chunks
+#define OCTWORDS R16
 
 TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
 	// R3 = TGT = to
@@ -52,28 +56,46 @@
 	// Copying forward if no overlap.
 
 	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
-	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
-	BEQ	lt32gt8			// < 32 bytes
+	SRDCC	$3, DWORDS, OCTWORDS	// 64 byte chunks?
+	MOVD	$16, IDX16
+	BEQ	lt64gt8			// < 64 bytes
 
-	// Prepare for moves of 32 bytes at a time.
+	// Prepare for moves of 64 bytes at a time.
 
-forward32setup:
+forward64setup:
 	DCBTST	(TGT)			// prepare data cache
 	DCBT	(SRC)
-	MOVD	QWORDS, CTR		// Number of 32 byte chunks
-	MOVD	$16, IDX16		// 16 for index
+	MOVD	OCTWORDS, CTR		// Number of 64 byte chunks
+	MOVD	$32, IDX32
+	MOVD	$48, IDX48
+	PCALIGN	$32
 
-forward32:
-	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
-	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
-	ADD	$32, SRC
-	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
+forward64:
+	LXVD2X	(R0)(SRC), VS32		// load 64 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	LXVD2X	(IDX32)(SRC), VS34
+	LXVD2X	(IDX48)(SRC), VS35
+	ADD	$64, SRC
+	STXVD2X	VS32, (R0)(TGT)		// store 64 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	ADD	$32,TGT			// bump up for next set
-	BC	16, 0, forward32	// continue
-	ANDCC	$3, DWORDS		// remaining doublewords
+	STXVD2X	VS34, (IDX32)(TGT)
+	STXVD2X	VS35, (IDX48)(TGT)
+	ADD	$64, TGT		// bump up for next set
+	BC	16, 0, forward64	// continue
+	ANDCC	$7, DWORDS		// remaining doublewords
 	BEQ	checkbytes		// only bytes remain
 
+lt64gt8:
+	CMP	DWORDS, $4		// at least 4 doublewords (32 bytes) left?
+	BLT	lt32gt8
+	LXVD2X	(R0)(SRC), VS32		// move 32 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	ADD	$-4, DWORDS		// 4 fewer doublewords remain
+	STXVD2X	VS32, (R0)(TGT)
+	STXVD2X	VS33, (IDX16)(TGT)
+	ADD	$32, SRC
+	ADD	$32, TGT
+
 lt32gt8:
         // At this point >= 8 and < 32
 	// Move 16 bytes if possible
@@ -134,7 +156,7 @@
 	SUB	$1,SRC
 	MOVBZ 	TMP, -1(TGT)
 	SUB	$1,TGT
-	BC	16, 0, backwardtailloop // bndz
+	BDNZ	backwardtailloop
 
 nobackwardtail:
 	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
@@ -169,6 +191,6 @@
 	LXVD2X	(IDX16)(SRC), VS33
 	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	BC      16, 0, backward32loop	// bndz
+	BDNZ	backward32loop
 	BC	12, 2, LR		// beqlr, return if DWORDS == 0
 	BR	backward24
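
A note on the last two hunks: in Go's PPC64 assembler, BDNZ is the
extended mnemonic for the raw conditional branch "BC 16, 0, target"
(BO=16: decrement CTR and branch if it is non-zero), so the rewrite
is purely a readability cleanup:

	BC	16, 0, backward32loop	// raw branch-conditional form
	BDNZ	backward32loop		// equivalent extended mnemonic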