runtime: improve memmove for ppc64x

This improves the performance of memmove for larger moves by
unrolling the main loop from 32-byte to 64-byte moves.
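
As a rough sketch of the resulting shape (illustrative Go, not the
actual assembly; copyForward and its layout here are hypothetical),
the forward path now moves 64 bytes per iteration, with a single
32-byte step covering part of the remainder:

	// Sketch only: mirrors the structure of the assembly's forward path.
	func copyForward(dst, src []byte) {
		i, n := 0, len(src)
		for ; n-i >= 64; i += 64 {
			// forward64: four 16-byte VSX load/store pairs per iteration
			copy(dst[i:i+64], src[i:i+64])
		}
		if n-i >= 32 {
			// lt64gt8: one 32-byte move if at least 4 doublewords remain
			copy(dst[i:i+32], src[i:i+32])
			i += 32
		}
		// lt32gt8/checkbytes: remaining 16/8-byte moves and tail bytes
		copy(dst[i:], src[i:n])
	}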

The improvement for the relevant sizes on a power9:

name            old time/op    new time/op   delta
Memmove/64      5.11ns ± 0%    5.00ns ± 0%   -2.21%
Memmove/128     8.26ns ± 0%    5.88ns ± 0%  -28.83%
Memmove/256     12.7ns ± 0%     8.6ns ± 0%  -31.94%
Memmove/512     17.9ns ± 0%    14.3ns ± 0%  -19.87%
Memmove/1024    33.3ns ± 0%    27.0ns ± 0%  -18.92%
Memmove/2048    72.1ns ± 0%    51.8ns ± 0%  -28.25%
Memmove/4096     126ns ± 0%     110ns ± 0%  -12.63%
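
The table is in benchstat form; a typical way to regenerate such
numbers (file names here are illustrative) is to run the runtime
Memmove benchmarks before and after the change and compare:

	go test runtime -run '^$' -bench 'Memmove$' -count 10 > old.txt
	# apply the patch, then:
	go test runtime -run '^$' -bench 'Memmove$' -count 10 > new.txt
	benchstat old.txt new.txt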

Change-Id: I74162a9f152d7752a8281da1b89a66da99a3fdc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/399499
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 25101a2..5fa51c0 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -24,8 +24,13 @@
 #define IDX16 R8
 // temp used for copies, etc.
 #define TMP R9
 // number of 32 byte chunks
 #define QWORDS R10
+// index values
+#define IDX32 R14
+#define IDX48 R15
+// number of 64 byte chunks
+#define OCTWORDS R16
 
 TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
 	// R3 = TGT = to
@@ -52,28 +56,46 @@
 	// Copying forward if no overlap.
 
 	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
-	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
-	BEQ	lt32gt8			// < 32 bytes
+	SRDCC	$3, DWORDS, OCTWORDS	// 64 byte chunks?
+	MOVD	$16, IDX16
+	BEQ	lt64gt8			// < 64 bytes
 
-	// Prepare for moves of 32 bytes at a time.
+	// Prepare for moves of 64 bytes at a time.
 
-forward32setup:
+forward64setup:
 	DCBTST	(TGT)			// prepare data cache
 	DCBT	(SRC)
-	MOVD	QWORDS, CTR		// Number of 32 byte chunks
-	MOVD	$16, IDX16		// 16 for index
+	MOVD	OCTWORDS, CTR		// Number of 64 byte chunks
+	MOVD	$32, IDX32
+	MOVD	$48, IDX48
+	PCALIGN	$32
 
-forward32:
-	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
-	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
-	ADD	$32, SRC
-	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
+forward64:
+	LXVD2X	(R0)(SRC), VS32		// load 64 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	LXVD2X	(IDX32)(SRC), VS34
+	LXVD2X	(IDX48)(SRC), VS35
+	ADD	$64, SRC
+	STXVD2X	VS32, (R0)(TGT)		// store 64 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	ADD	$32,TGT			// bump up for next set
-	BC	16, 0, forward32	// continue
-	ANDCC	$3, DWORDS		// remaining doublewords
+	STXVD2X	VS34, (IDX32)(TGT)
+	STXVD2X	VS35, (IDX48)(TGT)
+	ADD	$64, TGT		// bump up for next set
+	BC	16, 0, forward64	// continue
+	ANDCC	$7, DWORDS		// remaining doublewords
 	BEQ	checkbytes		// only bytes remain
 
+lt64gt8:
+	CMP	DWORDS, $4		// at least 4 doublewords (32 bytes) left?
+	BLT	lt32gt8
+	LXVD2X	(R0)(SRC), VS32		// move 32 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	ADD	$-4, DWORDS		// 4 fewer doublewords remain
+	STXVD2X	VS32, (R0)(TGT)
+	STXVD2X	VS33, (IDX16)(TGT)
+	ADD	$32, SRC
+	ADD	$32, TGT
+
 lt32gt8:
         // At this point >= 8 and < 32
 	// Move 16 bytes if possible
@@ -134,7 +156,7 @@
 	SUB	$1,SRC
 	MOVBZ 	TMP, -1(TGT)
 	SUB	$1,TGT
-	BC	16, 0, backwardtailloop // bndz
+	BDNZ	backwardtailloop
 
 nobackwardtail:
 	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
@@ -169,6 +191,6 @@
 	LXVD2X	(IDX16)(SRC), VS33
 	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	BC      16, 0, backward32loop	// bndz
+	BDNZ	backward32loop
 	BC	12, 2, LR		// beqlr, return if DWORDS == 0
 	BR	backward24
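
A note on the last two hunks: in Go's PPC64 assembler, BDNZ is the
extended mnemonic for the raw conditional branch "BC 16, 0, target"
(BO=16: decrement CTR and branch if it is non-zero), so the rewrite
is purely a readability cleanup:

	BC	16, 0, backward32loop	// raw branch-conditional form
	BDNZ	backward32loop		// equivalent extended mnemonic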