runtime: improve memmove for ppc64x
This improves performance of memmove for larger moves by
unrolling the main loop from 32 byte to 64 byte moves.
The improvement of the relevant sizes on a power9:
Memmove/64 5.11ns ± 0% 5.00ns ± 0% -2.21%
Memmove/128 8.26ns ± 0% 5.88ns ± 0% -28.83%
Memmove/256 12.7ns ± 0% 8.6ns ± 0% -31.94%
Memmove/512 17.9ns ± 0% 14.3ns ± 0% -19.87%
Memmove/1024 33.3ns ± 0% 27.0ns ± 0% -18.92%
Memmove/2048 72.1ns ± 0% 51.8ns ± 0% -28.25%
Memmove/4096 126ns ± 0% 110ns ± 0% -12.63%
Change-Id: I74162a9f152d7752a8281da1b89a66da99a3fdc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/399499
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 25101a2..5fa51c0 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -24,8 +24,12 @@
#define IDX16 R8
// temp used for copies, etc.
#define TMP R9
-// number of 32 byte chunks
+// number of 64 byte chunks
#define QWORDS R10
+// index values
+#define IDX32 R14
+#define IDX48 R15
+#define OCTWORDS R16
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
// R3 = TGT = to
@@ -52,28 +56,46 @@
// Copying forward if no overlap.
BC 12, 6, checkbytes // BEQ CR1, checkbytes
- SRDCC $2, DWORDS, QWORDS // 32 byte chunks?
- BEQ lt32gt8 // < 32 bytes
+ SRDCC $3, DWORDS, OCTWORDS // 64 byte chunks?
+ MOVD $16, IDX16
+ BEQ lt64gt8 // < 64 bytes
- // Prepare for moves of 32 bytes at a time.
+ // Prepare for moves of 64 bytes at a time.
-forward32setup:
+forward64setup:
DCBTST (TGT) // prepare data cache
DCBT (SRC)
- MOVD QWORDS, CTR // Number of 32 byte chunks
- MOVD $16, IDX16 // 16 for index
+ MOVD OCTWORDS, CTR // Number of 64 byte chunks
+ MOVD $32, IDX32
+ MOVD $48, IDX48
+ PCALIGN $32
-forward32:
- LXVD2X (R0)(SRC), VS32 // load 16 bytes
- LXVD2X (IDX16)(SRC), VS33 // load 16 bytes
- ADD $32, SRC
- STXVD2X VS32, (R0)(TGT) // store 16 bytes
+forward64:
+ LXVD2X (R0)(SRC), VS32 // load 64 bytes
+ LXVD2X (IDX16)(SRC), VS33
+ LXVD2X (IDX32)(SRC), VS34
+ LXVD2X (IDX48)(SRC), VS35
+ ADD $64, SRC
+ STXVD2X VS32, (R0)(TGT) // store 64 bytes
STXVD2X VS33, (IDX16)(TGT)
- ADD $32,TGT // bump up for next set
- BC 16, 0, forward32 // continue
- ANDCC $3, DWORDS // remaining doublewords
+ STXVD2X VS34, (IDX32)(TGT)
+ STXVD2X VS35, (IDX48)(TGT)
+ ADD $64,TGT // bump up for next set
+ BC 16, 0, forward64 // continue
+ ANDCC $7, DWORDS // remaining doublewords
BEQ checkbytes // only bytes remain
+lt64gt8:
+ CMP DWORDS, $4
+ BLT lt32gt8
+ LXVD2X (R0)(SRC), VS32
+ LXVD2X (IDX16)(SRC), VS33
+ ADD $-4, DWORDS
+ STXVD2X VS32, (R0)(TGT)
+ STXVD2X VS33, (IDX16)(TGT)
+ ADD $32, SRC
+ ADD $32, TGT
+
lt32gt8:
// At this point >= 8 and < 32
// Move 16 bytes if possible
@@ -134,7 +156,7 @@
SUB $1,SRC
MOVBZ TMP, -1(TGT)
SUB $1,TGT
- BC 16, 0, backwardtailloop // bndz
+ BDNZ backwardtailloop
nobackwardtail:
BC 4, 5, LR // blelr cr1, return if DWORDS == 0
@@ -169,6 +191,6 @@
LXVD2X (IDX16)(SRC), VS33
STXVD2X VS32, (R0)(TGT) // store 16x2 bytes
STXVD2X VS33, (IDX16)(TGT)
- BC 16, 0, backward32loop // bndz
+ BDNZ backward32loop
BC 12, 2, LR // beqlr, return if DWORDS == 0
BR backward24