reflect, runtime: optimize Value.Call on s390x and add benchmark
Use an MVC loop to copy arguments and return values in runtime.call*
rather than copying them one byte at a time.
Add the benchmark CallArgCopy to measure the speed of Value.Call
for various argument sizes.
name old speed new speed delta
CallArgCopy/size=128 439MB/s ± 1% 582MB/s ± 1% +32.41% (p=0.000 n=10+10)
CallArgCopy/size=256 695MB/s ± 1% 1172MB/s ± 1% +68.67% (p=0.000 n=10+10)
CallArgCopy/size=1024 573MB/s ± 8% 4175MB/s ± 2% +628.11% (p=0.000 n=10+10)
CallArgCopy/size=4096 1.46GB/s ± 2% 10.19GB/s ± 1% +600.52% (p=0.000 n=10+10)
CallArgCopy/size=65536 1.51GB/s ± 0% 12.30GB/s ± 1% +716.30% (p=0.000 n=9+10)
Change-Id: I87dae4809330e7964f6cb4a9e40e5b3254dd519d
Reviewed-on: https://go-review.googlesource.com/28096
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Bill O'Farrell <billotosyr@gmail.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 6d0533a..70e3b5e 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -387,53 +387,55 @@
TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \
NO_LOCAL_POINTERS; \
/* copy arguments to stack */ \
- MOVD arg+16(FP), R3; \
- MOVWZ argsize+24(FP), R4; \
- MOVD R15, R5; \
- ADD $(8-1), R5; \
- SUB $1, R3; \
- ADD R5, R4; \
- CMP R5, R4; \
- BEQ 6(PC); \
- ADD $1, R3; \
- ADD $1, R5; \
- MOVBZ 0(R3), R6; \
- MOVBZ R6, 0(R5); \
- BR -6(PC); \
- /* call function */ \
+ MOVD arg+16(FP), R4; \
+ MOVWZ argsize+24(FP), R5; \
+ MOVD $stack-MAXSIZE(SP), R6; \
+loopArgs: /* copy 256 bytes at a time */ \
+ CMP R5, $256; \
+ BLT tailArgs; \
+ SUB $256, R5; \
+ MVC $256, 0(R4), 0(R6); \
+ MOVD $256(R4), R4; \
+ MOVD $256(R6), R6; \
+ BR loopArgs; \
+tailArgs: /* copy remaining bytes */ \
+ CMP R5, $0; \
+ BEQ callFunction; \
+ SUB $1, R5; \
+ EXRL $callfnMVC<>(SB), R5; \
+callFunction: \
MOVD f+8(FP), R12; \
MOVD (R12), R8; \
PCDATA $PCDATA_StackMapIndex, $0; \
BL (R8); \
/* copy return values back */ \
- MOVD arg+16(FP), R3; \
- MOVWZ n+24(FP), R4; \
- MOVWZ retoffset+28(FP), R6; \
- MOVD R15, R5; \
- ADD R6, R5; \
- ADD R6, R3; \
- SUB R6, R4; \
- ADD $(8-1), R5; \
- SUB $1, R3; \
- ADD R5, R4; \
-loop: \
- CMP R5, R4; \
- BEQ end; \
- ADD $1, R5; \
- ADD $1, R3; \
- MOVBZ 0(R5), R6; \
- MOVBZ R6, 0(R3); \
- BR loop; \
-end: \
+ MOVD arg+16(FP), R6; \
+ MOVWZ n+24(FP), R5; \
+ MOVD $stack-MAXSIZE(SP), R4; \
+ MOVWZ retoffset+28(FP), R1; \
+ ADD R1, R4; \
+ ADD R1, R6; \
+ SUB R1, R5; \
+loopRets: /* copy 256 bytes at a time */ \
+ CMP R5, $256; \
+ BLT tailRets; \
+ SUB $256, R5; \
+ MVC $256, 0(R4), 0(R6); \
+ MOVD $256(R4), R4; \
+ MOVD $256(R6), R6; \
+ BR loopRets; \
+tailRets: /* copy remaining bytes */ \
+ CMP R5, $0; \
+ BEQ writeBarrierUpdates; \
+ SUB $1, R5; \
+ EXRL $callfnMVC<>(SB), R5; \
+writeBarrierUpdates: \
/* execute write barrier updates */ \
- MOVD argtype+0(FP), R7; \
- MOVD arg+16(FP), R3; \
- MOVWZ n+24(FP), R4; \
- MOVWZ retoffset+28(FP), R6; \
- MOVD R7, 8(R15); \
- MOVD R3, 16(R15); \
- MOVD R4, 24(R15); \
- MOVD R6, 32(R15); \
+ MOVD argtype+0(FP), R1; \
+ MOVD arg+16(FP), R2; \
+ MOVWZ n+24(FP), R3; \
+ MOVWZ retoffset+28(FP), R4; \
+ STMG R1, R4, stack-MAXSIZE(SP); \
BL runtime·callwritebarrier(SB); \
RET
@@ -464,6 +466,10 @@
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)
+// Not a function: target for the EXRL (execute relative long) instructions in CALLFN, which run this MVC to copy the final partial chunk (fewer than 256 bytes) from 0(R4) to 0(R6), passing the remaining byte count minus one in R5.
+TEXT callfnMVC<>(SB),NOSPLIT|NOFRAME,$0-0
+ MVC $1, 0(R4), 0(R6)
+
TEXT runtime·procyield(SB),NOSPLIT,$0-0
RET