reflect, runtime: optimize Value.Call on s390x and add benchmark

Use an MVC loop to copy arguments and return values in runtime.call*
rather than copying bytes individually.

I've added the benchmark CallArgCopy to test the speed of Value.Call
for various argument sizes.

name                    old speed      new speed       delta
CallArgCopy/size=128     439MB/s ± 1%    582MB/s ± 1%   +32.41%  (p=0.000 n=10+10)
CallArgCopy/size=256     695MB/s ± 1%   1172MB/s ± 1%   +68.67%  (p=0.000 n=10+10)
CallArgCopy/size=1024    573MB/s ± 8%   4175MB/s ± 2%  +628.11%  (p=0.000 n=10+10)
CallArgCopy/size=4096   1.46GB/s ± 2%  10.19GB/s ± 1%  +600.52%  (p=0.000 n=10+10)
CallArgCopy/size=65536  1.51GB/s ± 0%  12.30GB/s ± 1%  +716.30%   (p=0.000 n=9+10)

Change-Id: I87dae4809330e7964f6cb4a9e40e5b3254dd519d
Reviewed-on: https://go-review.googlesource.com/28096
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Bill O'Farrell <billotosyr@gmail.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 6d0533a..70e3b5e 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -387,53 +387,55 @@
 TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVD	arg+16(FP), R3;			\
-	MOVWZ	argsize+24(FP), R4;			\
-	MOVD	R15, R5;				\
-	ADD	$(8-1), R5;			\
-	SUB	$1, R3;				\
-	ADD	R5, R4;				\
-	CMP	R5, R4;				\
-	BEQ	6(PC);				\
-	ADD	$1, R3;				\
-	ADD	$1, R5;				\
-	MOVBZ	0(R3), R6;			\
-	MOVBZ	R6, 0(R5);			\
-	BR	-6(PC);				\
-	/* call function */			\
+	MOVD	arg+16(FP), R4;			\
+	MOVWZ	argsize+24(FP), R5;		\
+	MOVD	$stack-MAXSIZE(SP), R6;		\
+loopArgs: /* copy 256 bytes at a time */	\
+	CMP	R5, $256;			\
+	BLT	tailArgs;			\
+	SUB	$256, R5;			\
+	MVC	$256, 0(R4), 0(R6);		\
+	MOVD	$256(R4), R4;			\
+	MOVD	$256(R6), R6;			\
+	BR	loopArgs;			\
+tailArgs: /* copy remaining bytes */		\
+	CMP	R5, $0;				\
+	BEQ	callFunction;			\
+	SUB	$1, R5;				\
+	EXRL	$callfnMVC<>(SB), R5;		\
+callFunction:					\
 	MOVD	f+8(FP), R12;			\
 	MOVD	(R12), R8;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	BL	(R8);				\
 	/* copy return values back */		\
-	MOVD	arg+16(FP), R3;			\
-	MOVWZ	n+24(FP), R4;			\
-	MOVWZ	retoffset+28(FP), R6;		\
-	MOVD	R15, R5;				\
-	ADD	R6, R5; 			\
-	ADD	R6, R3;				\
-	SUB	R6, R4;				\
-	ADD	$(8-1), R5;			\
-	SUB	$1, R3;				\
-	ADD	R5, R4;				\
-loop:						\
-	CMP	R5, R4;				\
-	BEQ	end;				\
-	ADD	$1, R5;				\
-	ADD	$1, R3;				\
-	MOVBZ	0(R5), R6;			\
-	MOVBZ	R6, 0(R3);			\
-	BR	loop;				\
-end:						\
+	MOVD	arg+16(FP), R6;			\
+	MOVWZ	n+24(FP), R5;			\
+	MOVD	$stack-MAXSIZE(SP), R4;		\
+	MOVWZ	retoffset+28(FP), R1;		\
+	ADD	R1, R4;				\
+	ADD	R1, R6;				\
+	SUB	R1, R5;				\
+loopRets: /* copy 256 bytes at a time */	\
+	CMP	R5, $256;			\
+	BLT	tailRets;			\
+	SUB	$256, R5;			\
+	MVC	$256, 0(R4), 0(R6);		\
+	MOVD	$256(R4), R4;			\
+	MOVD	$256(R6), R6;			\
+	BR	loopRets;			\
+tailRets: /* copy remaining bytes */		\
+	CMP	R5, $0;				\
+	BEQ	writeBarrierUpdates;		\
+	SUB	$1, R5;				\
+	EXRL	$callfnMVC<>(SB), R5;		\
+writeBarrierUpdates:				\
 	/* execute write barrier updates */	\
-	MOVD	argtype+0(FP), R7;		\
-	MOVD	arg+16(FP), R3;			\
-	MOVWZ	n+24(FP), R4;			\
-	MOVWZ	retoffset+28(FP), R6;		\
-	MOVD	R7, 8(R15);			\
-	MOVD	R3, 16(R15);			\
-	MOVD	R4, 24(R15);			\
-	MOVD	R6, 32(R15);			\
+	MOVD	argtype+0(FP), R1;		\
+	MOVD	arg+16(FP), R2;			\
+	MOVWZ	n+24(FP), R3;			\
+	MOVWZ	retoffset+28(FP), R4;		\
+	STMG	R1, R4, stack-MAXSIZE(SP);	\
 	BL	runtime·callwritebarrier(SB);	\
 	RET
 
@@ -464,6 +466,10 @@
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
+// Not a function: a lone MVC used only as the target of the EXRL (execute relative long) in the call* copy tails, which pass (bytes remaining - 1) in R5.
+TEXT callfnMVC<>(SB),NOSPLIT|NOFRAME,$0-0
+	MVC	$1, 0(R4), 0(R6) // source 0(R4), destination 0(R6), as in the 256-byte loops; NOTE(review): presumably EXRL merges R5's low byte into the MVC length field - confirm
+
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	RET