runtime/cgo: save correct floating point registers on s390x

When transitioning from C code to Go code we must respect the C
calling convention. On s390x this means that r6-r13, r15 and f8-f15
must be saved and restored by functions that use them.

On s390x we were saving the wrong set of floating point registers
(f0, f2, f4 and f6) rather than f8-f15 which means that Go code
could clobber registers that C code expects to be restored. This
CL modifies the crosscall functions on s390x to save/restore the
correct floating point registers.

Fixes #18035.

Change-Id: I5cc6f552c893a4e677669c8891521bf735492e97
Reviewed-on: https://go-review.googlesource.com/33571
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 026c18c..c2212a5 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -1305,13 +1305,9 @@
 // However, since this function is only called once per loaded module
 // performance is unimportant.
 TEXT runtime·addmoduledata(SB),NOSPLIT|NOFRAME,$0-0
-	// Save R6-R15, F0, F2, F4 and F6 in the
-	// register save area of the calling function
+	// Save R6-R15 in the register save area of the calling function.
+	// Don't bother saving F8-F15 as we aren't doing any calls.
 	STMG	R6, R15, 48(R15)
-	FMOVD	F0, 128(R15)
-	FMOVD	F2, 136(R15)
-	FMOVD	F4, 144(R15)
-	FMOVD	F6, 152(R15)
 
 	// append the argument (passed in R2, as per the ELF ABI) to the
 	// moduledata linked list.
@@ -1319,12 +1315,8 @@
 	MOVD	R2, moduledata_next(R1)
 	MOVD	R2, runtime·lastmoduledatap(SB)
 
-	// Restore R6-R15, F0, F2, F4 and F6
+	// Restore R6-R15.
 	LMG	48(R15), R6, R15
-	FMOVD	F0, 128(R15)
-	FMOVD	F2, 136(R15)
-	FMOVD	F4, 144(R15)
-	FMOVD	F6, 152(R15)
 	RET
 
 TEXT ·checkASM(SB),NOSPLIT,$0-1
diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s
index ae688b6..7eab8f6 100644
--- a/src/runtime/cgo/asm_s390x.s
+++ b/src/runtime/cgo/asm_s390x.s
@@ -8,36 +8,46 @@
 // func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
 // Saves C callee-saved registers and calls fn with three arguments.
 TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
-	// Start with standard C stack frame layout and linkage
+	// Start with standard C stack frame layout and linkage.
 
-	// Save R6-R15, F0, F2, F4 and F6 in the
-	// register save area of the calling function
+	// Save R6-R15 in the register save area of the calling function.
 	STMG	R6, R15, 48(R15)
-	FMOVD	F0, 128(R15)
-	FMOVD	F2, 136(R15)
-	FMOVD	F4, 144(R15)
-	FMOVD	F6, 152(R15)
 
-	// Initialize Go ABI environment
-	XOR	R0, R0
+	// Allocate 96 bytes on the stack.
+	MOVD	$-96(R15), R15
+
+	// Save F8-F15 in our stack frame.
+	FMOVD	F8, 32(R15)
+	FMOVD	F9, 40(R15)
+	FMOVD	F10, 48(R15)
+	FMOVD	F11, 56(R15)
+	FMOVD	F12, 64(R15)
+	FMOVD	F13, 72(R15)
+	FMOVD	F14, 80(R15)
+	FMOVD	F15, 88(R15)
+
+	// Initialize Go ABI environment.
 	BL	runtime·load_g(SB)
 
-	// Allocate 32 bytes on the stack
-	SUB	$32, R15
-
 	MOVD	R3, 8(R15)  // arg1
 	MOVW	R4, 16(R15) // arg2
 	MOVD	R5, 24(R15) // arg3
 	BL	(R2)        // fn(arg1, arg2, arg3)
 
-	ADD	$32, R15
+	FMOVD	32(R15), F8
+	FMOVD	40(R15), F9
+	FMOVD	48(R15), F10
+	FMOVD	56(R15), F11
+	FMOVD	64(R15), F12
+	FMOVD	72(R15), F13
+	FMOVD	80(R15), F14
+	FMOVD	88(R15), F15
 
-	// Restore R6-R15, F0, F2, F4 and F6
+	// De-allocate stack frame.
+	MOVD	$96(R15), R15
+
+	// Restore R6-R15.
 	LMG	48(R15), R6, R15
-	FMOVD	F0, 128(R15)
-	FMOVD	F2, 136(R15)
-	FMOVD	F4, 144(R15)
-	FMOVD	F6, 152(R15)
 
 	RET
 
diff --git a/src/runtime/cgo/gcc_s390x.S b/src/runtime/cgo/gcc_s390x.S
index db654e4..614de4b 100644
--- a/src/runtime/cgo/gcc_s390x.S
+++ b/src/runtime/cgo/gcc_s390x.S
@@ -6,38 +6,48 @@
  * void crosscall_s390x(void (*fn)(void), void *g)
  *
  * Calling into the go tool chain, where all registers are caller save.
- * Called from standard s390x C ABI, where r6-r13, r15, and f0, f2, f4 and f6 are
+ * Called from standard s390x C ABI, where r6-r13, r15, and f8-f15 are
  * callee-save, so they must be saved explicitly.
  */
 .globl crosscall_s390x
 crosscall_s390x:
-	/*
-	 * save r6-r15, f0, f2, f4 and f6 in the
-	 * register save area of the calling function
-	 */
-	stmg	%r6, %r15, 48(%r15)
-	stdy	%f0, 128(%r15)
-	stdy	%f2, 136(%r15)
-	stdy	%f4, 144(%r15)
-	stdy	%f6, 152(%r15)
+	/* save r6-r15 in the register save area of the calling function */
+	stmg    %r6, %r15, 48(%r15)
 
-	/* set r0 to 0 */
-	xgr	%r0, %r0
+	/* allocate 64 bytes of stack space to save f8-f15 */
+	lay     %r15, -64(%r15)
+
+	/* save callee-saved floating point registers */
+	std     %f8, 0(%r15)
+	std     %f9, 8(%r15)
+	std     %f10, 16(%r15)
+	std     %f11, 24(%r15)
+	std     %f12, 32(%r15)
+	std     %f13, 40(%r15)
+	std     %f14, 48(%r15)
+	std     %f15, 56(%r15)
 
 	/* restore g pointer */
-	lgr	%r13, %r3
+	lgr     %r13, %r3
 
-	/* grow stack 8 bytes and call fn */
-	agfi    %r15, -8
+	/* call fn */
 	basr    %r14, %r2
-	agfi	%r15, 8
 
-	/* restore registers */
-	lmg	%r6, %r15, 48(%r15)
-	ldy	%f0, 128(%r15)
-	ldy	%f2, 136(%r15)
-	ldy	%f4, 144(%r15)
-	ldy	%f6, 152(%r15)
+	/* restore floating point registers */
+	ld      %f8, 0(%r15)
+	ld      %f9, 8(%r15)
+	ld      %f10, 16(%r15)
+	ld      %f11, 24(%r15)
+	ld      %f12, 32(%r15)
+	ld      %f13, 40(%r15)
+	ld      %f14, 48(%r15)
+	ld      %f15, 56(%r15)
+
+	/* de-allocate stack frame */
+	la      %r15, 64(%r15)
+
+	/* restore general purpose registers */
+	lmg     %r6, %r15, 48(%r15)
 
 	br      %r14 /* restored by lmg */