runtime,runtime/cgo: save callee-saved FP register on arm

Other GOARCHs already handle their callee-saved FP registers, but
arm was missing. Without this change, code using Cgo and floating
point code might fail in mysterious and hard to debug ways.

There are no floating point registers when GOARM=5, so skip the
registers when runtime.goarm < 6.

darwin/arm doesn't support GOARM=5, so the check is left out of
rt0_darwin_arm.s.

Fixes #14876

Change-Id: I6bcb90a76df3664d8ba1f33123a74b1eb2c9f8b2
Reviewed-on: https://go-review.googlesource.com/23140
Run-TryBot: Elias Naur <elias.naur@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Minux Ma <minux@golang.org>
diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s
index 08472b6..0f35422 100644
--- a/src/runtime/cgo/asm_arm.s
+++ b/src/runtime/cgo/asm_arm.s
@@ -16,8 +16,40 @@
 	 *  Additionally, runtime·load_g will clobber R0, so we need to save R0
 	 *  nevertheless.
 	 */
+	SUB	$(8*9), R13 // Reserve space for the floating point registers.
 	MOVM.WP	[R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+
+	// Skip floating point registers on GOARM < 6.
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfpsave
+	MOVD	F8, (14*4+8*1)(R13)
+	MOVD	F9, (14*4+8*2)(R13)
+	MOVD	F10, (14*4+8*3)(R13)
+	MOVD	F11, (14*4+8*4)(R13)
+	MOVD	F12, (14*4+8*5)(R13)
+	MOVD	F13, (14*4+8*6)(R13)
+	MOVD	F14, (14*4+8*7)(R13)
+	MOVD	F15, (14*4+8*8)(R13)
+
+skipfpsave:
 	BL	runtime·load_g(SB)
 	MOVW	R15, R14 // R15 is PC.
 	MOVW	0(R13), R15
-	MOVM.IAW	(R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R15]
+
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfprest
+	MOVD	(14*4+8*1)(R13), F8
+	MOVD	(14*4+8*2)(R13), F9
+	MOVD	(14*4+8*3)(R13), F10
+	MOVD	(14*4+8*4)(R13), F11
+	MOVD	(14*4+8*5)(R13), F12
+	MOVD	(14*4+8*6)(R13), F13
+	MOVD	(14*4+8*7)(R13), F14
+	MOVD	(14*4+8*8)(R13), F15
+
+skipfprest:
+	MOVM.IAW	(R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14]
+	ADD	$(8*9), R13
+	MOVW	R14, R15
diff --git a/src/runtime/rt0_darwin_arm.s b/src/runtime/rt0_darwin_arm.s
index 59733d3..526d88f 100644
--- a/src/runtime/rt0_darwin_arm.s
+++ b/src/runtime/rt0_darwin_arm.s
@@ -16,7 +16,7 @@
 //
 // Note that all currently shipping darwin/arm platforms require
 // cgo and do not support c-shared.
-TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$32
+TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$104
 	// Preserve callee-save registers.
 	MOVW    R4, 12(R13)
 	MOVW    R5, 16(R13)
@@ -25,6 +25,15 @@
 	MOVW    R8, 28(R13)
 	MOVW    R11, 32(R13)
 
+	MOVD	F8, (32+8*1)(R13)
+	MOVD	F9, (32+8*2)(R13)
+	MOVD	F10, (32+8*3)(R13)
+	MOVD	F11, (32+8*4)(R13)
+	MOVD	F12, (32+8*5)(R13)
+	MOVD	F13, (32+8*6)(R13)
+	MOVD	F14, (32+8*7)(R13)
+	MOVD	F15, (32+8*8)(R13)
+
 	MOVW  R0, _rt0_arm_darwin_lib_argc<>(SB)
 	MOVW  R1, _rt0_arm_darwin_lib_argv<>(SB)
 
@@ -57,6 +66,14 @@
 	MOVW    24(R13), R7
 	MOVW    28(R13), R8
 	MOVW    32(R13), R11
+	MOVD	(32+8*1)(R13), F8
+	MOVD	(32+8*2)(R13), F9
+	MOVD	(32+8*3)(R13), F10
+	MOVD	(32+8*4)(R13), F11
+	MOVD	(32+8*5)(R13), F12
+	MOVD	(32+8*6)(R13), F13
+	MOVD	(32+8*7)(R13), F14
+	MOVD	(32+8*8)(R13), F15
 	RET
 
 
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index a4419b8..597e642 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -12,7 +12,7 @@
 
 // When building with -buildmode=c-shared, this symbol is called when the shared
 // library is loaded.
-TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$32
+TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$104
 	// Preserve callee-save registers. Raspberry Pi's dlopen(), for example,
 	// actually cares that R11 is preserved.
 	MOVW	R4, 12(R13)
@@ -22,6 +22,19 @@
 	MOVW	R8, 28(R13)
 	MOVW	R11, 32(R13)
 
+	// Skip floating point registers on GOARM < 6.
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfpsave
+	MOVD	F8, (32+8*1)(R13)
+	MOVD	F9, (32+8*2)(R13)
+	MOVD	F10, (32+8*3)(R13)
+	MOVD	F11, (32+8*4)(R13)
+	MOVD	F12, (32+8*5)(R13)
+	MOVD	F13, (32+8*6)(R13)
+	MOVD	F14, (32+8*7)(R13)
+	MOVD	F15, (32+8*8)(R13)
+skipfpsave:
 	// Save argc/argv.
 	MOVW	R0, _rt0_arm_linux_lib_argc<>(SB)
 	MOVW	R1, _rt0_arm_linux_lib_argv<>(SB)
@@ -46,6 +59,18 @@
 	BL	runtime·newosproc0(SB)
 rr:
 	// Restore callee-save registers and return.
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfprest
+	MOVD	(32+8*1)(R13), F8
+	MOVD	(32+8*2)(R13), F9
+	MOVD	(32+8*3)(R13), F10
+	MOVD	(32+8*4)(R13), F11
+	MOVD	(32+8*5)(R13), F12
+	MOVD	(32+8*6)(R13), F13
+	MOVD	(32+8*7)(R13), F14
+	MOVD	(32+8*8)(R13), F15
+skipfprest:
 	MOVW	12(R13), R4
 	MOVW	16(R13), R5
 	MOVW	20(R13), R6