runtime: shared library runtime init for arm

Adds the runtime initialization flow for arm akin to amd64.
In particular, we use the library initialization entry point to:
    - create a new OS thread and run the "regular" runtime init stack on
      that thread
    - return immediately from the main (i.e., loader) thread
    - at the first CGO invocation, we wait for the runtime initialization
      to complete.

Verified to work on a Raspberry Pi and an Android phone.

Change-Id: I32f39228ae30a03ce9569287f234b305790fecf6
Reviewed-on: https://go-review.googlesource.com/8455
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Run-TryBot: Srdjan Petrovic <spetrovic@google.com>
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index 15c1092..cd79619 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -10,6 +10,58 @@
 	MOVW	$_rt0_arm_linux1(SB), R4
 	B		(R4)
 
+TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$40	// library init entry point; 40-byte frame: offsets 4..12 hold outgoing args, 16..36 hold saved registers
+	// Preserve callee-save registers.  Raspberry Pi's dlopen(), for example,
+	// actually cares that R11 is preserved.
+	MOVW	R4, 16(R13)
+	MOVW	R5, 20(R13)
+	MOVW	R6, 24(R13)
+	MOVW	R7, 28(R13)
+	MOVW	R8, 32(R13)
+	MOVW	R11, 36(R13)	// NOTE(review): R9 and R10 are not saved here; confirm neither call path below can clobber them
+
+	// Save argc/argv.
+	MOVW	R0, _rt0_arm_linux_lib_argc<>(SB)	// read back later by _rt0_arm_linux_lib_go
+	MOVW	R1, _rt0_arm_linux_lib_argv<>(SB)
+
+	// Create a new thread to do the runtime initialization.
+	MOVW	_cgo_sys_thread_create(SB), R2	// load the function pointer stored in this variable
+	CMP	$0, R2
+	BEQ	nocgo	// zero pointer: take the runtime·newosproc0 path instead
+	MOVW	$_rt0_arm_linux_lib_go<>(SB), R0	// R0 = address of the function the new thread should run
+	MOVW	$0, R1	// R1 = 0 (presumably the argument passed to that function; confirm against _cgo_sys_thread_create)
+	BL	(R2)	// indirect call through the loaded pointer
+	B	rr	// skip the nocgo path
+nocgo:
+	MOVW	$0x800000, R0                     // stacksize = 8192KB
+	MOVW	$_rt0_arm_linux_lib_go<>(SB), R1  // fn
+	MOVW	$0, R2                            // fnarg
+	MOVW	R0, 4(R13)	// the three arguments are passed on the stack at 4, 8 and 12(R13)
+	MOVW	R1, 8(R13)
+	MOVW	R2, 12(R13)
+	BL	runtime·newosproc0(SB)
+rr:
+	// Restore callee-save registers and return.
+	MOVW	16(R13), R4	// same frame offsets as the saves at the top of the function
+	MOVW	20(R13), R5
+	MOVW	24(R13), R6
+	MOVW	28(R13), R7
+	MOVW	32(R13), R8
+	MOVW	36(R13), R11
+	RET	// returns to the caller without waiting for the new thread
+
+TEXT _rt0_arm_linux_lib_go<>(SB),NOSPLIT,$8	// function handed to the thread-creation call in _rt0_arm_linux_lib; forwards the saved argc/argv to runtime·rt0_go
+	MOVW	_rt0_arm_linux_lib_argc<>(SB), R0	// argc as stored by _rt0_arm_linux_lib
+	MOVW	_rt0_arm_linux_lib_argv<>(SB), R1	// argv as stored by _rt0_arm_linux_lib
+	MOVW	R0, 0(R13)	// also place argc/argv in the 8-byte frame (presumably where rt0_go looks for them; confirm)
+	MOVW	R1, 4(R13)
+	B	runtime·rt0_go(SB)	// plain branch (B, not BL): control is transferred, no link back to this function
+
+DATA _rt0_arm_linux_lib_argc<>(SB)/4,$0	// 4-byte slot for argc, initialized to 0; written by _rt0_arm_linux_lib
+GLOBL _rt0_arm_linux_lib_argc<>(SB),NOPTR,$4	// declared as 4 bytes with the NOPTR flag
+DATA _rt0_arm_linux_lib_argv<>(SB)/4,$0	// 4-byte slot for argv, initialized to 0; written by _rt0_arm_linux_lib
+GLOBL _rt0_arm_linux_lib_argv<>(SB),NOPTR,$4	// declared as 4 bytes with the NOPTR flag
+
 TEXT _rt0_arm_linux1(SB),NOSPLIT,$-4
 	// We first need to detect the kernel ABI, and warn the user
 	// if the system only supports OABI