runtime: shared library runtime init for arm

Adds the runtime initialization flow for arm akin to amd64.
In particular,we use the library initialization entry point to:
    - create a new OS thread and run the "regular" runtime init stack on
      that thread
    - return immediately from the main (i.e., loader) thread
    - at the first CGO invocation, we wait for the runtime initialization
      to complete.

Verified to work on a Raspberry Pi and an Android phone.

Change-Id: I32f39228ae30a03ce9569287f234b305790fecf6
Reviewed-on: https://go-review.googlesource.com/8455
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Run-TryBot: Srdjan Petrovic <spetrovic@google.com>
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index 15c1092..cd79619 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -10,6 +10,58 @@
 	MOVW	$_rt0_arm_linux1(SB), R4
 	B		(R4)
 
+TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$40
+	// Preserve callee-save registers.  Raspberry Pi's dlopen(), for example,
+	// actually cares that R11 is preserved.
+	MOVW	R4, 16(R13)
+	MOVW	R5, 20(R13)
+	MOVW	R6, 24(R13)
+	MOVW	R7, 28(R13)
+	MOVW	R8, 32(R13)
+	MOVW	R11, 36(R13)
+
+	// Save argc/argv.
+	MOVW	R0, _rt0_arm_linux_lib_argc<>(SB)
+	MOVW	R1, _rt0_arm_linux_lib_argv<>(SB)
+
+	// Create a new thread to do the runtime initialization.
+	MOVW	_cgo_sys_thread_create(SB), R2
+	CMP	$0, R2
+	BEQ	nocgo
+	MOVW	$_rt0_arm_linux_lib_go<>(SB), R0
+	MOVW	$0, R1
+	BL	(R2)
+	B	rr
+nocgo:
+	MOVW	$0x800000, R0                     // stacksize = 8192KB
+	MOVW	$_rt0_arm_linux_lib_go<>(SB), R1  // fn
+	MOVW	$0, R2                            // fnarg
+	MOVW	R0, 4(R13)
+	MOVW	R1, 8(R13)
+	MOVW	R2, 12(R13)
+	BL	runtime·newosproc0(SB)
+rr:
+	// Restore callee-save registers and return.
+	MOVW	16(R13), R4
+	MOVW	20(R13), R5
+	MOVW	24(R13), R6
+	MOVW	28(R13), R7
+	MOVW	32(R13), R8
+	MOVW	36(R13), R11
+	RET
+
+TEXT _rt0_arm_linux_lib_go<>(SB),NOSPLIT,$8
+	MOVW	_rt0_arm_linux_lib_argc<>(SB), R0
+	MOVW	_rt0_arm_linux_lib_argv<>(SB), R1
+	MOVW	R0, 0(R13)
+	MOVW	R1, 4(R13)
+	B	runtime·rt0_go(SB)
+
+DATA _rt0_arm_linux_lib_argc<>(SB)/4,$0
+GLOBL _rt0_arm_linux_lib_argc<>(SB),NOPTR,$4
+DATA _rt0_arm_linux_lib_argv<>(SB)/4,$0
+GLOBL _rt0_arm_linux_lib_argv<>(SB),NOPTR,$4
+
 TEXT _rt0_arm_linux1(SB),NOSPLIT,$-4
 	// We first need to detect the kernel ABI, and warn the user
 	// if the system only supports OABI
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index db9fcb9..d422f95 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -310,12 +310,43 @@
 	MOVW	R0, (R1)
 
 // int32 clone0(int32 flags, void *stack, void* fn, void* fnarg);
-TEXT runtime·clone0(SB),NOSPLIT,$0
-	// TODO(spetrovic): Implement this method.
-	MOVW	$-1, R0
+TEXT runtime·clone0(SB),NOSPLIT,$0-20
+	MOVW	flags+0(FP), R0
+	MOVW	stack+4(FP), R1
+	// Update child's future stack and save fn and fnarg on it.
+	MOVW	$-8(R1), R1
+	MOVW	fn+8(FP), R6
+	MOVW	R6, 0(R1)
+	MOVW	fnarg+12(FP), R6
+	MOVW	R6, 4(R1)
+	MOVW	$0, R2	// parent tid ptr
+	MOVW	$0, R3	// tls_val
+	MOVW	$0, R4	// child tid ptr
+	MOVW	$0, R5
+	MOVW	$SYS_clone, R7
+	SWI	$0
+
+	// In parent, return.
+	CMP	$0, R0
+	BEQ	3(PC)
 	MOVW	R0, ret+16(FP)
 	RET
 
+	// In child.
+	MOVW	0(R13), R6   // fn
+	MOVW	4(R13), R0   // fnarg
+	MOVW	$8(R13), R13
+	BL	(R6)
+
+	MOVW	$0, R0
+	MOVW	R0, 4(R13)
+	BL	runtime·exit1(SB)
+
+	// It shouldn't return
+	MOVW	$1234, R0
+	MOVW	$1005, R1
+	MOVW	R0, (R1)
+
 TEXT runtime·sigaltstack(SB),NOSPLIT,$0
 	MOVW	new+0(FP), R0
 	MOVW	old+4(FP), R1