runtime: call rfork on scheduler stack on Plan 9

A race exists between the parent and child processes after a fork.
The child needs to access the new M pointer passed as an argument
but the parent may have already returned and clobbered it.

Previously, we avoided this by saving the necessary data into
registers before the rfork system call, but this isn't guaranteed
to work because Plan 9 makes no promises about the register state
after a system call. Only the 386 kernel seems to save them.
For amd64 and arm, this method won't work.

We eliminate the race by allocating stack space for the scheduler
goroutines (g0) in the per-process copy-on-write stack segment and
by only calling rfork on the scheduler stack.

LGTM=aram, 0intro, rsc
R=aram, 0intro, mischief, rsc
CC=golang-codereviews
https://golang.org/cl/110680044
diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s
index 1256347..a41b562 100644
--- a/src/runtime/sys_plan9_386.s
+++ b/src/runtime/sys_plan9_386.s
@@ -131,47 +131,38 @@
 	INT	$64
 	MOVL	AX, ret+8(FP)
 	RET
-	
-TEXT runtime·rfork(SB),NOSPLIT,$0
-	MOVL	$19, AX // rfork
-	MOVL	stack+8(SP), CX
-	MOVL	mm+12(SP), BX	// m
-	MOVL	gg+16(SP), DX	// g
-	MOVL	fn+20(SP), SI	// fn
-	INT     $64
 
-	// In parent, return.
-	CMPL	AX, $0
-	JEQ	3(PC)
-	MOVL	AX, ret+20(FP)
+TEXT runtime·rfork(SB),NOSPLIT,$0	// thin wrapper around the rfork system call
+	MOVL	$19, AX	// system call number: rfork
+	INT	$64	// system call trap
+	MOVL	AX, ret+4(FP)	// hand the value left in AX back to the caller
 	RET
 
-	// set SP to be on the new child stack
-	MOVL	CX, SP
+TEXT runtime·tstart_plan9(SB),NOSPLIT,$0	// set up newm's g0 on the current stack, then call mstart
+	MOVL	newm+0(FP), CX	// CX = newm
+	MOVL	m_g0(CX), DX	// DX = newm's g0
 
-	// Initialize m, g.
-	get_tls(AX)
-	MOVL	DX, g(AX)
-	MOVL	BX, g_m(DX)
+	// Lay out the new m's scheduler stack on the os stack: [SP-64KB, SP).
+	MOVL	SP, AX	// the current SP becomes the top of the g0 stack
+	MOVL	AX, (g_stack+stack_hi)(DX)	// g0 stack hi = SP
+	SUBL	$(64*1024), AX		// stack size
+	MOVL	AX, (g_stack+stack_lo)(DX)	// g0 stack lo = SP - 64KB
+	MOVL	AX, g_stackguard0(DX)	// both stack guards start at the low bound
+	MOVL	AX, g_stackguard1(DX)
 
 	// Initialize procid from TOS struct.
 	MOVL	_tos(SB), AX
-	MOVL	48(AX), AX // procid
-	MOVL	AX, m_procid(BX)	// save pid as m->procid
-	
+	MOVL	48(AX), AX	// procid field, at offset 48 from the _tos pointer
+	MOVL	AX, m_procid(CX)	// save pid as m->procid
+
+	// Finally, initialize g: make newm's g0 the current g in TLS.
+	get_tls(BX)
+	MOVL	DX, g(BX)	// g = newm's g0
+
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
-	
-	MOVL	0(DX), DX	// paranoia; check they are not nil
-	MOVL	0(BX), BX
-	
-	// more paranoia; check that stack splitting code works
-	PUSHL	SI
-	CALL	runtime·emptyfunc(SB)
-	POPL	SI
-	
-	CALL	SI	// fn()
-	CALL	runtime·exit(SB)
-	MOVL	AX, ret+20(FP)
+	CALL	runtime·mstart(SB)	// not expected to return (see below)
+
+	MOVL	$0x1234, 0x1234		// not reached; presumably a deliberate store to a bogus address to crash if it is
 	RET
 
 // void sigtramp(void *ureg, int8 *note)