runtime: scheduler, cgo reorganization

* Change use of m->g0 stack (aka scheduler stack).
* Provide runtime.mcall(f) to invoke f() on m->g0 stack.
* Replace scheduler loop entry with runtime.mcall(schedule).

Runtime.mcall eliminates the need for fake scheduler states that
exist just to run a bit of code on the m->g0 stack
(Grecovery, Gstackalloc).

The elimination of the scheduler as a loop that stops and
starts using gosave and gogo fixes a bad interaction with the
way cgo uses the m->g0 stack.  Cgo runs external (gcc-compiled)
C functions on that stack, and then when calling back into Go,
it sets m->g0->sched.sp below the added call frames, so that
other uses of m->g0's stack will not interfere with those frames.
Unfortunately, gogo (longjmp) back to the scheduler loop at
this point would end up running scheduler with the lower
sp, which no longer points at a valid stack frame for
a call to scheduler.  If scheduler then wrote any function call
arguments or local variables to where it expected the stack
frame to be, it would overwrite other data on the stack.
I realized this possibility while debugging a problem with
calling complex Go code in a Go -> C -> Go cgo callback.
This wasn't the bug I was looking for, it turns out, but I believe
it is a real bug nonetheless.  Switching to runtime.mcall, which
only adds new frames to the stack and never jumps into
functions running in existing ones, fixes this bug.

* Move cgo-related code out of proc.c into cgocall.c.
* Add very large comment describing cgo call sequences.
* Simplify, regularize cgo function implementations and names.
* Add test suite as misc/cgo/test.

Now the Go -> C path calls cgocall, which calls asmcgocall,
and the C -> Go path calls cgocallback, which calls cgocallbackg.

The shuffling, which affects mainly the callback case, moves
most of the callback implementation to cgocallback running
on the m->curg stack (not the m->g0 scheduler stack) and
only while accounted for with $GOMAXPROCS (between calls
to exitsyscall and entersyscall).

The previous callback code did not block in startcgocallback's
approximation to exitsyscall, so if, say, the garbage collector
were running, it would still barge in and start doing things
like calling malloc.  Similarly, endcgocallback's approximation of
entersyscall did not call matchmg to kick off new OS threads
when necessary, which caused the bug in issue 1560.

Fixes #1560.

R=iant
CC=golang-dev
https://golang.org/cl/4253054
diff --git a/src/pkg/runtime/arm/asm.s b/src/pkg/runtime/arm/asm.s
index f9fe7e6..4d36606 100644
--- a/src/pkg/runtime/arm/asm.s
+++ b/src/pkg/runtime/arm/asm.s
@@ -93,14 +93,13 @@
  *  go-routine
  */
 
-// uintptr gosave(Gobuf*)
+// void gosave(Gobuf*)
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB), 7, $-4
 	MOVW	0(FP), R0		// gobuf
 	MOVW	SP, gobuf_sp(R0)
 	MOVW	LR, gobuf_pc(R0)
 	MOVW	g, gobuf_g(R0)
-	MOVW	$0, R0			// return 0
 	RET
 
 // void gogo(Gobuf*, uintptr)
@@ -127,6 +126,30 @@
 	MOVW	gobuf_pc(R0), LR
 	MOVW	R1, PC
 
+// void mcall(void (*fn)(G*))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->sched)
+// to keep running g.
+TEXT runtime·mcall(SB), 7, $-4
+	MOVW	fn+0(FP), R0
+
+	// Save caller state in g->sched.
+	MOVW	SP, (g_sched+gobuf_sp)(g)
+	MOVW	LR, (g_sched+gobuf_pc)(g)
+	MOVW	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOVW	g, R1
+	MOVW	m_g0(m), g
+	CMP	g, R1
+	BL.EQ	runtime·badmcall(SB)
+	MOVW	(g_sched+gobuf_sp)(g), SP
+	SUB	$8, SP
+	MOVW	R1, 4(SP)
+	BL	(R0)
+	BL	runtime·badmcall2(SB)
+	RET
+
 /*
  * support for morestack
  */
@@ -159,9 +182,9 @@
 	// Set m->morepc to f's PC.
 	MOVW	LR, m_morepc(m)
 
-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·newstack(SB)
 
 // Called from reflection library.  Mimics morestack,
@@ -192,9 +215,9 @@
 	MOVW	$1, R3
 	MOVW	R3, m_moreframesize(m)		// f's frame size
 
-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·newstack(SB)
 
 // Return point when leaving stack.
@@ -203,9 +226,9 @@
 	// Save return value in m->cret
 	MOVW	R0, m_cret(m)
 
-	// Call oldstack on m's scheduling stack.
+	// Call oldstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·oldstack(SB)
 
 // void jmpdefer(fn, sp);
@@ -221,6 +244,12 @@
 	MOVW	$-4(SP), SP	// SP is 4 below argp, due to saved LR
 	B		(R0)
 
+TEXT	runtime·asmcgocall(SB),7,$0
+	B	runtime·cgounimpl(SB)
+
+TEXT	runtime·cgocallback(SB),7,$0
+	B	runtime·cgounimpl(SB)
+
 TEXT runtime·memclr(SB),7,$20
 	MOVW	0(FP), R0
 	MOVW	$0, R1		// c = 0
@@ -248,22 +277,6 @@
 	MOVW	$-4(R0), R0
 	RET
 
-// runcgo(void(*fn)(void*), void *arg)
-// Just call fn(arg), but first align the stack
-// appropriately for the gcc ABI.
-// TODO(kaib): figure out the arm-gcc ABI
-TEXT runtime·runcgo(SB),7,$16
-	BL	runtime·abort(SB)
-//	MOVL	fn+0(FP), AX
-//	MOVL	arg+4(FP), BX
-//	MOVL	SP, CX
-//	ANDL	$~15, SP	// alignment for gcc ABI
-//	MOVL	CX, 4(SP)
-//	MOVL	BX, 0(SP)
-//	CALL	AX
-//	MOVL	4(SP), SP
-//	RET
-
 TEXT runtime·emptyfunc(SB),0,$0
 	RET
 
@@ -271,10 +284,6 @@
 	MOVW	$0, R0
 	MOVW	(R0), R1
 
-TEXT runtime·runcgocallback(SB),7,$0
-	MOVW	$0, R0
-	MOVW	(R0), R1
-
 // bool armcas(int32 *val, int32 old, int32 new)
 // Atomically:
 //	if(*val == old){