ld: detect stack overflow due to NOSPLIT

Fix problems found.

On amd64, various library routines had bigger
stack frames than expected, because large function
calls had been added.

runtime.assertI2T: nosplit stack overflow
        120	assumed on entry to runtime.assertI2T
        8	after runtime.assertI2T uses 112
        0	on entry to runtime.newTypeAssertionError
        -8	on entry to runtime.morestack01

runtime.assertE2E: nosplit stack overflow
        120	assumed on entry to runtime.assertE2E
        16	after runtime.assertE2E uses 104
        8	on entry to runtime.panic
        0	on entry to runtime.morestack16
        -8	after runtime.morestack16 uses 8

runtime.assertE2T: nosplit stack overflow
        120	assumed on entry to runtime.assertE2T
        16	after runtime.assertE2T uses 104
        8	on entry to runtime.panic
        0	on entry to runtime.morestack16
        -8	after runtime.morestack16 uses 8

runtime.newselect: nosplit stack overflow
        120	assumed on entry to runtime.newselect
        56	after runtime.newselect uses 64
        48	on entry to runtime.printf
        8	after runtime.printf uses 40
        0	on entry to vprintf
        -8	on entry to runtime.morestack16

runtime.selectdefault: nosplit stack overflow
        120	assumed on entry to runtime.selectdefault
        56	after runtime.selectdefault uses 64
        48	on entry to runtime.printf
        8	after runtime.printf uses 40
        0	on entry to vprintf
        -8	on entry to runtime.morestack16

runtime.selectgo: nosplit stack overflow
        120	assumed on entry to runtime.selectgo
        0	after runtime.selectgo uses 120
        -8	on entry to runtime.gosched

On arm, 5c was tagging functions NOSPLIT that should
not have been, like the recursive function printpanics:

printpanics: nosplit stack overflow
        124	assumed on entry to printpanics
        112	after printpanics uses 12
        108	on entry to printpanics
        96	after printpanics uses 12
        92	on entry to printpanics
        80	after printpanics uses 12
        76	on entry to printpanics
        64	after printpanics uses 12
        60	on entry to printpanics
        48	after printpanics uses 12
        44	on entry to printpanics
        32	after printpanics uses 12
        28	on entry to printpanics
        16	after printpanics uses 12
        12	on entry to printpanics
        0	after printpanics uses 12
        -4	on entry to printpanics

R=r, r2
CC=golang-dev
https://golang.org/cl/4188061
diff --git a/src/pkg/runtime/arm/asm.s b/src/pkg/runtime/arm/asm.s
index a4e4b32..93c4d4c 100644
--- a/src/pkg/runtime/arm/asm.s
+++ b/src/pkg/runtime/arm/asm.s
@@ -12,10 +12,10 @@
 	// use R13 instead of SP to avoid linker rewriting the offsets
 	MOVW	0(R13), R0		// argc
 	MOVW	$4(R13), R1		// argv
-	SUB	$128, R13		// plenty of scratch
+	SUB	$64, R13		// plenty of scratch
 	AND	$~7, R13
-	MOVW	R0, 120(R13)		// save argc, argv away
-	MOVW	R1, 124(R13)
+	MOVW	R0, 60(R13)		// save argc, argv away
+	MOVW	R1, 64(R13)
 
 	// set up m and g registers
 	// g is R10, m is R9
@@ -34,9 +34,9 @@
 	BL	runtime·check(SB)
 
 	// saved argc, argv
-	MOVW	120(R13), R0
+	MOVW	60(R13), R0
 	MOVW	R0, 4(R13)
-	MOVW	124(R13), R1
+	MOVW	64(R13), R1
 	MOVW	R1, 8(R13)
 	BL	runtime·args(SB)
 	BL	runtime·osinit(SB)
diff --git a/src/pkg/runtime/cgocall.c b/src/pkg/runtime/cgocall.c
index 74e5a30..741e8f0 100644
--- a/src/pkg/runtime/cgocall.c
+++ b/src/pkg/runtime/cgocall.c
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "runtime.h"
+#include "stack.h"
 #include "cgocall.h"
 
 void *initcgo;	/* filled in by dynamic linker when Cgo is available */
@@ -70,7 +71,7 @@
 	runtime·startcgocallback(g1);
 
 	sp = g1->sched.sp - argsize;
-	if(sp < g1->stackguard - StackGuard + 8) // +8 for return address
+	if(sp < g1->stackguard - StackGuard - StackSystem + 8) // +8 for return address
 		runtime·throw("g stack overflow in cgocallback");
 	runtime·mcpy(sp, arg, argsize);
 
diff --git a/src/pkg/runtime/chan.c b/src/pkg/runtime/chan.c
index 8d3ac2c..28c7d73 100644
--- a/src/pkg/runtime/chan.c
+++ b/src/pkg/runtime/chan.c
@@ -495,17 +495,27 @@
 	runtime·chanrecv(c, v, &ok, nil);
 }	
 
+static void newselect(int32, Select**);
+
 // newselect(size uint32) (sel *byte);
 #pragma textflag 7
 void
 runtime·newselect(int32 size, ...)
 {
-	int32 n, o;
+	int32 o;
 	Select **selp;
-	Select *sel;
 
 	o = runtime·rnd(sizeof(size), Structrnd);
 	selp = (Select**)((byte*)&size + o);
+	newselect(size, selp);
+}
+
+static void
+newselect(int32 size, Select **selp)
+{
+	int32 n;
+	Select *sel;
+
 	n = 0;
 	if(size > 1)
 		n = size-1;
@@ -589,21 +599,31 @@
 }
 
 
-// selectdefaul(sel *byte) (selected bool);
+static void selectdefault(Select**);
+
+// selectdefault(sel *byte) (selected bool);
 #pragma textflag 7
 void
 runtime·selectdefault(Select *sel, ...)
 {
+	selectdefault(&sel);
+}
+
+static void
+selectdefault(Select **selp)
+{
+	Select *sel;
 	int32 i;
 	Scase *cas;
 
+	sel = *selp;
 	i = sel->ncase;
 	if(i >= sel->tcase)
 		runtime·throw("selectdefault: too many cases");
 	sel->ncase = i+1;
 	cas = runtime·mal(sizeof *cas);
 	sel->scase[i] = cas;
-	cas->pc = runtime·getcallerpc(&sel);
+	cas->pc = runtime·getcallerpc(selp);
 	cas->chan = nil;
 
 	cas->so = runtime·rnd(sizeof(sel), Structrnd);
@@ -662,16 +682,23 @@
 	runtime·gosched();
 }
 
+static void selectgo(Select**);
+
 // selectgo(sel *byte);
 //
 // overwrites return pc on stack to signal which case of the select
 // to run, so cannot appear at the top of a split stack.
-// frame has 6 pointers and 4 int32 so 64 bytes max.
-// that's less than StackGuard-StackSmall, so okay.
 #pragma textflag 7
 void
 runtime·selectgo(Select *sel)
 {
+	selectgo(&sel);
+}
+
+static void
+selectgo(Select **selp)
+{
+	Select *sel;
 	uint32 o, i, j;
 	Scase *cas, *dfl;
 	Hchan *c;
@@ -679,6 +706,7 @@
 	G *gp;
 	byte *as;
 
+	sel = *selp;
 	if(runtime·gcwaiting)
 		runtime·gosched();
 
@@ -889,8 +917,8 @@
 	selunlock(sel);
 
 	// return to pc corresponding to chosen case
-	runtime·setcallerpc(&sel, cas->pc);
-	as = (byte*)&sel + cas->so;
+	runtime·setcallerpc(selp, cas->pc);
+	as = (byte*)selp + cas->so;
 	freesel(sel);
 	*as = true;
 	return;
diff --git a/src/pkg/runtime/iface.c b/src/pkg/runtime/iface.c
index 3dec45e..698aead 100644
--- a/src/pkg/runtime/iface.c
+++ b/src/pkg/runtime/iface.c
@@ -209,16 +209,25 @@
 	copyin(t, elem, &ret->data);
 }
 
+static void assertI2Tret(Type *t, Iface i, byte *ret);
+
 // func ifaceI2T(typ *byte, iface any) (ret any)
 #pragma textflag 7
 void
 runtime·assertI2T(Type *t, Iface i, ...)
 {
-	Itab *tab;
 	byte *ret;
-	Eface err;
 
 	ret = (byte*)(&i+1);
+	assertI2Tret(t, i, ret);
+}
+
+static void
+assertI2Tret(Type *t, Iface i, byte *ret)
+{
+	Itab *tab;
+	Eface err;
+
 	tab = i.tab;
 	if(tab == nil) {
 		runtime·newTypeAssertionError(nil, nil, t,
@@ -258,15 +267,23 @@
 	copyout(t, &i.data, ret);
 }
 
+static void assertE2Tret(Type *t, Eface e, byte *ret);
+
 // func ifaceE2T(typ *byte, iface any) (ret any)
 #pragma textflag 7
 void
 runtime·assertE2T(Type *t, Eface e, ...)
 {
 	byte *ret;
-	Eface err;
 
 	ret = (byte*)(&e+1);
+	assertE2Tret(t, e, ret);
+}
+
+static void
+assertE2Tret(Type *t, Eface e, byte *ret)
+{
+	Eface err;
 
 	if(e.type == nil) {
 		runtime·newTypeAssertionError(nil, nil, t,
@@ -307,7 +324,6 @@
 }
 
 // func convI2E(elem any) (ret any)
-#pragma textflag 7
 void
 runtime·convI2E(Iface i, Eface ret)
 {
@@ -322,7 +338,6 @@
 }
 
 // func ifaceI2E(typ *byte, iface any) (ret any)
-#pragma textflag 7
 void
 runtime·assertI2E(InterfaceType* inter, Iface i, Eface ret)
 {
@@ -343,7 +358,6 @@
 }
 
 // func ifaceI2E2(typ *byte, iface any) (ret any, ok bool)
-#pragma textflag 7
 void
 runtime·assertI2E2(InterfaceType* inter, Iface i, Eface ret, bool ok)
 {
@@ -364,7 +378,6 @@
 }
 
 // func convI2I(typ *byte, elem any) (ret any)
-#pragma textflag 7
 void
 runtime·convI2I(InterfaceType* inter, Iface i, Iface ret)
 {
@@ -399,7 +412,6 @@
 }
 
 // func ifaceI2I(sigi *byte, iface any) (ret any)
-#pragma textflag 7
 void
 runtime·assertI2I(InterfaceType* inter, Iface i, Iface ret)
 {
@@ -407,7 +419,6 @@
 }
 
 // func ifaceI2I2(sigi *byte, iface any) (ret any, ok bool)
-#pragma textflag 7
 void
 runtime·assertI2I2(InterfaceType *inter, Iface i, Iface ret, bool ok)
 {
@@ -446,7 +457,6 @@
 }
 
 // func ifaceE2I(sigi *byte, iface any) (ret any)
-#pragma textflag 7
 void
 runtime·assertE2I(InterfaceType* inter, Eface e, Iface ret)
 {
@@ -454,7 +464,6 @@
 }
 
 // ifaceE2I2(sigi *byte, iface any) (ret any, ok bool)
-#pragma textflag 7
 void
 runtime·assertE2I2(InterfaceType *inter, Eface e, Iface ret, bool ok)
 {
@@ -474,7 +483,6 @@
 }
 
 // func ifaceE2E(typ *byte, iface any) (ret any)
-#pragma textflag 7
 void
 runtime·assertE2E(InterfaceType* inter, Eface e, Eface ret)
 {
@@ -494,7 +502,6 @@
 }
 
 // func ifaceE2E2(iface any) (ret any, ok bool)
-#pragma textflag 7
 void
 runtime·assertE2E2(InterfaceType* inter, Eface e, Eface ret, bool ok)
 {
diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index 745e18c..abbf63b 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -8,6 +8,7 @@
 
 package runtime
 #include "runtime.h"
+#include "stack.h"
 #include "malloc.h"
 #include "defs.h"
 #include "type.h"
@@ -385,7 +386,7 @@
 } stacks;
 
 enum {
-	FixedStack = StackBig + StackExtra
+	FixedStack = StackMin,
 };
 
 void*
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 84cd517..1bbca63 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -7,6 +7,7 @@
 #include "defs.h"
 #include "malloc.h"
 #include "os.h"
+#include "stack.h"
 
 bool	runtime·iscgo;
 
@@ -701,7 +702,7 @@
 	goid = old.gobuf.g->goid;	// fault if g is bad, before gogo
 
 	if(old.free != 0)
-		runtime·stackfree(g1->stackguard - StackGuard, old.free);
+		runtime·stackfree(g1->stackguard - StackGuard - StackSystem, old.free);
 	g1->stackbase = old.stackbase;
 	g1->stackguard = old.stackguard;
 
@@ -739,14 +740,15 @@
 		// the new Stktop* is necessary to unwind, but
 		// we don't need to create a new segment.
 		top = (Stktop*)(m->morebuf.sp - sizeof(*top));
-		stk = g1->stackguard - StackGuard;
+		stk = g1->stackguard - StackGuard - StackSystem;
 		free = 0;
 	} else {
 		// allocate new segment.
 		framesize += argsize;
-		if(framesize < StackBig)
-			framesize = StackBig;
 		framesize += StackExtra;	// room for more functions, Stktop.
+		if(framesize < StackMin)
+			framesize = StackMin;
+		framesize += StackSystem;
 		stk = runtime·stackalloc(framesize);
 		top = (Stktop*)(stk+framesize-sizeof(*top));
 		free = framesize;
@@ -767,7 +769,7 @@
 	g1->ispanic = false;
 
 	g1->stackbase = (byte*)top;
-	g1->stackguard = stk + StackGuard;
+	g1->stackguard = stk + StackGuard + StackSystem;
 
 	sp = (byte*)top;
 	if(argsize > 0) {
@@ -798,10 +800,10 @@
 
 	g = runtime·malloc(sizeof(G));
 	if(stacksize >= 0) {
-		stk = runtime·stackalloc(stacksize + StackGuard);
+		stk = runtime·stackalloc(StackSystem + stacksize);
 		g->stack0 = stk;
-		g->stackguard = stk + StackGuard;
-		g->stackbase = stk + StackGuard + stacksize - sizeof(Stktop);
+		g->stackguard = stk + StackSystem + StackGuard;
+		g->stackbase = stk + StackSystem + stacksize - sizeof(Stktop);
 		runtime·memclr(g->stackbase, sizeof(Stktop));
 	}
 	return g;
@@ -846,10 +848,10 @@
 
 	if((newg = gfget()) != nil){
 		newg->status = Gwaiting;
-		if(newg->stackguard - StackGuard != newg->stack0)
+		if(newg->stackguard - StackGuard - StackSystem != newg->stack0)
 			runtime·throw("invalid stack in newg");
 	} else {
-		newg = runtime·malg(StackBig);
+		newg = runtime·malg(StackMin);
 		newg->status = Gwaiting;
 		newg->alllink = runtime·allg;
 		runtime·allg = newg;
@@ -1099,7 +1101,7 @@
 static void
 gfput(G *g)
 {
-	if(g->stackguard - StackGuard != g->stack0)
+	if(g->stackguard - StackGuard - StackSystem != g->stack0)
 		runtime·throw("invalid stack in gfput");
 	g->schedlink = runtime·sched.gfree;
 	runtime·sched.gfree = g;
diff --git a/src/pkg/runtime/runtime.c b/src/pkg/runtime/runtime.c
index ef2def0..e85bc9d 100644
--- a/src/pkg/runtime/runtime.c
+++ b/src/pkg/runtime/runtime.c
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "runtime.h"
+#include "stack.h"
 
 enum {
 	maxround = sizeof(uintptr),
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index a020100..ac992a2 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -592,83 +592,17 @@
 
 void	runtime·ifaceE2I(struct InterfaceType*, Eface, Iface*);
 
-/*
- * Stack layout parameters.
- * Known to linkers.
- *
- * The per-goroutine g->stackguard is set to point
- * StackGuard bytes above the bottom of the stack.
- * Each function compares its stack pointer against
- * g->stackguard to check for overflow.  To cut one
- * instruction from the check sequence for functions
- * with tiny frames, the stack is allowed to protrude
- * StackSmall bytes below the stack guard.  Functions
- * with large frames don't bother with the check and
- * always call morestack.  The sequences are
- * (for amd64, others are similar):
- *
- * 	guard = g->stackguard
- * 	frame = function's stack frame size
- * 	argsize = size of function arguments (call + return)
- *
- * 	stack frame size <= StackSmall:
- * 		CMPQ guard, SP
- * 		JHI 3(PC)
- * 		MOVQ m->morearg, $(argsize << 32)
- * 		CALL morestack(SB)
- *
- * 	stack frame size > StackSmall but < StackBig
- * 		LEAQ (frame-StackSmall)(SP), R0
- * 		CMPQ guard, R0
- * 		JHI 3(PC)
- * 		MOVQ m->morearg, $(argsize << 32)
- * 		CALL morestack(SB)
- *
- * 	stack frame size >= StackBig:
- * 		MOVQ m->morearg, $((argsize << 32) | frame)
- * 		CALL morestack(SB)
- *
- * The bottom StackGuard - StackSmall bytes are important:
- * there has to be enough room to execute functions that
- * refuse to check for stack overflow, either because they
- * need to be adjacent to the actual caller's frame (deferproc)
- * or because they handle the imminent stack overflow (morestack).
- *
- * For example, deferproc might call malloc, which does one
- * of the above checks (without allocating a full frame),
- * which might trigger a call to morestack.  This sequence
- * needs to fit in the bottom section of the stack.  On amd64,
- * morestack's frame is 40 bytes, and deferproc's frame is 56 bytes.
- * That fits well within the StackGuard - StackSmall = 128 bytes
- * at the bottom.  There may be other sequences lurking or yet to
- * be written that require more stack.  Morestack checks to make
- * sure the stack has not completely overflowed and should catch
- * such sequences.
- */
 enum
 {
+	// StackSystem is a number of additional bytes to add
+	// to each stack below the usual guard area for OS-specific
+	// purposes like signal handling.
+	// TODO(rsc): This is only for Windows.  Can't Windows use
+	// a separate exception stack like every other operating system?
 #ifdef __WINDOWS__
-	// need enough room in guard area for exception handler.
-	// use larger stacks to compensate for larger stack guard.
-	StackSmall = 256,
-	StackGuard = 2048,
-	StackBig   = 8192,
-	StackExtra = StackGuard,
+	StackSystem = 2048,
 #else
-	// byte offset of stack guard (g->stackguard) above bottom of stack.
-	StackGuard = 256,
-
-	// checked frames are allowed to protrude below the guard by
-	// this many bytes.  this saves an instruction in the checking
-	// sequence when the stack frame is tiny.
-	StackSmall = 128,
-
-	// extra space in the frame (beyond the function for which
-	// the frame is allocated) is assumed not to be much bigger
-	// than this amount.  it may not be used efficiently if it is.
-	StackBig = 4096,
-
-	// extra room over frame size when allocating a stack.
-	StackExtra = 1024,
+	StackSystem = 0,
 #endif
 };
+
diff --git a/src/pkg/runtime/stack.h b/src/pkg/runtime/stack.h
new file mode 100644
index 0000000..ebf0462
--- /dev/null
+++ b/src/pkg/runtime/stack.h
@@ -0,0 +1,86 @@
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Stack layout parameters.
+Included by both the runtime (compiled via 6c) and the linkers (compiled via gcc).
+
+The per-goroutine g->stackguard is set to point StackGuard bytes
+above the bottom of the stack.  Each function compares its stack
+pointer against g->stackguard to check for overflow.  To cut one
+instruction from the check sequence for functions with tiny frames,
+the stack is allowed to protrude StackSmall bytes below the stack
+guard.  Functions with large frames don't bother with the check and
+always call morestack.  The sequences are (for amd64, others are
+similar):
+
+	guard = g->stackguard
+	frame = function's stack frame size
+	argsize = size of function arguments (call + return)
+
+	stack frame size <= StackSmall:
+		CMPQ guard, SP
+		JHI 3(PC)
+		MOVQ m->morearg, $(argsize << 32)
+		CALL morestack(SB)
+
+	stack frame size > StackSmall but < StackBig:
+		LEAQ (frame-StackSmall)(SP), R0
+		CMPQ guard, R0
+		JHI 3(PC)
+		MOVQ m->morearg, $(argsize << 32)
+		CALL morestack(SB)
+
+	stack frame size >= StackBig:
+		MOVQ m->morearg, $((argsize << 32) | frame)
+		CALL morestack(SB)
+
+The bottom StackGuard - StackSmall bytes are important: there has
+to be enough room to execute functions that refuse to check for
+stack overflow, either because they need to be adjacent to the
+actual caller's frame (deferproc) or because they handle the imminent
+stack overflow (morestack).
+
+For example, deferproc might call malloc, which does one of the
+above checks (without allocating a full frame), which might trigger
+a call to morestack.  This sequence needs to fit in the bottom
+section of the stack.  On amd64, morestack's frame is 40 bytes, and
+deferproc's frame is 56 bytes.  That fits well within the
+StackGuard - StackSmall = 128 bytes at the bottom.
+The linkers explore all possible call traces involving non-splitting
+functions to make sure that this limit cannot be violated.
+ */
+
+enum {
+	// The amount of extra stack to allocate beyond the size
+	// needed for the single frame that triggered the split.
+	StackExtra = 1024,
+
+	// The minimum stack segment size to allocate.
+	// If the amount needed for the splitting frame + StackExtra
+	// is less than this number, the stack will have this size instead.
+	StackMin = 4096,
+
+	// Functions that need frames bigger than this call morestack
+	// unconditionally.  That is, on entry to a function it is assumed
+	// that the amount of space available in the current stack segment
+	// couldn't possibly be bigger than StackBig.  If stack segments
+	// do run with more space than StackBig, the space may not be
+	// used efficiently.  As a result, StackBig should not be significantly
+	// smaller than StackMin or StackExtra.
+	StackBig = 4096,
+
+	// The stack guard is a pointer this many bytes above the
+	// bottom of the stack.
+	StackGuard = 256,
+
+	// After a stack split check the SP is allowed to be this
+	// many bytes below the stack guard.  This saves an instruction
+	// in the checking sequence for tiny frames.
+	StackSmall = 128,
+
+	// The maximum number of bytes that a chain of NOSPLIT
+	// functions can use.
+	StackLimit = StackGuard - StackSmall,
+};