segmented stacks

SVN=125267
diff --git a/src/cmd/6l/l.h b/src/cmd/6l/l.h
index 20bac85..8966ed2 100644
--- a/src/cmd/6l/l.h
+++ b/src/cmd/6l/l.h
@@ -346,8 +346,7 @@
 EXTERN	Prog	undefp;
 EXTERN	ulong	stroffset;
 EXTERN	vlong	textstksiz;
-EXTERN	vlong	textinarg;
-EXTERN	vlong	textoutarg;
+EXTERN	vlong	textarg;
 
 #define	UP	(&undefp)
 
diff --git a/src/cmd/6l/list.c b/src/cmd/6l/list.c
index 9832156..789e7ce 100644
--- a/src/cmd/6l/list.c
+++ b/src/cmd/6l/list.c
@@ -106,11 +106,11 @@
 			goto brk;
 		}
 		parsetextconst(a->offset);
-		if(textinarg == 0 && textoutarg == 0) {
+		if(textarg == 0) {
 			sprint(str, "$%lld", textstksiz);
 			goto brk;
 		}
-		sprint(str, "$%lld-%lld-%lld", textstksiz, textinarg, textoutarg);
+		sprint(str, "$%lld-%lld", textstksiz, textarg);
 		goto brk;
 	}
 
@@ -422,18 +422,15 @@
 	if(textstksiz & 0x80000000LL)
 		textstksiz = -(-textstksiz & 0xffffffffLL);
 		
-
-	// the following throws away one bit
-	// of precision, but maintains compat
-	textinarg = (arg >> 32) & 0xffffLL;
-	if(textinarg & 0x8000LL)
-		textinarg = -(-textinarg & 0xffffLL);
-	if(textinarg <= 0)
-		textinarg = 100;
-
-	textoutarg = (arg >> 48) & 0xffffLL;
-	if(textoutarg & 0x8000LL)
-		textoutarg = -(-textoutarg & 0xffffLL);
-	if(textoutarg <= 0)
-		textoutarg = 0;
+	textarg = (arg >> 32) & 0xffffffffLL;
+	if(textarg & 0x80000000LL)
+		textarg = 0;
+	if(textarg <= 0)
+		textarg = 100;
+	if(textarg > textstksiz) {
+		textarg = textstksiz;
+		if(textarg <= 0)
+			textarg = 0;
+	}
+	textarg = (textarg+7) & ~7LL;
 }
diff --git a/src/cmd/6l/pass.c b/src/cmd/6l/pass.c
index 468318a..d593908 100644
--- a/src/cmd/6l/pass.c
+++ b/src/cmd/6l/pass.c
@@ -668,7 +668,7 @@
 			q = P;
 			if(pmorestack != P)
 			if(!(p->from.scale & NOSPLIT)) {
-				if(autoffset <= 50) {
+				if(autoffset <= 75) {
 					// small stack
 					p = appendp(p);
 					p->as = ACMPQ;
@@ -678,14 +678,9 @@
 				} else {
 					// large stack
 					p = appendp(p);
-					p->as = AMOVQ;
-					p->from.type = D_SP;
-					p->to.type = D_AX;
-
-					p = appendp(p);
-					p->as = ASUBQ;
-					p->from.type = D_CONST;
-					p->from.offset = autoffset-50;
+					p->as = ALEAQ;
+					p->from.type = D_INDIR+D_SP;
+					p->from.offset = -(autoffset-75);
 					p->to.type = D_AX;
 
 					p = appendp(p);
@@ -693,6 +688,7 @@
 					p->from.type = D_AX;
 					p->to.type = D_INDIR+D_R15;
 				}
+
 				// common
 				p = appendp(p);
 				p->as = AJHI;
@@ -703,9 +699,14 @@
 				p = appendp(p);
 				p->as = AMOVQ;
 				p->from.type = D_CONST;
-				p->from.offset = curtext->to.offset;
+				p->from.offset = 0;
 				p->to.type = D_AX;
 
+				/* 160 comes from 3 calls (3*8) 4 safes (4*8) and 104 guard */
+				if(autoffset+160 > 4096)
+					p->from.offset = (autoffset+160) & ~7LL;
+				p->from.offset |= textarg<<32;
+
 				p = appendp(p);
 				p->as = ACALL;
 				p->to.type = D_BRANCH;
diff --git a/src/runtime/rt0_amd64_darwin.s b/src/runtime/rt0_amd64_darwin.s
index 2219489..8f2aed6 100644
--- a/src/runtime/rt0_amd64_darwin.s
+++ b/src/runtime/rt0_amd64_darwin.s
@@ -5,30 +5,40 @@
 
 TEXT	_rt0_amd64_darwin(SB),7,$-8
 
-// copy arguments forward on an even stack
-
+	// copy arguments forward on an even stack
 
 	MOVQ	0(SP), AX		// argc
 	LEAQ	8(SP), BX		// argv
+	SUBQ	$(4*8+7), SP		// 2args 2auto
 	ANDQ	$~7, SP
-	SUBQ	$32, SP
 	MOVQ	AX, 16(SP)
 	MOVQ	BX, 24(SP)
 
-// allocate the per-user block
+	// allocate the per-user block
 
 	LEAQ	peruser<>(SB), R15	// dedicated u. register
-	MOVQ	SP, AX
-	SUBQ	$4096, AX
-	MOVQ	AX, 0(R15)
+
+	LEAQ	(-4096+104+4*8)(SP), AX
+	MOVQ	AX, 0(R15)		// 0(R15) is stack limit (w 104b guard)
+
+	MOVL	$1024, AX
+	MOVL	AX, 0(SP)
+	CALL	mal(SB)
+
+	LEAQ	104(AX), BX
+	MOVQ	BX, 16(R15)		// 16(R15) is limit of istack (w 104b guard)
+
+	ADDQ	0(SP), AX
+	LEAQ	(-4*8)(AX), BX
+	MOVQ	BX, 24(R15)		// 24(R15) is base of istack (w auto*4)
 
 	CALL	check(SB)
 
-// process the arguments
+	// process the arguments
 
-	MOVL	16(SP), AX
+	MOVL	16(SP), AX		// copy argc
 	MOVL	AX, 0(SP)
-	MOVQ	24(SP), AX
+	MOVQ	24(SP), AX		// copy argv
 	MOVQ	AX, 8(SP)
 	CALL	args(SB)
 
@@ -38,15 +48,131 @@
 	MOVQ	AX, 0(SP)		// exit status
 	CALL	sys·exit(SB)
 
-	CALL	notok(SB)
-
-	ADDQ	$32, SP
+	CALL	notok(SB)		// fault
 	RET
 
+//
+// the calling sequence for a routine that
+// needs N bytes stack, A args.
+//
+//	N1 = (N+160 > 4096)? N+160: 0
+//	A1 = A
+//
+// if N <= 75
+//	CMPQ	SP, 0(R15)
+//	JHI	3(PC)
+//	MOVQ	$(N1<<0) | (A1<<32)), AX
+//	CALL	_morestack
+//
+// if N > 75
+//	LEAQ	(-N-75)(SP), AX
+//	CMPQ	AX, 0(R15)
+//	JHI	3(PC)
+//	MOVQ	$(N1<<0) | (A1<<32)), AX
+//	CALL	_morestack
+//
+
 TEXT	_morestack(SB), 7, $0
-	MOVQ	SP, AX
-	SUBQ	$1024, AX
+	// save stuff on interrupt stack
+
+	MOVQ	24(R15), BX		// istack
+	MOVQ	SP, 8(BX)		// old SP
+	MOVQ	AX, 16(BX)		// magic number
+	MOVQ	0(R15), AX		// old limit
+	MOVQ	AX, 24(BX)
+
+	// switch and set up new limit
+
+	MOVQ	BX, SP
+	MOVQ	16(R15), AX		// istack limit
 	MOVQ	AX, 0(R15)
+
+	// allocate a new stack max of request and 4k
+
+	MOVL	16(SP), AX		// magic number
+	CMPL	AX, $4096
+	JHI	2(PC)
+	MOVL	$4096, AX
+	MOVL	AX, 0(SP)
+	CALL	mal(SB)
+
+	// switch to new stack
+
+	MOVQ	SP, BX			// istack
+	ADDQ	$104, AX		// new stack limit
+	MOVQ	AX, 0(R15)
+	ADDQ	0(SP), AX
+	LEAQ	(-104-4*8)(AX), SP	// new SP
+	MOVQ	8(R15), AX
+	MOVQ	AX, 0(SP)		// old base
+	MOVQ	SP, 8(R15)		// new base
+
+	// copy needed stuff from istack to new stack
+
+	MOVQ	16(BX), AX		// magic number
+	MOVQ	AX, 16(SP)
+	MOVQ	24(BX), AX		// old limit
+	MOVQ	AX, 24(SP)
+	MOVQ	8(BX), AX		// old SP
+	MOVQ	AX, 8(SP)
+
+// are there parameters
+
+	MOVL	20(SP), CX		// copy count
+	CMPL	CX, $0
+	JEQ	easy
+
+// copy in
+
+	LEAQ	16(AX), SI
+	SUBQ	CX, SP
+	MOVQ	SP, DI
+	SHRL	$3, CX
+	CLD
+	REP
+	MOVSQ
+
+	// call the intended
+	CALL	0(AX)
+
+// copy out
+
+	MOVQ	SP, SI
+	MOVQ	8(R15), BX		// new base
+	MOVQ	8(BX), AX		// old SP
+	LEAQ	16(AX), DI
+	MOVL	20(BX), CX		// copy count
+	SHRL	$3, CX
+	CLD
+	REP
+	MOVSQ
+
+	// restore old SP and limit
+	MOVQ	8(R15), SP		// new base
+	MOVQ	24(SP), AX		// old limit
+	MOVQ	AX, 0(R15)
+	MOVQ	0(SP), AX
+	MOVQ	AX, 8(R15)		// old base
+	MOVQ	8(SP), AX		// old SP
+	MOVQ	AX, SP
+
+	// and return to the call behind mine
+	ADDQ	$8, SP
+	RET
+
+easy:
+	CALL	0(AX)
+
+	// restore old SP and limit
+	MOVQ	24(SP), AX		// old limit
+	MOVQ	AX, 0(R15)
+	MOVQ	0(SP), AX
+	MOVQ	AX, 8(R15)		// old base
+	MOVQ	8(SP), AX		// old SP
+	MOVQ	AX, SP
+
+	// and return to the call behind mine
+	ADDQ	$8, SP
 	RET
 
 TEXT	FLUSH(SB),7,$-8
diff --git a/src/runtime/rt0_amd64_linux.s b/src/runtime/rt0_amd64_linux.s
index 1dd77e4..fdda7e1 100644
--- a/src/runtime/rt0_amd64_linux.s
+++ b/src/runtime/rt0_amd64_linux.s
@@ -5,30 +5,40 @@
 
 TEXT	_rt0_amd64_linux(SB),7,$-8
 
-// copy arguments forward on an even stack
-
+	// copy arguments forward on an even stack
 
 	MOVQ	0(SP), AX		// argc
 	LEAQ	8(SP), BX		// argv
+	SUBQ	$(4*8+7), SP		// 2args 2auto
 	ANDQ	$~7, SP
-	SUBQ	$32, SP
 	MOVQ	AX, 16(SP)
 	MOVQ	BX, 24(SP)
 
-// allocate the per-user block
+	// allocate the per-user block
 
 	LEAQ	peruser<>(SB), R15	// dedicated u. register
-	MOVQ	SP, AX
-	SUBQ	$4096, AX
-	MOVQ	AX, 0(R15)
+
+	LEAQ	(-4096+104+4*8)(SP), AX
+	MOVQ	AX, 0(R15)		// 0(R15) is stack limit (w 104b guard)
+
+	MOVL	$1024, AX
+	MOVL	AX, 0(SP)
+	CALL	mal(SB)
+
+	LEAQ	104(AX), BX
+	MOVQ	BX, 16(R15)		// 16(R15) is limit of istack (w 104b guard)
+
+	ADDQ	0(SP), AX
+	LEAQ	(-4*8)(AX), BX
+	MOVQ	BX, 24(R15)		// 24(R15) is base of istack (w auto*4)
 
 	CALL	check(SB)
 
-// process the arguments
+	// process the arguments
 
-	MOVL	16(SP), AX
+	MOVL	16(SP), AX		// copy argc
 	MOVL	AX, 0(SP)
-	MOVQ	24(SP), AX
+	MOVQ	24(SP), AX		// copy argv
 	MOVQ	AX, 8(SP)
 	CALL	args(SB)
 
@@ -38,15 +48,131 @@
 	MOVQ	AX, 0(SP)		// exit status
 	CALL	sys·exit(SB)
 
-	CALL	notok(SB)
-
-	ADDQ	$32, SP
+	CALL	notok(SB)		// fault
 	RET
 
+//
+// the calling sequence for a routine that
+// needs N bytes stack, A args.
+//
+//	N1 = (N+160 > 4096)? N+160: 0
+//	A1 = A
+//
+// if N <= 75
+//	CMPQ	SP, 0(R15)
+//	JHI	3(PC)
+//	MOVQ	$(N1<<0) | (A1<<32)), AX
+//	CALL	_morestack
+//
+// if N > 75
+//	LEAQ	(-N-75)(SP), AX
+//	CMPQ	AX, 0(R15)
+//	JHI	3(PC)
+//	MOVQ	$(N1<<0) | (A1<<32)), AX
+//	CALL	_morestack
+//
+
 TEXT	_morestack(SB), 7, $0
-	MOVQ	SP, AX
-	SUBQ	$1024, AX
+	// save stuff on interrupt stack
+
+	MOVQ	24(R15), BX		// istack
+	MOVQ	SP, 8(BX)		// old SP
+	MOVQ	AX, 16(BX)		// magic number
+	MOVQ	0(R15), AX		// old limit
+	MOVQ	AX, 24(BX)
+
+	// switch and set up new limit
+
+	MOVQ	BX, SP
+	MOVQ	16(R15), AX		// istack limit
 	MOVQ	AX, 0(R15)
+
+	// allocate a new stack max of request and 4k
+
+	MOVL	16(SP), AX		// magic number
+	CMPL	AX, $4096
+	JHI	2(PC)
+	MOVL	$4096, AX
+	MOVL	AX, 0(SP)
+	CALL	mal(SB)
+
+	// switch to new stack
+
+	MOVQ	SP, BX			// istack
+	ADDQ	$104, AX		// new stack limit
+	MOVQ	AX, 0(R15)
+	ADDQ	0(SP), AX
+	LEAQ	(-104-4*8)(AX), SP	// new SP
+	MOVQ	8(R15), AX
+	MOVQ	AX, 0(SP)		// old base
+	MOVQ	SP, 8(R15)		// new base
+
+	// copy needed stuff from istack to new stack
+
+	MOVQ	16(BX), AX		// magic number
+	MOVQ	AX, 16(SP)
+	MOVQ	24(BX), AX		// old limit
+	MOVQ	AX, 24(SP)
+	MOVQ	8(BX), AX		// old SP
+	MOVQ	AX, 8(SP)
+
+// are there parameters
+
+	MOVL	20(SP), CX		// copy count
+	CMPL	CX, $0
+	JEQ	easy
+
+// copy in
+
+	LEAQ	16(AX), SI
+	SUBQ	CX, SP
+	MOVQ	SP, DI
+	SHRL	$3, CX
+	CLD
+	REP
+	MOVSQ
+
+	// call the intended
+	CALL	0(AX)
+
+// copy out
+
+	MOVQ	SP, SI
+	MOVQ	8(R15), BX		// new base
+	MOVQ	8(BX), AX		// old SP
+	LEAQ	16(AX), DI
+	MOVL	20(BX), CX		// copy count
+	SHRL	$3, CX
+	CLD
+	REP
+	MOVSQ
+
+	// restore old SP and limit
+	MOVQ	8(R15), SP		// new base
+	MOVQ	24(SP), AX		// old limit
+	MOVQ	AX, 0(R15)
+	MOVQ	0(SP), AX
+	MOVQ	AX, 8(R15)		// old base
+	MOVQ	8(SP), AX		// old SP
+	MOVQ	AX, SP
+
+	// and return to the call behind mine
+	ADDQ	$8, SP
+	RET
+
+easy:
+	CALL	0(AX)
+
+	// restore old SP and limit
+	MOVQ	24(SP), AX		// old limit
+	MOVQ	AX, 0(R15)
+	MOVQ	0(SP), AX
+	MOVQ	AX, 8(R15)		// old base
+	MOVQ	8(SP), AX		// old SP
+	MOVQ	AX, SP
+
+	// and return to the call behind mine
+	ADDQ	$8, SP
 	RET
 
 TEXT	FLUSH(SB),7,$-8
@@ -145,7 +271,7 @@
 
 TEXT	sys·memclr(SB),1,$-8
 	MOVQ	8(SP), DI		// arg 1 addr
-	MOVL	16(SP), CX		// arg 2 count
+	MOVL	16(SP), CX		// arg 2 count (cannot be zero)
 	ADDL	$7, CX
 	SHRL	$3, CX
 	MOVQ	$0, AX
diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c
index 5ff3f85..52ffba7 100644
--- a/src/runtime/runtime.c
+++ b/src/runtime/runtime.c
@@ -183,7 +183,7 @@
 {
 	byte* v;
 
-	v = sys·mmap(nil, NHUNK, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0);
+	v = sys·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0);
 	sys·memclr(v, n);
 	nmmap += n;
 	return v;
@@ -194,10 +194,8 @@
 {
 	byte* v;
 
-	// round to keep everything 64-bit alligned
-	while(n & 7)
-		n++;
-
+	// round to keep everything 64-bit aligned
+	n = (n+7) & ~7;
 	nmal += n;
 
 	// do we have enough in contiguous hunk