Runtime is now starting up with a dummy C program as target:
- morestack and gosave/gogo/gogocall support
- memclr and memset from inferno
- bugfixes in _rt0_arm

R=rsc
APPROVED=rsc
DELTA=304  (174 added, 36 deleted, 94 changed)
OCL=30636
CL=30642
diff --git a/src/cmd/5l/noop.c b/src/cmd/5l/noop.c
index f4de0a0..19fc567 100644
--- a/src/cmd/5l/noop.c
+++ b/src/cmd/5l/noop.c
@@ -141,8 +141,8 @@
 		}
 	}
 	// TODO(kaib): make lack of morestack an error
-// 	if(pmorestack == P)
-// 		diag("sys·morestack not defined");
+//	if(pmorestack == P)
+//		diag("sys·morestack not defined");
 
 	curframe = 0;
 	curbecome = 0;
@@ -356,27 +356,26 @@
 				p->link = q1;
 			} else if (autosize < StackBig) {
 				// split stack check for small functions
-				// MOVW			(REGG), R1
+				// MOVW			g_stackguard(g), R1
 				// CMP			R1, $-autosize(SP)
-				// MOVW.W.LT	R14,$-autosize(SP)
-				// MOVW.W.GE	R14,$-4(SP)
-				// MOVW.GE		$(args << 24 | autosize), R1
-				// BL.GE		callmorestack(SB)
+				// MOVW.LT		$args, R2
+				// MOVW.LT		R14, R3
+				// BL.LT		sys·morestackx(SB) // modifies LR
+				// MOVW.W		R14,$-autosize(SP)
 
-				// TODO(kaib): double check we allocate autosize after
-				// 				stack has been split
-				// TODO(kaib): add error in case autosize doesn't pack
 				// TODO(kaib): add more trampolines
 				// TODO(kaib): put stackguard in register
 				// TODO(kaib): add support for -K and underflow detection
 
-				p = appendp(p); // load G.stackguard into R1
+				// MOVW			g_stackguard(g), R1
+				p = appendp(p);
 				p->as = AMOVW;
 				p->from.type = D_OREG;
 				p->from.reg = REGG;
 				p->to.type = D_REG;
 				p->to.reg = 1;
 
+				// CMP			R1, $-autosize(SP)
 				p = appendp(p);
 				p->as = ACMP;
 				p->from.type = D_REG;
@@ -384,42 +383,41 @@
 				p->from.offset = -autosize;
 				p->reg = REGSP;
 
+				// MOVW.LT		$args, R2
 				p = appendp(p);
 				p->as = AMOVW;
- 				p->scond = C_SCOND_GE | C_WBIT;
-				p->from.type = D_REG;
-				p->from.reg = REGLINK;
-				p->to.type = D_OREG;
-				p->to.offset = -autosize;
-				p->to.reg = REGSP;
-
-				p = appendp(p);
-				p->as = AMOVW;
-				p->scond = C_SCOND_LT | C_WBIT;
-				p->from.type = D_REG;
-				p->from.reg = REGLINK;
-				p->to.type = D_OREG;
-				p->to.offset = -4;
-				p->to.reg = REGSP;
-
-				p = appendp(p); // packs args and autosize
-				p->as = AMOVW;
 				p->scond = C_SCOND_LT;
 				p->from.type = D_CONST;
-				// top 8 bits are arg count, lower 24 bits number of 4 byte
-				// words
-				p->from.offset =
-					(curtext->to.offset2 & ~7) << 21 |
-					(autosize & ~7) >> 3;
+				p->from.offset = curtext->to.offset2 & ~7;
 				p->to.type = D_REG;
-				p->to.reg = 1;
+				p->to.reg = 2;
 
+				// MOVW.LT		R14, R3
+				p = appendp(p);
+				p->as = AMOVW;
+				p->scond = C_SCOND_LT;
+				p->from.type = D_REG;
+				p->from.reg = REGLINK;
+				p->to.type = D_REG;
+				p->to.reg = 3;
+
+				// BL.LT		sys·morestackx(SB) // modifies LR
 				p = appendp(p);
 				p->as = ABL;
 				p->scond = C_SCOND_LT;
  				p->to.type = D_BRANCH;
 				p->to.sym = symmorestack;
 				p->cond = pmorestack;
+
+				// MOVW.W		R14,$-autosize(SP)
+				p = appendp(p);
+				p->as = AMOVW;
+ 				p->scond |= C_WBIT;
+				p->from.type = D_REG;
+				p->from.reg = REGLINK;
+				p->to.type = D_OREG;
+				p->to.offset = -autosize;
+				p->to.reg = REGSP;
 			} else { // > StackBig
 				// MOVW.W		R14,$-4(SP)
 				// MOVW			$(args << 24 | autosize), R1
diff --git a/src/pkg/runtime/Makefile b/src/pkg/runtime/Makefile
index 984c1f5..281dca9 100644
--- a/src/pkg/runtime/Makefile
+++ b/src/pkg/runtime/Makefile
@@ -10,7 +10,10 @@
 
 # Setup CFLAGS.  Add -D_64BIT on 64-bit platforms (sorry).
 CFLAGS_64=-D_64BIT
-CFLAGS=-I$(GOOS) -I$(GOOS)/$(GOARCH) -wF $(CFLAGS_$(SIZE))
+# TODO(kaib): fix register allocation to honor extern register so we
+# can enable optimizations again.
+CFLAGS_arm=-N
+CFLAGS=-I$(GOOS) -I$(GOOS)/$(GOARCH) -wF $(CFLAGS_$(SIZE)) $(CFLAGS_$(GOARCH))
 
 # Set O to right letter.
 O_386=8
@@ -33,6 +36,7 @@
 
 # arm-specific object files
 OFILES_arm=\
+	memset.$O\
 	vlop.$O\
 	vlrt.$O\
 
diff --git a/src/pkg/runtime/arm/asm.s b/src/pkg/runtime/arm/asm.s
index 5e68b72..39ac99e 100644
--- a/src/pkg/runtime/arm/asm.s
+++ b/src/pkg/runtime/arm/asm.s
@@ -2,51 +2,55 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-TEXT _rt0_arm(SB),7,$0
+#include "arm/asm.h"
+
+// using frame size $-4 means do not save LR on stack.
+TEXT _rt0_arm(SB),7,$-4
 	MOVW $setR12(SB), R12
 
 	// copy arguments forward on an even stack
-	MOVW	0(SP), R0		// argc
-	MOVW	4(SP), R1		// argv
-	SUB	$128, SP		// plenty of scratch
-	AND	$~7, SP
-	MOVW	R0, 120(SP)		// save argc, argv away
-	MOVW	R1, 124(SP)
+	// use R13 instead of SP to avoid linker rewriting the offsets
+	MOVW	0(R13), R0		// argc
+	MOVW	$4(R13), R1		// argv
+	SUB	$128, R13		// plenty of scratch
+	AND	$~7, R13
+	MOVW	R0, 120(R13)		// save argc, argv away
+	MOVW	R1, 124(R13)
 
 	// set up m and g registers
 	// g is R10, m is R9
-	MOVW	$g0(SB), R10
-	MOVW	$m0(SB), R9
+	MOVW	$g0(SB), g
+	MOVW	$m0(SB), m
 
 	// save m->g0 = g0
-	MOVW	R10, 0(R9)
+	MOVW	g, m_g0(m)
 
 	// create istack out of the OS stack
-	MOVW	$(-8192+104)(SP), R0
-	MOVW	R0, 0(R10)	// 0(g) is stack limit (w 104b guard)
-	MOVW	SP, 4(R10)	// 4(g) is base
+	MOVW	$(-8192+104)(R13), R0
+	MOVW	R0, g_stackguard(g)	// (w 104b guard)
+	MOVW	R13, g_stackbase(g)
 	BL	emptyfunc(SB)	// fault if stack check is wrong
 
 	BL	check(SB)
 
 	// saved argc, argv
-	MOVW	120(SP), R0
-	MOVW	R0, 0(SP)
-	MOVW	124(SP), R0
-	MOVW	R0, 4(SP)
+	MOVW	120(R13), R0
+	MOVW	R0, 4(R13)
+	MOVW	124(R13), R1
+	MOVW	R1, 8(R13)
 	BL	args(SB)
 	BL	osinit(SB)
 	BL	schedinit(SB)
 
 	// create a new goroutine to start program
 	MOVW	$mainstart(SB), R0
-	MOVW.W	R0, -4(SP)
+	MOVW.W	R0, -4(R13)
 	MOVW	$8, R0
-	MOVW.W	R0, -4(SP)
+	MOVW.W	R0, -4(R13)
 	MOVW	$0, R0
-	MOVW.W	R0, -4(SP)	// push $0 as guard
+	MOVW.W	R0, -4(R13)	// push $0 as guard
 	BL	sys·newproc(SB)
-	MOVW	$12(SP), SP	// pop args and LR
+	MOVW	$12(R13), R13	// pop args and LR
 
 	// start this M
 	BL	mstart(SB)
@@ -70,73 +74,106 @@
 // TODO(kaib): remove these once linker works properly
 // pull in dummy dependencies
 TEXT _dep_dummy(SB),7,$0
-	BL	sys·morestack(SB)
-	BL	sys·morestackx(SB)
 	BL	_div(SB)
 	BL	_divu(SB)
 	BL	_mod(SB)
 	BL	_modu(SB)
 	BL	_modu(SB)
 
-
 TEXT	breakpoint(SB),7,$0
 	BL	abort(SB)
 //	BYTE $0xcc
 //	RET
 
-// go-routine
-TEXT	gogo(SB), 7, $0
-	BL	abort(SB)
-//	MOVL	4(SP), AX	// gobuf
-//	MOVL	0(AX), SP	// restore SP
-//	MOVL	4(AX), AX
-//	MOVL	AX, 0(SP)	// put PC on the stack
-//	MOVL	$1, AX
-//	RET
+/*
+ *  go-routine
+ */
 
+// uintptr gosave(Gobuf*)
+// save state (SP, LR, g) in the Gobuf passed in R0; setjmp
 TEXT gosave(SB), 7, $0
+	MOVW	SP, gobuf_sp(R0)	// gobuf->sp = SP
+	MOVW	LR, gobuf_pc(R0)	// gobuf->pc = LR
+	MOVW	g, gobuf_g(R0)		// gobuf->g = g
+	MOVW	$0, R0			// return 0
+	RET
+
+// void gogo(Gobuf*, uintptr)
+// restore g, SP and PC from Gobuf, with the 2nd arg left in R0; longjmp
+TEXT	gogo(SB), 7, $0
+	MOVW	R0, R1			// gobuf
+	MOVW	8(SP), R0		// return 2nd arg
+	MOVW	gobuf_g(R1), g		// restore g
+	MOVW	0(g), R2		// make sure g != nil (dummy load; R2 is not used afterwards)
+	MOVW	gobuf_sp(R1), SP	// restore SP
+	MOVW	gobuf_pc(R1), PC	// jump to the saved PC
+
+// void gogocall(Gobuf*, void (*fn)(void))
+// restore state from Gobuf but then call fn.
+// (call fn, returning to state in Gobuf)
+// TODO(kaib): add R0 to gobuf so it can be restored properly
+// using frame size $-4 means do not save LR on stack.
+TEXT gogocall(SB), 7, $-4
+	MOVW	8(SP), R1		// fn
+	MOVW	gobuf_g(R0), g		// restore g
+	MOVW	0(g), R2		// make sure g != nil (dummy load; R2 is not used afterwards)
+	MOVW	gobuf_sp(R0), SP	// restore SP
+	MOVW	gobuf_pc(R0), LR	// saved PC goes into LR, not PC
+	MOVW	R1, PC			// jump to fn
+
+/*
+ * support for morestack
+ */
+
+// Called during function prolog when more stack is needed.
+// R1 frame size
+// R2 arg size
+// R3 prolog's LR (f's caller's PC)
+// using frame size $-4 means do not save LR on stack.
+TEXT sys·morestack(SB),7,$-4
+	// Cannot grow scheduler stack (m->g0).
+	MOVW	m_g0(m), R4		// R4 = m->g0
+	CMP	g, R4
+	BNE	2(PC)			// skip the abort unless g == m->g0
 	BL	abort(SB)
-//	MOVL	4(SP), AX	// gobuf
-//	MOVL	SP, 0(AX)	// save SP
-//	MOVL	0(SP), BX
-//	MOVL	BX, 4(AX)	// save PC
-//	MOVL	$0, AX	// return 0
-//	RET
 
-// support for morestack
+	// Save in m.
+	MOVW	R1, m_moreframe(m)
+	MOVW	R2, m_moreargs(m)
 
-// return point when leaving new stack.
-// save R0, jmp to lesstack to switch back
-TEXT	retfromnewstack(SB),7,$0
-	MOVW	R0,12(R9)	// m->cret
-	B	lessstack(SB)
+	// Called from f.
+	// Set m->morebuf to f's caller.
+	MOVW	R3, (m_morebuf+gobuf_pc)(m) // f's caller's PC
+	MOVW	SP, (m_morebuf+gobuf_sp)(m) // f's caller's SP
+	MOVW	g, (m_morebuf+gobuf_g)(m)
 
-// gogo, returning 2nd arg instead of 1
-TEXT gogoret(SB), 7, $0
-	MOVW	8(SP), R0	// return 2nd arg
-	MOVW	4(SP), R1	// gobuf
-	MOVW	0(R1), SP	// restore SP
-	MOVW	4(R1), PC	// restore PC
+	// Set m->morepc to f's PC.
+	MOVW	LR, m_morepc(m)
 
-TEXT setspgoto(SB), 7, $0
-	MOVW	4(SP), R0	// SP
-	MOVW	8(SP), R1	// fn to call
-	MOVW	12(SP), R2	// fn to return into
-	MOVW	R2, R14		// restore LR
-	MOVW	R0, SP
-	MOVW	R1, PC		// goto
+	// Call newstack on m's scheduling stack.
+	MOVW	m_g0(m), g		// g = m->g0
+	MOVW	(m_sched+gobuf_sp)(m), SP	// SP = m->sched.sp
+	B	newstack(SB)		// B rather than BL: LR is not modified
+
+// Return point when leaving stack.
+// using frame size $-4 means do not save LR on stack.
+TEXT sys·lessstack(SB), 7, $-4
+	// Save return value in m->cret
+	MOVW	R0, m_cret(m)
+
+	// Call oldstack on m's scheduling stack.
+	MOVW	m_g0(m), g		// g = m->g0
+	MOVW	(m_sched+gobuf_sp)(m), SP	// SP = m->sched.sp
+	B	oldstack(SB)		// B rather than BL: LR is not modified
 
 // Optimization to make inline stack splitting code smaller
 // R0 is original first argument
-// R1 is arg_num << 24 | autosize >> 3
-TEXT sys·morestackx(SB), 7, $0
-	MOVW	R0, 4(SP)	// Save arg0
-	MOVW	R1<<8, R2
-	MOVW	R2>>5, R2
-	MOVW	R2, 4(R10)	// autooffset into g
-	MOVW	R1>>24, R2
-	MOVW	R2<<3, R2
-	MOVW	R2, 8(R10)	// argsize into g
+// R2 is argsize
+// R3 is LR for f (f's caller's PC)
+// using frame size $-4 means do not save LR on stack.
+TEXT sys·morestackx(SB), 7, $-4
+	MOVW	R0, 0(FP)	// Save arg0
+	MOVW	$0, R1		// frame size = 0, passed to sys·morestack in R1
 	B	sys·morestack(SB)
 
 // bool cas(int32 *val, int32 old, int32 new)
@@ -180,17 +217,18 @@
 //	SUBL	$5, (SP)	// return to CALL again
 //	JMP	AX	// but first run the deferred function
 
-TEXT	sys·memclr(SB),7,$0
-	BL	abort(SB)
-//	MOVL	4(SP), DI		// arg 1 addr
-//	MOVL	8(SP), CX		// arg 2 count
-//	ADDL	$3, CX
-//	SHRL	$2, CX
-//	MOVL	$0, AX
-//	CLD
-//	REP
-//	STOSL
-//	RET
+TEXT	sys·memclr(SB),7,$20
+// R0 = addr; left untouched so memset receives it in R0
+	MOVW	$0, R1		// c = 0
+	MOVW	R1, -16(SP)	// presumably memset's data+4(FP) slot -- confirm
+	MOVW	4(FP), R1	// n
+	MOVW	R1, -12(SP)	// presumably memset's n+8(FP) slot -- confirm
+	MOVW	m, -8(SP)	// Save m and g
+	MOVW	g, -4(SP)
+	BL	memset(SB)
+	MOVW	-8(SP), m	// Restore m and g, memset clobbers them
+	MOVW	-4(SP), g
+	RET
 
 TEXT	sys·getcallerpc+0(SB),7,$0
 	BL	abort(SB)
diff --git a/src/pkg/runtime/arm/memset.s b/src/pkg/runtime/arm/memset.s
new file mode 100644
index 0000000..cce9453
--- /dev/null
+++ b/src/pkg/runtime/arm/memset.s
@@ -0,0 +1,94 @@
+// Inferno's libkern/memset-arm.s
+// http://code.google.com/p/inferno-os/source/browse/libkern/memset-arm.s
+//
+//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
+//         Portions Copyright 2009 The Go Authors. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+TO = 1					/* R1: current store pointer */
+TOE = 2					/* R2: end pointer (to + n) */
+N = 3					/* R3: byte count */
+TMP = 3					/* shares R3 with N: their uses don't overlap */
+
+// TODO(kaib): memset clobbers R9 and R10 (m and g). This makes the
+// registers unpredictable if (when) memset SIGSEGV's. Fix it by
+// moving the R4-R11 register bank.
+TEXT memset(SB), $0
+	MOVW	R0, R(TO)		/* first argument arrives in R0 */
+	MOVW	data+4(FP), R(4)	/* fill value */
+	MOVW	n+8(FP), R(N)		/* byte count */
+
+	ADD	R(N), R(TO), R(TOE)	/* to end pointer */
+
+	CMP	$4, R(N)		/* fewer than 4 bytes: byte loop only */
+	BLT	_1tail
+
+	AND	$0xFF, R(4)		/* it's a byte */
+	SLL	$8, R(4), R(TMP)	/* replicate to a word */
+	ORR	R(TMP), R(4)
+	SLL	$16, R(4), R(TMP)
+	ORR	R(TMP), R(4)
+
+_4align:				/* align on 4 */
+	AND.S	$3, R(TO), R(TMP)
+	BEQ	_4aligned
+
+	MOVBU.P	R(4), 1(R(TO))		/* implicit write back */
+	B	_4align
+
+_4aligned:
+	SUB	$31, R(TOE), R(TMP)	/* do 32-byte chunks if possible */
+	CMP	R(TMP), R(TO)
+	BHS	_4tail
+
+	MOVW	R4, R5			/* replicate the fill word into R5-R11 */
+	MOVW	R4, R6
+	MOVW	R4, R7
+	MOVW	R4, R8
+	MOVW	R4, R9			/* overwrites m (see TODO above) */
+	MOVW	R4, R10			/* overwrites g (see TODO above) */
+	MOVW	R4, R11
+
+_f32loop:
+	CMP	R(TMP), R(TO)
+	BHS	_4tail
+
+	MOVM.IA.W [R4-R11], (R(TO))	/* store 8 registers (32 bytes), write back */
+	B	_f32loop
+
+_4tail:
+	SUB	$3, R(TOE), R(TMP)	/* do remaining words if possible */
+_4loop:
+	CMP	R(TMP), R(TO)
+	BHS	_1tail
+
+	MOVW.P	R(4), 4(R(TO))		/* implicit write back */
+	B	_4loop
+
+_1tail:
+	CMP	R(TO), R(TOE)		/* done once to == end */
+	BEQ	_return
+
+	MOVBU.P	R(4), 1(R(TO))		/* implicit write back */
+	B	_1tail
+
+_return:
+	RET
diff --git a/src/pkg/runtime/linux/arm/sys.s b/src/pkg/runtime/linux/arm/sys.s
index 25e64a3..c61d08f 100644
--- a/src/pkg/runtime/linux/arm/sys.s
+++ b/src/pkg/runtime/linux/arm/sys.s
@@ -6,23 +6,28 @@
 // System calls and other sys.stuff for arm, Linux
 //
 
+#define SYS_BASE 0x00900000
+#define SYS_exit (SYS_BASE + 1)
+#define SYS_write (SYS_BASE + 4)
+#define SYS_mmap2 (SYS_BASE + 192)
+
 TEXT write(SB),7,$0
 	MOVW	8(SP), R1
 	MOVW	12(SP), R2
-    	SWI	$0x00900004  // syscall write
+    	SWI	$SYS_write	// syscall write (SYS_BASE + 4)
 	RET
 
 TEXT exit(SB),7,$0
-	SWI         $0x00900001 // exit value in R0
-
-TEXT sys·write(SB),7,$0
-	MOVW	8(SP), R1
-	MOVW	12(SP), R2
-    	SWI	$0x00900004  // syscall write
-	RET
+	// Exit value already in R0
+	SWI	$SYS_exit	// no RET follows; presumably the syscall never returns -- confirm
 
 TEXT sys·mmap(SB),7,$0
-	BL  abort(SB)
+	MOVW	4(FP), R1	// argument words at 4..20(FP) go to R1-R5
+	MOVW	8(FP), R2
+	MOVW	12(FP), R3
+	MOVW	16(FP), R4
+	MOVW	20(FP), R5	// first argument presumably still in R0 -- confirm
+	SWI	$SYS_mmap2	// SYS_BASE + 192
 	RET
 
 // int64 futex(int32 *uaddr, int32 op, int32 val,