all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previously reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043
diff --git a/doc/asm.html b/doc/asm.html
index d44cb79..f4ef1e6 100644
--- a/doc/asm.html
+++ b/doc/asm.html
@@ -149,7 +149,7 @@
 <p>
 Instructions, registers, and assembler directives are always in UPPER CASE to remind you
 that assembly programming is a fraught endeavor.
-(Exceptions: the <code>m</code> and <code>g</code> register renamings on ARM.)
+(Exception: the <code>g</code> register renaming on ARM.)
 </p>
 
 <p>
@@ -344,7 +344,7 @@
 <h3 id="x86">32-bit Intel 386</h3>
 
 <p>
-The runtime pointers to the <code>m</code> and <code>g</code> structures are maintained
+The runtime pointer to the <code>g</code> structure is maintained
 through the value of an otherwise unused (as far as Go is concerned) register in the MMU.
 A OS-dependent macro <code>get_tls</code> is defined for the assembler if the source includes
 an architecture-dependent header file, like this:
@@ -356,14 +356,15 @@
 
 <p>
 Within the runtime, the <code>get_tls</code> macro loads its argument register
-with a pointer to a pair of words representing the <code>g</code> and <code>m</code> pointers.
+with a pointer to the <code>g</code> pointer, and the <code>g</code> struct
+contains the <code>m</code> pointer.
 The sequence to load <code>g</code> and <code>m</code> using <code>CX</code> looks like this:
 </p>
 
 <pre>
 get_tls(CX)
-MOVL	g(CX), AX	// Move g into AX.
-MOVL	m(CX), BX	// Move m into BX.
+MOVL	g(CX), AX     // Move g into AX.
+MOVL	g_m(AX), BX   // Move g->m into BX.
 </pre>
 
 <h3 id="amd64">64-bit Intel 386 (a.k.a. amd64)</h3>
@@ -376,22 +377,21 @@
 
 <pre>
 get_tls(CX)
-MOVQ	g(CX), AX	// Move g into AX.
-MOVQ	m(CX), BX	// Move m into BX.
+MOVQ	g(CX), AX     // Move g into AX.
+MOVQ	g_m(AX), BX   // Move g->m into BX.
 </pre>
 
 <h3 id="arm">ARM</h3>
 
 <p>
-The registers <code>R9</code>, <code>R10</code>, and <code>R11</code>
+The registers <code>R10</code> and <code>R11</code>
 are reserved by the compiler and linker.
 </p>
 
 <p>
-<code>R9</code> and <code>R10</code> point to the <code>m</code> (machine) and <code>g</code>
-(goroutine) structures, respectively.
-Within assembler source code, these pointers must be referred to as <code>m</code> and <code>g</code>;
-the names <code>R9</code> and <code>R10</code> are not recognized.
+<code>R10</code> points to the <code>g</code> (goroutine) structure.
+Within assembler source code, this pointer must be referred to as <code>g</code>;
+the name <code>R10</code> is not recognized.
 </p>
 
 <p>
diff --git a/include/link.h b/include/link.h
index 2484978..2067bec 100644
--- a/include/link.h
+++ b/include/link.h
@@ -390,7 +390,7 @@
 	LSym*	sym_mod;
 	LSym*	sym_modu;
 	LSym*	symmorestack[20];
-	LSym*	gmsym;
+	LSym*	tlsg;
 	LSym*	plan9tos;
 	Prog*	curp;
 	Prog*	printp;
diff --git a/src/cmd/5a/lex.c b/src/cmd/5a/lex.c
index 571fdf7..84a17d1 100644
--- a/src/cmd/5a/lex.c
+++ b/src/cmd/5a/lex.c
@@ -199,8 +199,8 @@
 	"R6",		LREG,	6,
 	"R7",		LREG,	7,
 	"R8",		LREG,	8,
-	"m",		LREG,	9, // avoid unintentionally clobber m/g using R9/R10
-	"g",		LREG,	10,
+	"R9",		LREG,	9,
+	"g",		LREG,	10, // avoid unintentionally clobbering g via R10
 	"R11",		LREG,	11,
 	"R12",		LREG,	12,
 	"R13",		LREG,	13,
diff --git a/src/cmd/dist/buildruntime.c b/src/cmd/dist/buildruntime.c
index ba5993b..008554d 100644
--- a/src/cmd/dist/buildruntime.c
+++ b/src/cmd/dist/buildruntime.c
@@ -130,17 +130,14 @@
 	{"386", "",
 		"#define	get_tls(r)	MOVL TLS, r\n"
 		"#define	g(r)	0(r)(TLS*1)\n"
-		"#define	m(r)	4(r)(TLS*1)\n"
 	},
 	{"amd64p32", "",
 		"#define	get_tls(r)	MOVL TLS, r\n"
 		"#define	g(r)	0(r)(TLS*1)\n"
-		"#define	m(r)	4(r)(TLS*1)\n"
 	},
 	{"amd64", "",
 		"#define	get_tls(r)	MOVQ TLS, r\n"
 		"#define	g(r)	0(r)(TLS*1)\n"
-		"#define	m(r)	8(r)(TLS*1)\n"
 	},	
 
 	{"arm", "",
diff --git a/src/cmd/ld/data.c b/src/cmd/ld/data.c
index 55d0207..03b93c7 100644
--- a/src/cmd/ld/data.c
+++ b/src/cmd/ld/data.c
@@ -186,8 +186,8 @@
 		case R_TLS_LE:
 			if(linkmode == LinkExternal && iself && HEADTYPE != Hopenbsd) {
 				r->done = 0;
-				r->sym = ctxt->gmsym;
-				r->xsym = ctxt->gmsym;
+				r->sym = ctxt->tlsg;
+				r->xsym = ctxt->tlsg;
 				r->xadd = r->add;
 				o = 0;
 				if(thechar != '6')
@@ -200,8 +200,8 @@
 		case R_TLS_IE:
 			if(linkmode == LinkExternal && iself && HEADTYPE != Hopenbsd) {
 				r->done = 0;
-				r->sym = ctxt->gmsym;
-				r->xsym = ctxt->gmsym;
+				r->sym = ctxt->tlsg;
+				r->xsym = ctxt->tlsg;
 				r->xadd = r->add;
 				o = 0;
 				if(thechar != '6')
@@ -951,9 +951,9 @@
 		sect->len = datsize;
 	} else {
 		// Might be internal linking but still using cgo.
-		// In that case, the only possible STLSBSS symbol is tlsgm.
+		// In that case, the only possible STLSBSS symbol is runtime.tlsg.
 		// Give it offset 0, because it's the only thing here.
-		if(s != nil && s->type == STLSBSS && strcmp(s->name, "runtime.tlsgm") == 0) {
+		if(s != nil && s->type == STLSBSS && strcmp(s->name, "runtime.tlsg") == 0) {
 			s->value = 0;
 			s = s->next;
 		}
diff --git a/src/cmd/ld/lib.c b/src/cmd/ld/lib.c
index da6194e..ef638a6 100644
--- a/src/cmd/ld/lib.c
+++ b/src/cmd/ld/lib.c
@@ -177,7 +177,7 @@
 loadlib(void)
 {
 	int i, w, x;
-	LSym *s, *gmsym;
+	LSym *s, *tlsg;
 	char* cgostrsym;
 
 	if(flag_shared) {
@@ -244,12 +244,12 @@
 			}
 	}
 	
-	gmsym = linklookup(ctxt, "runtime.tlsgm", 0);
-	gmsym->type = STLSBSS;
-	gmsym->size = 2*PtrSize;
-	gmsym->hide = 1;
-	gmsym->reachable = 1;
-	ctxt->gmsym = gmsym;
+	tlsg = linklookup(ctxt, "runtime.tlsg", 0);
+	tlsg->type = STLSBSS;
+	tlsg->size = PtrSize;
+	tlsg->hide = 1;
+	tlsg->reachable = 1;
+	ctxt->tlsg = tlsg;
 
 	// Now that we know the link mode, trim the dynexp list.
 	x = CgoExportDynamic;
diff --git a/src/cmd/ld/symtab.c b/src/cmd/ld/symtab.c
index 6d321c0..1bc384e 100644
--- a/src/cmd/ld/symtab.c
+++ b/src/cmd/ld/symtab.c
@@ -198,13 +198,13 @@
 	genasmsym(putelfsym);
 	
 	if(linkmode == LinkExternal && HEADTYPE != Hopenbsd) {
-		s = linklookup(ctxt, "runtime.tlsgm", 0);
+		s = linklookup(ctxt, "runtime.tlsg", 0);
 		if(s->sect == nil) {
 			ctxt->cursym = nil;
 			diag("missing section for %s", s->name);
 			errorexit();
 		}
-		putelfsyment(putelfstr(s->name), 0, 2*PtrSize, (STB_LOCAL<<4)|STT_TLS, s->sect->elfsect->shnum, 0);
+		putelfsyment(putelfstr(s->name), 0, s->size, (STB_LOCAL<<4)|STT_TLS, s->sect->elfsect->shnum, 0);
 		s->elfsym = numelfsym++;
 	}
 
diff --git a/src/liblink/asm5.c b/src/liblink/asm5.c
index 465b645..2d08480 100644
--- a/src/liblink/asm5.c
+++ b/src/liblink/asm5.c
@@ -572,8 +572,8 @@
 	 * code references to be relocated too, and then
 	 * perhaps we'd be able to parallelize the span loop above.
 	 */
-	if(ctxt->gmsym == nil)
-		ctxt->gmsym = linklookup(ctxt, "runtime.tlsgm", 0);
+	if(ctxt->tlsg == nil)
+		ctxt->tlsg = linklookup(ctxt, "runtime.tlsg", 0);
 
 	p = cursym->text;
 	ctxt->autosize = p->to.offset + 4;
@@ -1377,11 +1377,11 @@
 			rel->sym = p->to.sym;
 			rel->add = p->to.offset;
 			
-			// runtime.tlsgm (aka gmsym) is special.
+			// runtime.tlsg is special.
 			// Its "address" is the offset from the TLS thread pointer
 			// to the thread-local g and m pointers.
 			// Emit a TLS relocation instead of a standard one.
-			if(rel->sym == ctxt->gmsym) {
+			if(rel->sym == ctxt->tlsg) {
 				rel->type = R_TLS;
 				if(ctxt->flag_shared)
 					rel->add += ctxt->pc - p->pcrel->pc - 8 - rel->siz;
diff --git a/src/liblink/obj5.c b/src/liblink/obj5.c
index ccd4c81..d473cb7 100644
--- a/src/liblink/obj5.c
+++ b/src/liblink/obj5.c
@@ -173,15 +173,15 @@
 	if(ctxt->flag_shared) {
 		// Shared libraries use R_ARM_TLS_IE32 instead of 
 		// R_ARM_TLS_LE32, replacing the link time constant TLS offset in
-		// runtime.tlsgm with an address to a GOT entry containing the 
-		// offset. Rewrite $runtime.tlsgm(SB) to runtime.tlsgm(SB) to
+		// runtime.tlsg with an address to a GOT entry containing the 
+		// offset. Rewrite $runtime.tlsg(SB) to runtime.tlsg(SB) to
 		// compensate.
-		if(ctxt->gmsym == nil)
-			ctxt->gmsym = linklookup(ctxt, "runtime.tlsgm", 0);
+		if(ctxt->tlsg == nil)
+			ctxt->tlsg = linklookup(ctxt, "runtime.tlsg", 0);
 
-		if(p->from.type == D_CONST && p->from.name == D_EXTERN && p->from.sym == ctxt->gmsym)
+		if(p->from.type == D_CONST && p->from.name == D_EXTERN && p->from.sym == ctxt->tlsg)
 			p->from.type = D_OREG;
-		if(p->to.type == D_CONST && p->to.name == D_EXTERN && p->to.sym == ctxt->gmsym)
+		if(p->to.type == D_CONST && p->to.name == D_EXTERN && p->to.sym == ctxt->tlsg)
 			p->to.type = D_OREG;
 	}
 }
diff --git a/src/liblink/obj6.c b/src/liblink/obj6.c
index b1bcd0d..d83f847 100644
--- a/src/liblink/obj6.c
+++ b/src/liblink/obj6.c
@@ -394,8 +394,8 @@
 	uint32 i;
 	vlong textstksiz, textarg;
 
-	if(ctxt->gmsym == nil)
-		ctxt->gmsym = linklookup(ctxt, "runtime.tlsgm", 0);
+	if(ctxt->tlsg == nil)
+		ctxt->tlsg = linklookup(ctxt, "runtime.tlsg", 0);
 	if(ctxt->symmorestack[0] == nil) {
 		if(nelem(morename) > nelem(ctxt->symmorestack))
 			sysfatal("Link.symmorestack needs at least %d elements", nelem(morename));
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 8ba72eb..0b09358 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -40,7 +40,7 @@
 	MOVL	_cgo_init(SB), AX
 	TESTL	AX, AX
 	JZ	needtls
-	MOVL	$setmg_gcc<>(SB), BX
+	MOVL	$setg_gcc<>(SB), BX
 	MOVL	BX, 4(SP)
 	MOVL	BP, 0(SP)
 	CALL	AX
@@ -72,10 +72,11 @@
 	LEAL	runtime·g0(SB), CX
 	MOVL	CX, g(BX)
 	LEAL	runtime·m0(SB), AX
-	MOVL	AX, m(BX)
 
 	// save m->g0 = g0
 	MOVL	CX, m_g0(AX)
+	// save g0->m = m0
+	MOVL	AX, g_m(CX)
 
 	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
 
@@ -178,7 +179,8 @@
 	MOVL	AX, (g_sched+gobuf_g)(AX)
 
 	// switch to m->g0 & its stack, call fn
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVL	m_g0(BX), SI
 	CMPL	SI, AX	// if g == m->g0 call badmcall
 	JNE	3(PC)
@@ -206,7 +208,8 @@
 TEXT runtime·morestack(SB),NOSPLIT,$0-0
 	// Cannot grow scheduler stack (m->g0).
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVL	m_g0(BX), SI
 	CMPL	g(CX), SI
 	JNE	2(PC)
@@ -258,7 +261,8 @@
 // func call(fn *byte, arg *byte, argsize uint32).
 TEXT runtime·newstackcall(SB), NOSPLIT, $0-12
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 
 	// Save our caller's state as the PC and SP to
 	// restore when returning from f.
@@ -415,7 +419,8 @@
 TEXT runtime·lessstack(SB), NOSPLIT, $0-0
 	// Save return value in m->cret
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVL	AX, m_cret(BX)
 
 	// Call oldstack on m->g0's stack.
@@ -606,7 +611,8 @@
 	// We get called to create new OS threads too, and those
 	// come in on the m->g0 stack already.
 	get_tls(CX)
-	MOVL	m(CX), BP
+	MOVL	g(CX), BP
+	MOVL	g_m(BP), BP
 	MOVL	m_g0(BP), SI
 	MOVL	g(CX), DI
 	CMPL	SI, DI
@@ -647,7 +653,7 @@
 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
 TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$12-12
-	// If m is nil, Go did not create the current thread.
+	// If g is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
@@ -656,19 +662,22 @@
 #ifdef GOOS_windows
 	MOVL	$0, BP
 	CMPL	CX, $0
-	JEQ	2(PC)
+	JEQ	2(PC) // TODO
 #endif
-	MOVL	m(CX), BP
-	MOVL	BP, DX // saved copy of oldm
+	MOVL	g(CX), BP
 	CMPL	BP, $0
-	JNE	havem
+	JEQ	needm
+	MOVL	g_m(BP), BP
+	MOVL	BP, DX // saved copy of oldm
+	JMP	havem
 needm:
-	MOVL	DX, 0(SP)
+	MOVL	$0, 0(SP)
 	MOVL	$runtime·needm(SB), AX
 	CALL	AX
 	MOVL	0(SP), DX
 	get_tls(CX)
-	MOVL	m(CX), BP
+	MOVL	g(CX), BP
+	MOVL	g_m(BP), BP
 
 havem:
 	// Now there's a valid m, and we're running on its m->g0.
@@ -718,7 +727,8 @@
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
 	// so we do not have to restore it.)
-	MOVL	m(CX), BP
+	MOVL	g(CX), BP
+	MOVL	g_m(BP), BP
 	MOVL	m_g0(BP), SI
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), SP
@@ -735,33 +745,28 @@
 	// Done!
 	RET
 
-// void setmg(M*, G*); set m and g. for use by needm.
-TEXT runtime·setmg(SB), NOSPLIT, $0-8
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOVL	gg+0(FP), BX
 #ifdef GOOS_windows
-	MOVL	mm+0(FP), AX
-	CMPL	AX, $0
+	CMPL	BX, $0
 	JNE	settls
 	MOVL	$0, 0x14(FS)
 	RET
 settls:
+	MOVL	g_m(BX), AX
 	LEAL	m_tls(AX), AX
 	MOVL	AX, 0x14(FS)
 #endif
-	MOVL	mm+0(FP), AX
 	get_tls(CX)
-	MOVL	mm+0(FP), AX
-	MOVL	AX, m(CX)
-	MOVL	gg+4(FP), BX
 	MOVL	BX, g(CX)
 	RET
 
-// void setmg_gcc(M*, G*); set m and g. for use by gcc
-TEXT setmg_gcc<>(SB), NOSPLIT, $0
+// void setg_gcc(G*); set g. for use by gcc
+TEXT setg_gcc<>(SB), NOSPLIT, $0
 	get_tls(AX)
-	MOVL	mm+0(FP), DX
-	MOVL	DX, m(AX)
-	MOVL	gg+4(FP), DX
-	MOVL	DX,g (AX)
+	MOVL	gg+0(FP), DX
+	MOVL	DX, g(AX)
 	RET
 
 // check that SP is in range [g->stackbase, g->stackguard)
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index b712e34..4057f77 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -40,7 +40,7 @@
 	JZ	needtls
 	// g0 already in DI
 	MOVQ	DI, CX	// Win64 uses CX for first parameter
-	MOVQ	$setmg_gcc<>(SB), SI
+	MOVQ	$setg_gcc<>(SB), SI
 	CALL	AX
 	// update stackguard after _cgo_init
 	MOVQ	$runtime·g0(SB), CX
@@ -73,10 +73,11 @@
 	LEAQ	runtime·g0(SB), CX
 	MOVQ	CX, g(BX)
 	LEAQ	runtime·m0(SB), AX
-	MOVQ	AX, m(BX)
 
 	// save m->g0 = g0
 	MOVQ	CX, m_g0(AX)
+	// save m0 to g0->m
+	MOVQ	AX, g_m(CX)
 
 	CLD				// convention is D is always left cleared
 	CALL	runtime·check(SB)
@@ -168,7 +169,8 @@
 	MOVQ	AX, (g_sched+gobuf_g)(AX)
 
 	// switch to m->g0 & its stack, call fn
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	m_g0(BX), SI
 	CMPQ	SI, AX	// if g == m->g0 call badmcall
 	JNE	3(PC)
@@ -236,7 +238,8 @@
 // func call(fn *byte, arg *byte, argsize uint32).
 TEXT runtime·newstackcall(SB), NOSPLIT, $0-20
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 
 	// Save our caller's state as the PC and SP to
 	// restore when returning from f.
@@ -392,7 +395,8 @@
 TEXT runtime·lessstack(SB), NOSPLIT, $0-0
 	// Save return value in m->cret
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	AX, m_cret(BX)
 
 	// Call oldstack on m->g0's stack.
@@ -406,7 +410,8 @@
 // morestack trampolines
 TEXT runtime·morestack00(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	$0, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVQ	$runtime·morestack(SB), AX
@@ -414,7 +419,8 @@
 
 TEXT runtime·morestack01(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	SHLQ	$32, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVQ	$runtime·morestack(SB), AX
@@ -422,7 +428,8 @@
 
 TEXT runtime·morestack10(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVLQZX	AX, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVQ	$runtime·morestack(SB), AX
@@ -430,7 +437,8 @@
 
 TEXT runtime·morestack11(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVQ	$runtime·morestack(SB), AX
 	JMP	AX
@@ -469,7 +477,8 @@
 
 TEXT morestack<>(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	SHLQ	$35, R8
 	MOVQ	R8, m_moreframesize(BX)
 	MOVQ	$runtime·morestack(SB), AX
@@ -678,7 +687,8 @@
 	// We get called to create new OS threads too, and those
 	// come in on the m->g0 stack already.
 	get_tls(CX)
-	MOVQ	m(CX), BP
+	MOVQ	g(CX), BP
+	MOVQ	g_m(BP), BP
 	MOVQ	m_g0(BP), SI
 	MOVQ	g(CX), DI
 	CMPQ	SI, DI
@@ -728,8 +738,8 @@
 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
 TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$8-24
-	// If m is nil, Go did not create the current thread.
-	// Call needm to obtain one for temporary use.
+	// If g is nil, Go did not create the current thread.
+	// Call needm to obtain one m for temporary use.
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call through AX.
@@ -739,17 +749,20 @@
 	CMPQ	CX, $0
 	JEQ	2(PC)
 #endif
-	MOVQ	m(CX), BP
-	MOVQ	BP, R8 // holds oldm until end of function
+	MOVQ	g(CX), BP
 	CMPQ	BP, $0
-	JNE	havem
+	JEQ	needm
+	MOVQ	g_m(BP), BP
+	MOVQ	BP, R8 // holds oldm until end of function
+	JMP	havem
 needm:
-	MOVQ	R8, 0(SP)
+	MOVQ	$0, 0(SP)
 	MOVQ	$runtime·needm(SB), AX
 	CALL	AX
 	MOVQ	0(SP), R8
 	get_tls(CX)
-	MOVQ	m(CX), BP
+	MOVQ	g(CX), BP
+	MOVQ	g_m(BP), BP
 
 havem:
 	// Now there's a valid m, and we're running on its m->g0.
@@ -798,7 +811,8 @@
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
 	// so we do not have to restore it.)
-	MOVQ	m(CX), BP
+	MOVQ	g(CX), BP
+	MOVQ	g_m(BP), BP
 	MOVQ	m_g0(BP), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), SP
@@ -815,30 +829,27 @@
 	// Done!
 	RET
 
-// void setmg(M*, G*); set m and g. for use by needm.
-TEXT runtime·setmg(SB), NOSPLIT, $0-16
-	MOVQ	mm+0(FP), AX
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-16
+	MOVQ	gg+0(FP), BX
 #ifdef GOOS_windows
-	CMPQ	AX, $0
+	CMPQ	BX, $0
 	JNE	settls
 	MOVQ	$0, 0x28(GS)
 	RET
 settls:
+	MOVQ	g_m(BX), AX
 	LEAQ	m_tls(AX), AX
 	MOVQ	AX, 0x28(GS)
 #endif
 	get_tls(CX)
-	MOVQ	mm+0(FP), AX
-	MOVQ	AX, m(CX)
-	MOVQ	gg+8(FP), BX
 	MOVQ	BX, g(CX)
 	RET
 
-// void setmg_gcc(M*, G*); set m and g called from gcc.
-TEXT setmg_gcc<>(SB),NOSPLIT,$0
+// void setg_gcc(G*); set g called from gcc.
+TEXT setg_gcc<>(SB),NOSPLIT,$0
 	get_tls(AX)
-	MOVQ	DI, m(AX)
-	MOVQ	SI, g(AX)
+	MOVQ	DI, g(AX)
 	RET
 
 // check that SP is in range [g->stackbase, g->stackguard)
diff --git a/src/pkg/runtime/asm_amd64p32.s b/src/pkg/runtime/asm_amd64p32.s
index b70ab69..a1cc631 100644
--- a/src/pkg/runtime/asm_amd64p32.s
+++ b/src/pkg/runtime/asm_amd64p32.s
@@ -53,10 +53,11 @@
 	LEAL	runtime·g0(SB), CX
 	MOVL	CX, g(BX)
 	LEAL	runtime·m0(SB), AX
-	MOVL	AX, m(BX)
 
 	// save m->g0 = g0
 	MOVL	CX, m_g0(AX)
+	// save m0 to g0->m
+	MOVL	AX, g_m(CX)
 
 	CLD				// convention is D is always left cleared
 	CALL	runtime·check(SB)
@@ -147,7 +148,8 @@
 	MOVL	AX, (g_sched+gobuf_g)(AX)
 
 	// switch to m->g0 & its stack, call fn
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVL	m_g0(BX), SI
 	CMPL	SI, AX	// if g == m->g0 call badmcall
 	JNE	3(PC)
@@ -215,7 +217,8 @@
 // func call(fn *byte, arg *byte, argsize uint32).
 TEXT runtime·newstackcall(SB), NOSPLIT, $0-20
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 
 	// Save our caller's state as the PC and SP to
 	// restore when returning from f.
@@ -358,7 +361,8 @@
 TEXT runtime·lessstack(SB), NOSPLIT, $0-0
 	// Save return value in m->cret
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVQ	AX, m_cret(BX)	// MOVQ, to save all 64 bits
 
 	// Call oldstack on m->g0's stack.
@@ -372,7 +376,8 @@
 // morestack trampolines
 TEXT runtime·morestack00(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVQ	$0, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVL	$runtime·morestack(SB), AX
@@ -380,7 +385,8 @@
 
 TEXT runtime·morestack01(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	SHLQ	$32, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVL	$runtime·morestack(SB), AX
@@ -388,7 +394,8 @@
 
 TEXT runtime·morestack10(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVLQZX	AX, AX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVL	$runtime·morestack(SB), AX
@@ -396,7 +403,8 @@
 
 TEXT runtime·morestack11(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	MOVQ	AX, m_moreframesize(BX)
 	MOVL	$runtime·morestack(SB), AX
 	JMP	AX
@@ -435,7 +443,8 @@
 
 TEXT morestack<>(SB),NOSPLIT,$0
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), BX
+	MOVL	g_m(BX), BX
 	SHLQ	$35, R8
 	MOVQ	R8, m_moreframesize(BX)
 	MOVL	$runtime·morestack(SB), AX
@@ -625,9 +634,9 @@
 	MOVL	0, AX
 	RET
 
-// void setmg(M*, G*); set m and g. for use by needm.
+// void setg(G*); set g. for use by needm.
 // Not implemented.
-TEXT runtime·setmg(SB), NOSPLIT, $0-8
+TEXT runtime·setg(SB), NOSPLIT, $0-8
 	MOVL	0, AX
 	RET
 
diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s
index bc0dd3c..7564e96 100644
--- a/src/pkg/runtime/asm_arm.s
+++ b/src/pkg/runtime/asm_arm.s
@@ -19,13 +19,15 @@
 	MOVW	R0, 60(R13)		// save argc, argv away
 	MOVW	R1, 64(R13)
 
-	// set up m and g registers
-	// g is R10, m is R9
+	// set up g register
+	// g is R10
 	MOVW	$runtime·g0(SB), g
-	MOVW	$runtime·m0(SB), m
+	MOVW	$runtime·m0(SB), R8
 
 	// save m->g0 = g0
-	MOVW	g, m_g0(m)
+	MOVW	g, m_g0(R8)
+	// save g->m = m0
+	MOVW	R8, g_m(g)
 
 	// create istack out of the OS stack
 	MOVW	$(-8192+104)(R13), R0
@@ -38,9 +40,9 @@
 	MOVW	_cgo_init(SB), R4
 	CMP	$0, R4
 	B.EQ	nocgo
-	BL		runtime·save_gm(SB);
+	BL		runtime·save_g(SB);
 	MOVW	g, R0 // first argument of _cgo_init is g
-	MOVW	$setmg_gcc<>(SB), R1 // second argument is address of save_gm
+	MOVW	$setg_gcc<>(SB), R1 // second argument is address of setg_gcc
 	BL		(R4) // will clobber R0-R3
 
 nocgo:
@@ -124,7 +126,7 @@
 	MOVW	0(g), R2		// make sure g != nil
 	MOVB	runtime·iscgo(SB), R2
 	CMP 	$0, R2 // if in Cgo, we have to save g and m
-	BL.NE	runtime·save_gm(SB) // this call will clobber R0
+	BL.NE	runtime·save_g(SB) // this call will clobber R0
 	MOVW	gobuf_sp(R1), SP	// restore SP
 	MOVW	gobuf_lr(R1), LR
 	MOVW	gobuf_ret(R1), R0
@@ -142,8 +144,6 @@
 // Fn must never return.  It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB), NOSPLIT, $-4-4
-	MOVW	fn+0(FP), R0
-
 	// Save caller state in g->sched.
 	MOVW	SP, (g_sched+gobuf_sp)(g)
 	MOVW	LR, (g_sched+gobuf_pc)(g)
@@ -153,10 +153,15 @@
 
 	// Switch to m->g0 & its stack, call fn.
 	MOVW	g, R1
-	MOVW	m_g0(m), g
+	MOVW	g_m(g), R8
+	MOVW	m_g0(R8), g
 	CMP	g, R1
 	B.NE	2(PC)
 	B	runtime·badmcall(SB)
+	MOVB	runtime·iscgo(SB), R11
+	CMP	$0, R11
+	BL.NE	runtime·save_g(SB)
+	MOVW	fn+0(FP), R0
 	MOVW	(g_sched+gobuf_sp)(g), SP
 	SUB	$8, SP
 	MOVW	R1, 4(SP)
@@ -182,12 +187,13 @@
 // record an argument size. For that purpose, it has no arguments.
 TEXT runtime·morestack(SB),NOSPLIT,$-4-0
 	// Cannot grow scheduler stack (m->g0).
-	MOVW	m_g0(m), R4
+	MOVW	g_m(g), R8
+	MOVW	m_g0(R8), R4
 	CMP	g, R4
 	BL.EQ	runtime·abort(SB)
 
-	MOVW	R1, m_moreframesize(m)
-	MOVW	R2, m_moreargsize(m)
+	MOVW	R1, m_moreframesize(R8)
+	MOVW	R2, m_moreargsize(R8)
 
 	// Called from f.
 	// Set g->sched to context in f.
@@ -198,14 +204,14 @@
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
-	MOVW	R3, (m_morebuf+gobuf_pc)(m)	// f's caller's PC
-	MOVW	SP, (m_morebuf+gobuf_sp)(m)	// f's caller's SP
+	MOVW	R3, (m_morebuf+gobuf_pc)(R8)	// f's caller's PC
+	MOVW	SP, (m_morebuf+gobuf_sp)(R8)	// f's caller's SP
 	MOVW	$4(SP), R3			// f's argument pointer
-	MOVW	R3, m_moreargp(m)	
-	MOVW	g, (m_morebuf+gobuf_g)(m)
+	MOVW	R3, m_moreargp(R8)	
+	MOVW	g, (m_morebuf+gobuf_g)(R8)
 
 	// Call newstack on m->g0's stack.
-	MOVW	m_g0(m), g
+	MOVW	m_g0(R8), g
 	MOVW	(g_sched+gobuf_sp)(g), SP
 	BL	runtime·newstack(SB)
 
@@ -225,9 +231,10 @@
 TEXT runtime·newstackcall(SB), NOSPLIT, $-4-12
 	// Save our caller's state as the PC and SP to
 	// restore when returning from f.
-	MOVW	LR, (m_morebuf+gobuf_pc)(m)	// our caller's PC
-	MOVW	SP, (m_morebuf+gobuf_sp)(m)	// our caller's SP
-	MOVW	g,  (m_morebuf+gobuf_g)(m)
+	MOVW	g_m(g), R8
+	MOVW	LR, (m_morebuf+gobuf_pc)(R8)	// our caller's PC
+	MOVW	SP, (m_morebuf+gobuf_sp)(R8)	// our caller's SP
+	MOVW	g,  (m_morebuf+gobuf_g)(R8)
 
 	// Save our own state as the PC and SP to restore
 	// if this goroutine needs to be restarted.
@@ -246,14 +253,14 @@
 	MOVW	8(SP), R1			// arg frame
 	MOVW	12(SP), R2			// arg size
 
-	MOVW	R0, m_cret(m)			// f's PC
-	MOVW	R1, m_moreargp(m)		// f's argument pointer
-	MOVW	R2, m_moreargsize(m)		// f's argument size
+	MOVW	R0, m_cret(R8)			// f's PC
+	MOVW	R1, m_moreargp(R8)		// f's argument pointer
+	MOVW	R2, m_moreargsize(R8)		// f's argument size
 	MOVW	$1, R3
-	MOVW	R3, m_moreframesize(m)		// f's frame size
+	MOVW	R3, m_moreframesize(R8)		// f's frame size
 
 	// Call newstack on m->g0's stack.
-	MOVW	m_g0(m), g
+	MOVW	m_g0(R8), g
 	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·newstack(SB)
 
@@ -382,10 +389,11 @@
 // as morestack; in that context, it has 0 arguments.
 TEXT runtime·lessstack(SB), NOSPLIT, $-4-0
 	// Save return value in m->cret
-	MOVW	R0, m_cret(m)
+	MOVW	g_m(g), R8
+	MOVW	R0, m_cret(R8)
 
 	// Call oldstack on m->g0's stack.
-	MOVW	m_g0(m), g
+	MOVW	m_g0(R8), g
 	MOVW	(g_sched+gobuf_sp)(g), SP
 	BL	runtime·oldstack(SB)
 
@@ -430,7 +438,8 @@
 	// Figure out if we need to switch to m->g0 stack.
 	// We get called to create new OS threads too, and those
 	// come in on the m->g0 stack already.
-	MOVW	m_g0(m), R3
+	MOVW	g_m(g), R8
+	MOVW	m_g0(R8), R3
 	CMP	R3, g
 	BEQ	4(PC)
 	BL	gosave<>(SB)
@@ -470,26 +479,28 @@
 	// Load m and g from thread-local storage.
 	MOVB	runtime·iscgo(SB), R0
 	CMP	$0, R0
-	BL.NE	runtime·load_gm(SB)
+	BL.NE	runtime·load_g(SB)
 
-	// If m is nil, Go did not create the current thread.
+	// If g is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
-	MOVW	m, savedm-4(SP)
-	CMP	$0, m
+	CMP	$0, g
 	B.NE	havem
+	MOVW	g, savedm-4(SP) // g is zero, so is m.
 	MOVW	$runtime·needm(SB), R0
 	BL	(R0)
 
 havem:
+	MOVW	g_m(g), R8
+	MOVW	R8, savedm-4(SP)
 	// Now there's a valid m, and we're running on its m->g0.
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
 	// NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP).
-	MOVW	m_g0(m), R3
+	MOVW	m_g0(R8), R3
 	MOVW	(g_sched+gobuf_sp)(R3), R4
 	MOVW	R4, savedsp-8(SP)
 	MOVW	R13, (g_sched+gobuf_sp)(R3)
@@ -512,7 +523,7 @@
 	MOVW	fn+4(FP), R0
 	MOVW	frame+8(FP), R1
 	MOVW	framesize+12(FP), R2
-	MOVW	m_curg(m), g
+	MOVW	m_curg(R8), g
 	MOVW	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
 	MOVW	(g_sched+gobuf_pc)(g), R5
 	MOVW	R5, -12(R4)
@@ -528,7 +539,8 @@
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
 	// so we do not have to restore it.)
-	MOVW	m_g0(m), g
+	MOVW	g_m(g), R8
+	MOVW	m_g0(R8), g
 	MOVW	(g_sched+gobuf_sp)(g), R13
 	MOVW	savedsp-8(SP), R4
 	MOVW	R4, (g_sched+gobuf_sp)(g)
@@ -544,15 +556,14 @@
 	// Done!
 	RET
 
-// void setmg(M*, G*); set m and g. for use by needm.
-TEXT runtime·setmg(SB), NOSPLIT, $0-8
-	MOVW	mm+0(FP), m
-	MOVW	gg+4(FP), g
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOVW	gg+0(FP), g
 
-	// Save m and g to thread-local storage.
+	// Save g to thread-local storage.
 	MOVB	runtime·iscgo(SB), R0
 	CMP	$0, R0
-	BL.NE	runtime·save_gm(SB)
+	BL.NE	runtime·save_g(SB)
 
 	RET
 
@@ -685,40 +696,38 @@
 // Note: all three functions will clobber R0, and the last
 // two can be called from 5c ABI code.
 
-// save_gm saves the g and m registers into pthread-provided
+// save_g saves the g register into pthread-provided
 // thread-local memory, so that we can call externally compiled
 // ARM code that will overwrite those registers.
 // NOTE: runtime.gogo assumes that R1 is preserved by this function.
-TEXT runtime·save_gm(SB),NOSPLIT,$0
+//       runtime.mcall assumes this function only clobbers R0 and R11.
+TEXT runtime·save_g(SB),NOSPLIT,$0
 	MRC		15, 0, R0, C13, C0, 3 // fetch TLS base pointer
-	// $runtime.tlsgm(SB) is a special linker symbol.
+	// $runtime.tlsg(SB) is a special linker symbol.
 	// It is the offset from the TLS base pointer to our
-	// thread-local storage for g and m.
-	MOVW	$runtime·tlsgm(SB), R11
+	// thread-local storage for g.
+	MOVW	$runtime·tlsg(SB), R11
 	ADD	R11, R0
 	MOVW	g, 0(R0)
-	MOVW	m, 4(R0)
 	RET
 
-// load_gm loads the g and m registers from pthread-provided
+// load_g loads the g register from pthread-provided
 // thread-local memory, for use after calling externally compiled
 // ARM code that overwrote those registers.
-TEXT runtime·load_gm(SB),NOSPLIT,$0
+TEXT runtime·load_g(SB),NOSPLIT,$0
 	MRC		15, 0, R0, C13, C0, 3 // fetch TLS base pointer
-	// $runtime.tlsgm(SB) is a special linker symbol.
+	// $runtime.tlsg(SB) is a special linker symbol.
 	// It is the offset from the TLS base pointer to our
-	// thread-local storage for g and m.
-	MOVW	$runtime·tlsgm(SB), R11
+	// thread-local storage for g.
+	MOVW	$runtime·tlsg(SB), R11
 	ADD	R11, R0
 	MOVW	0(R0), g
-	MOVW	4(R0), m
 	RET
 
-// void setmg_gcc(M*, G*); set m and g called from gcc.
-TEXT setmg_gcc<>(SB),NOSPLIT,$0
-	MOVW	R0, m
-	MOVW	R1, g
-	B		runtime·save_gm(SB)
+// void setg_gcc(G*); set g called from gcc.
+TEXT setg_gcc<>(SB),NOSPLIT,$0
+	MOVW	R0, g
+	B		runtime·save_g(SB)
 
 // TODO: share code with memeq?
 TEXT bytes·Equal(SB),NOSPLIT,$0
diff --git a/src/pkg/runtime/cgo/asm_arm.s b/src/pkg/runtime/cgo/asm_arm.s
index 850b1c6..b989ab9 100644
--- a/src/pkg/runtime/cgo/asm_arm.s
+++ b/src/pkg/runtime/cgo/asm_arm.s
@@ -14,11 +14,11 @@
 	 *  push 2 args for fn (R1 and R2).
 	 * Also note that at procedure entry in 5c/5g world, 4(R13) will be the
 	 *  first arg, so we must push another dummy reg (R0) for 0(R13).
-	 *  Additionally, runtime·load_gm will clobber R0, so we need to save R0
+	 *  Additionally, runtime·load_g will clobber R0, so we need to save R0
 	 *  nevertheless.
 	 */
-	MOVM.WP	[R0, R1, R2, R4, R5, R6, R7, R8, m, g, R11, R12, R14], (R13)
-	BL	runtime·load_gm(SB)
+	MOVM.WP	[R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+	BL	runtime·load_g(SB)
 	MOVW	PC, R14
 	MOVW	0(R13), PC
-	MOVM.IAW	(R13), [R0, R1, R2, R4, R5, R6, R7, R8, m, g, R11, R12, PC]
+	MOVM.IAW	(R13), [R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, PC]
diff --git a/src/pkg/runtime/cgo/callbacks.c b/src/pkg/runtime/cgo/callbacks.c
index e91c8bf..5a4889c 100644
--- a/src/pkg/runtime/cgo/callbacks.c
+++ b/src/pkg/runtime/cgo/callbacks.c
@@ -40,9 +40,9 @@
 
 	ret = runtime·mal(len);
 	c = runtime·mal(sizeof(*c));
-	c->next = m->cgomal;
+	c->next = g->m->cgomal;
 	c->alloc = ret;
-	m->cgomal = c;
+	g->m->cgomal = c;
 	FLUSH(&ret);
 }
 
diff --git a/src/pkg/runtime/cgo/gcc_arm.S b/src/pkg/runtime/cgo/gcc_arm.S
index 17e98d9..336f8ca 100644
--- a/src/pkg/runtime/cgo/gcc_arm.S
+++ b/src/pkg/runtime/cgo/gcc_arm.S
@@ -19,20 +19,19 @@
 .arch armv5t
 
 /*
- * void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void *m, void *g), void *m, void *g)
+ * void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
  * Calling into the 5c tool chain, where all registers are caller save.
  * Called from standard ARM EABI, where r4-r11 are callee-save, so they
  * must be saved explicitly.
  */
-.globl EXT(crosscall_arm2)
-EXT(crosscall_arm2):
+.globl EXT(crosscall_arm1)
+EXT(crosscall_arm1):
 	push {r4, r5, r6, r7, r8, r9, r10, r11, ip, lr}
 	mov r4, r0
 	mov r5, r1
 	mov r0, r2
-	mov r1, r3
-	blx r5 // setmg(m, g) 
+	blx r5 // setg(g) 
 	blx r4 // fn() 
 	pop {r4, r5, r6, r7, r8, r9, r10, r11, ip, pc}
 
diff --git a/src/pkg/runtime/cgo/gcc_darwin_386.c b/src/pkg/runtime/cgo/gcc_darwin_386.c
index ad9fb5a..d1ef31e 100644
--- a/src/pkg/runtime/cgo/gcc_darwin_386.c
+++ b/src/pkg/runtime/cgo/gcc_darwin_386.c
@@ -8,46 +8,44 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static pthread_key_t k1, k2;
+static pthread_key_t k1;
 
 #define magic1 (0x23581321U)
 
 static void
 inittls(void)
 {
-	uint32 x, y;
+	uint32 x;
 	pthread_key_t tofree[128], k;
 	int i, ntofree;
-	int havek1, havek2;
 
 	/*
-	 * Allocate thread-local storage slots for m, g.
+	 * Allocate thread-local storage slot for g.
 	 * The key numbers start at 0x100, and we expect to be
 	 * one of the early calls to pthread_key_create, so we
-	 * should be able to get pretty low numbers.
+	 * should be able to get a pretty low number.
 	 *
 	 * In Darwin/386 pthreads, %gs points at the thread
 	 * structure, and each key is an index into the thread-local
 	 * storage array that begins at offset 0x48 within in that structure.
 	 * It may happen that we are not quite the first function to try
 	 * to allocate thread-local storage keys, so instead of depending
-	 * on getting 0x100 and 0x101, we try for 0x108 and 0x109,
-	 * allocating keys until we get the ones we want and then freeing
-	 * the ones we didn't want.
+	 * on getting 0x100, we try for 0x108, allocating keys until
+	 * we get the one we want and then freeing the ones we didn't want.
 	 *
-	 * Thus the final offsets to use in %gs references are
-	 * 0x48+4*0x108 = 0x468 and 0x48+4*0x109 = 0x46c.
+	 * Thus the final offset to use in %gs references is
+	 * 0x48+4*0x108 = 0x468.
 	 *
-	 * The linker and runtime hard-code these constant offsets
-	 * from %gs where we expect to find m and g.
-	 * Known to ../../../cmd/8l/obj.c:/468
+	 * The linker and runtime hard-code this constant offset
+	 * from %gs where we expect to find g.
+	 * Known to ../../../liblink/sym.c:/468
 	 * and to ../sys_darwin_386.s:/468
 	 *
 	 * This is truly disgusting and a bit fragile, but taking care
 	 * of it here protects the rest of the system from damage.
 	 * The alternative would be to use a global variable that
 	 * held the offset and refer to that variable each time we
-	 * need a %gs variable (m or g).  That approach would
+	 * need a %gs variable (g).  That approach would
 	 * require an extra instruction and memory reference in
 	 * every stack growth prolog and would also require
 	 * rewriting the code that 8c generates for extern registers.
@@ -63,39 +61,32 @@
 	 * storage until we find a key that writes to the memory location
 	 * we want.  Then keep that key.
 	 */
-	havek1 = 0;
-	havek2 = 0;
 	ntofree = 0;
-	while(!havek1 || !havek2) {
+	for(;;) {
 		if(pthread_key_create(&k, nil) < 0) {
 			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
 			abort();
 		}
 		pthread_setspecific(k, (void*)magic1);
 		asm volatile("movl %%gs:0x468, %0" : "=r"(x));
-		asm volatile("movl %%gs:0x46c, %0" : "=r"(y));
-		if(x == magic1) {
-			havek1 = 1;
-			k1 = k;
-		} else if(y == magic1) {
-			havek2 = 1;
-			k2 = k;
-		} else {
-			if(ntofree >= nelem(tofree)) {
-				fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
-				fprintf(stderr, "\ttried");
-				for(i=0; i<ntofree; i++)
-					fprintf(stderr, " %#x", (unsigned)tofree[i]);
-				fprintf(stderr, "\n");
-				abort();
-			}
-			tofree[ntofree++] = k;
-		}
 		pthread_setspecific(k, 0);
+		if(x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
 	}
 
 	/*
-	 * We got the keys we wanted.  Free the others.
+	 * We got the key we wanted.  Free the others.
 	 */
 	for(i=0; i<ntofree; i++)
 		pthread_key_delete(tofree[i]);
@@ -158,7 +149,6 @@
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096;
 
 	pthread_setspecific(k1, (void*)ts.g);
-	pthread_setspecific(k2, (void*)ts.m);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_darwin_amd64.c b/src/pkg/runtime/cgo/gcc_darwin_amd64.c
index 65d3816..358a281 100644
--- a/src/pkg/runtime/cgo/gcc_darwin_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_darwin_amd64.c
@@ -8,64 +8,56 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static pthread_key_t k1, k2;
+static pthread_key_t k1;
 
 #define magic1 (0x23581321345589ULL)
 
 static void
 inittls(void)
 {
-	uint64 x, y;
+	uint64 x;
 	pthread_key_t tofree[128], k;
 	int i, ntofree;
-	int havek1, havek2;
 
 	/*
 	 * Same logic, code as darwin_386.c:/inittls, except that words
 	 * are 8 bytes long now, and the thread-local storage starts
-	 * at 0x60 on Leopard / Snow Leopard. So the offsets are
-	 * 0x60+8*0x108 = 0x8a0 and 0x60+8*0x109 = 0x8a8.
+	 * at 0x60 on Leopard / Snow Leopard. So the offset is
+	 * 0x60+8*0x108 = 0x8a0.
 	 *
-	 * The linker and runtime hard-code these constant offsets
-	 * from %gs where we expect to find m and g.
-	 * Known to ../../../cmd/6l/obj.c:/8a0
+	 * The linker and runtime hard-code this constant offset
+	 * from %gs where we expect to find g.
+	 * Known to ../../../liblink/sym.c:/8a0
 	 * and to ../sys_darwin_amd64.s:/8a0
 	 *
 	 * As disgusting as on the 386; same justification.
 	 */
-	havek1 = 0;
-	havek2 = 0;
 	ntofree = 0;
-	while(!havek1 || !havek2) {
+	for(;;) {
 		if(pthread_key_create(&k, nil) < 0) {
 			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
 			abort();
 		}
 		pthread_setspecific(k, (void*)magic1);
 		asm volatile("movq %%gs:0x8a0, %0" : "=r"(x));
-		asm volatile("movq %%gs:0x8a8, %0" : "=r"(y));
-		if(x == magic1) {
-			havek1 = 1;
-			k1 = k;
-		} else if(y == magic1) {
-			havek2 = 1;
-			k2 = k;
-		} else {
-			if(ntofree >= nelem(tofree)) {
-				fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
-				fprintf(stderr, "\ttried");
-				for(i=0; i<ntofree; i++)
-					fprintf(stderr, " %#x", (unsigned)tofree[i]);
-				fprintf(stderr, "\n");
-				abort();
-			}
-			tofree[ntofree++] = k;
-		}
 		pthread_setspecific(k, 0);
+		if(x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
 	}
 
 	/*
-	 * We got the keys we wanted.  Free the others.
+	 * We got the key we wanted.  Free the others.
 	 */
 	for(i=0; i<ntofree; i++)
 		pthread_key_delete(tofree[i]);
@@ -128,7 +120,6 @@
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096;
 
 	pthread_setspecific(k1, (void*)ts.g);
-	pthread_setspecific(k2, (void*)ts.m);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_dragonfly_386.c b/src/pkg/runtime/cgo/gcc_dragonfly_386.c
index 695c166..6af61ac 100644
--- a/src/pkg/runtime/cgo/gcc_dragonfly_386.c
+++ b/src/pkg/runtime/cgo/gcc_dragonfly_386.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c b/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
index a46c121..a29d522 100644
--- a/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_386.c b/src/pkg/runtime/cgo/gcc_freebsd_386.c
index 695c166..6af61ac 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_386.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
index a46c121..a29d522 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_arm.c b/src/pkg/runtime/cgo/gcc_freebsd_arm.c
index 6175e1d..16530f0 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_arm.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_arm.c
@@ -21,15 +21,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -67,7 +67,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void *g, void *m);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -84,6 +84,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_linux_386.c b/src/pkg/runtime/cgo/gcc_linux_386.c
index 0a46c9b..5b282c9 100644
--- a/src/pkg/runtime/cgo/gcc_linux_386.c
+++ b/src/pkg/runtime/cgo/gcc_linux_386.c
@@ -8,15 +8,15 @@
 #include "libcgo.h"
 
 static void *threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -73,7 +73,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_linux_amd64.c b/src/pkg/runtime/cgo/gcc_linux_amd64.c
index c530183b..19ca580 100644
--- a/src/pkg/runtime/cgo/gcc_linux_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_linux_amd64.c
@@ -8,15 +8,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G* g, void (*setmg)(void*, void*))
+x_cgo_init(G* g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -68,7 +68,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_linux_arm.c b/src/pkg/runtime/cgo/gcc_linux_arm.c
index 0325681..3b108fe 100644
--- a/src/pkg/runtime/cgo/gcc_linux_arm.c
+++ b/src/pkg/runtime/cgo/gcc_linux_arm.c
@@ -9,15 +9,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -55,7 +55,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void*, void*);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -72,6 +72,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_386.c b/src/pkg/runtime/cgo/gcc_netbsd_386.c
index 28690cc..a2b7ef3 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_386.c
@@ -9,15 +9,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -69,7 +69,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_amd64.c b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
index 6e0482d..ccd08b7 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
@@ -9,15 +9,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_arm.c b/src/pkg/runtime/cgo/gcc_netbsd_arm.c
index ba2ae25..5c0603d 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_arm.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_arm.c
@@ -10,15 +10,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -51,7 +51,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void *g, void *m);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -68,6 +68,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_openbsd_386.c b/src/pkg/runtime/cgo/gcc_openbsd_386.c
index e682c37..48b4bc7 100644
--- a/src/pkg/runtime/cgo/gcc_openbsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_openbsd_386.c
@@ -11,7 +11,7 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 // TCB_SIZE is sizeof(struct thread_control_block),
 // as defined in /usr/src/lib/librthread/tcb.h
@@ -83,13 +83,13 @@
 }
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 	void *handle;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -158,7 +158,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_openbsd_amd64.c b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
index 64d29a9..5f0d3bb 100644
--- a/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
@@ -11,7 +11,7 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 // TCB_SIZE is sizeof(struct thread_control_block),
 // as defined in /usr/src/lib/librthread/tcb.h
@@ -83,13 +83,13 @@
 }
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 	void *handle;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -159,7 +159,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_windows_386.c b/src/pkg/runtime/cgo/gcc_windows_386.c
index cdc8664..0935b74 100644
--- a/src/pkg/runtime/cgo/gcc_windows_386.c
+++ b/src/pkg/runtime/cgo/gcc_windows_386.c
@@ -54,8 +54,7 @@
 		"movl %0, %%fs:0x14\n"	// MOVL tls0, 0x14(FS)
 		"movl %%fs:0x14, %%eax\n"	// MOVL 0x14(FS), tmp
 		"movl %1, 0(%%eax)\n"	// MOVL g, 0(FS)
-		"movl %2, 4(%%eax)\n"	// MOVL m, 4(FS)
-		:: "r"(ts.tls), "r"(ts.g), "r"(ts.m) : "%eax"
+		:: "r"(ts.tls), "r"(ts.g) : "%eax"
 	);
 	
 	crosscall_386(ts.fn);
diff --git a/src/pkg/runtime/cgo/gcc_windows_amd64.c b/src/pkg/runtime/cgo/gcc_windows_amd64.c
index d8dd69b..4a2540a 100644
--- a/src/pkg/runtime/cgo/gcc_windows_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_windows_amd64.c
@@ -54,8 +54,7 @@
 	  "movq %0, %%gs:0x28\n"	// MOVL tls0, 0x28(GS)
 	  "movq %%gs:0x28, %%rax\n" // MOVQ 0x28(GS), tmp
 	  "movq %1, 0(%%rax)\n" // MOVQ g, 0(GS)
-	  "movq %2, 8(%%rax)\n" // MOVQ m, 8(GS)
-	  :: "r"(ts.tls), "r"(ts.g), "r"(ts.m) : "%rax"
+	  :: "r"(ts.tls), "r"(ts.g) : "%rax"
 	);
 
 	crosscall_amd64(ts.fn);
diff --git a/src/pkg/runtime/cgo/libcgo.h b/src/pkg/runtime/cgo/libcgo.h
index 65ea3f3..251fb4c 100644
--- a/src/pkg/runtime/cgo/libcgo.h
+++ b/src/pkg/runtime/cgo/libcgo.h
@@ -32,7 +32,6 @@
 typedef struct ThreadStart ThreadStart;
 struct ThreadStart
 {
-	uintptr m;
 	G *g;
 	uintptr *tls;
 	void (*fn)(void);
diff --git a/src/pkg/runtime/cgocall.c b/src/pkg/runtime/cgocall.c
index 7b2ec26..aa4cf5e 100644
--- a/src/pkg/runtime/cgocall.c
+++ b/src/pkg/runtime/cgocall.c
@@ -112,7 +112,7 @@
 	if(runtime·needextram && runtime·cas(&runtime·needextram, 1, 0))
 		runtime·newextram();
 
-	m->ncgocall++;
+	g->m->ncgocall++;
 
 	/*
 	 * Lock g to m to ensure we stay on the same stack if we do a
@@ -126,7 +126,7 @@
 	d.special = true;
 	g->defer = &d;
 	
-	m->ncgo++;
+	g->m->ncgo++;
 
 	/*
 	 * Announce we are entering a system call
@@ -153,12 +153,12 @@
 endcgo(void)
 {
 	runtime·unlockOSThread();
-	m->ncgo--;
-	if(m->ncgo == 0) {
+	g->m->ncgo--;
+	if(g->m->ncgo == 0) {
 		// We are going back to Go and are not in a recursive
 		// call.  Let the GC collect any memory allocated via
 		// _cgo_allocate that is no longer referenced.
-		m->cgomal = nil;
+		g->m->cgomal = nil;
 	}
 
 	if(raceenabled)
@@ -210,12 +210,12 @@
 // On arm, stack frame is two words and there's a saved LR between
 // SP and the stack frame and between the stack frame and the arguments.
 #ifdef GOARCH_arm
-#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+4*sizeof(void*))
+#define CBARGS (CallbackArgs*)((byte*)g->m->g0->sched.sp+4*sizeof(void*))
 #endif
 
 // On amd64, stack frame is one word, plus caller PC.
 #ifdef GOARCH_amd64
-#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+2*sizeof(void*))
+#define CBARGS (CallbackArgs*)((byte*)g->m->g0->sched.sp+2*sizeof(void*))
 #endif
 
 // Unimplemented on amd64p32
@@ -225,7 +225,7 @@
 
 // On 386, stack frame is three words, plus caller PC.
 #ifdef GOARCH_386
-#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+4*sizeof(void*))
+#define CBARGS (CallbackArgs*)((byte*)g->m->g0->sched.sp+4*sizeof(void*))
 #endif
 
 void runtime·cgocallbackg1(void);
@@ -234,7 +234,7 @@
 void
 runtime·cgocallbackg(void)
 {
-	if(g != m->curg) {
+	if(g != g->m->curg) {
 		runtime·prints("runtime: bad g in cgocallback");
 		runtime·exit(2);
 	}
@@ -250,8 +250,8 @@
 	CallbackArgs *cb;
 	Defer d;
 
-	if(m->needextram) {
-		m->needextram = 0;
+	if(g->m->needextram) {
+		g->m->needextram = 0;
 		runtime·newextram();
 	}
 
@@ -291,10 +291,10 @@
 		runtime·throw("runtime: unwindm not implemented");
 	case '8':
 	case '6':
-		m->g0->sched.sp = *(uintptr*)m->g0->sched.sp;
+		g->m->g0->sched.sp = *(uintptr*)g->m->g0->sched.sp;
 		break;
 	case '5':
-		m->g0->sched.sp = *(uintptr*)((byte*)m->g0->sched.sp + 4);
+		g->m->g0->sched.sp = *(uintptr*)((byte*)g->m->g0->sched.sp + 4);
 		break;
 	}
 }
diff --git a/src/pkg/runtime/heapdump.c b/src/pkg/runtime/heapdump.c
index 744c59f..868f239 100644
--- a/src/pkg/runtime/heapdump.c
+++ b/src/pkg/runtime/heapdump.c
@@ -800,8 +800,8 @@
 {
 	// Stop the world.
 	runtime·semacquire(&runtime·worldsema, false);
-	m->gcing = 1;
-	m->locks++;
+	g->m->gcing = 1;
+	g->m->locks++;
 	runtime·stoptheworld();
 
 	// Update stats so we can dump them.
@@ -821,10 +821,10 @@
 	dumpfd = 0;
 
 	// Start up the world again.
-	m->gcing = 0;
+	g->m->gcing = 0;
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
-	m->locks--;
+	g->m->locks--;
 }
 
 // Runs the specified gc program.  Calls the callback for every
diff --git a/src/pkg/runtime/lock_futex.c b/src/pkg/runtime/lock_futex.c
index c16ac90..7fc2d55 100644
--- a/src/pkg/runtime/lock_futex.c
+++ b/src/pkg/runtime/lock_futex.c
@@ -39,7 +39,7 @@
 {
 	uint32 i, v, wait, spin;
 
-	if(m->locks++ < 0)
+	if(g->m->locks++ < 0)
 		runtime·throw("runtime·lock: lock count");
 
 	// Speculative grab for lock.
@@ -99,9 +99,9 @@
 	if(v == MUTEX_SLEEPING)
 		runtime·futexwakeup((uint32*)&l->key, 1);
 
-	if(--m->locks < 0)
+	if(--g->m->locks < 0)
 		runtime·throw("runtime·unlock: lock count");
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 }
 
@@ -128,12 +128,12 @@
 void
 runtime·notesleep(Note *n)
 {
-	if(g != m->g0)
+	if(g != g->m->g0)
 		runtime·throw("notesleep not on g0");
 	while(runtime·atomicload((uint32*)&n->key) == 0) {
-		m->blocked = true;
+		g->m->blocked = true;
 		runtime·futexsleep((uint32*)&n->key, 0, -1);
-		m->blocked = false;
+		g->m->blocked = false;
 	}
 }
 
@@ -147,9 +147,9 @@
 
 	if(ns < 0) {
 		while(runtime·atomicload((uint32*)&n->key) == 0) {
-			m->blocked = true;
+			g->m->blocked = true;
 			runtime·futexsleep((uint32*)&n->key, 0, -1);
-			m->blocked = false;
+			g->m->blocked = false;
 		}
 		return true;
 	}
@@ -159,9 +159,9 @@
 
 	deadline = runtime·nanotime() + ns;
 	for(;;) {
-		m->blocked = true;
+		g->m->blocked = true;
 		runtime·futexsleep((uint32*)&n->key, 0, ns);
-		m->blocked = false;
+		g->m->blocked = false;
 		if(runtime·atomicload((uint32*)&n->key) != 0)
 			break;
 		now = runtime·nanotime();
@@ -177,7 +177,7 @@
 {
 	bool res;
 
-	if(g != m->g0 && !m->gcing)
+	if(g != g->m->g0 && !g->m->gcing)
 		runtime·throw("notetsleep not on g0");
 
 	res = notetsleep(n, ns, 0, 0);
@@ -191,7 +191,7 @@
 {
 	bool res;
 
-	if(g == m->g0)
+	if(g == g->m->g0)
 		runtime·throw("notetsleepg on g0");
 
 	runtime·entersyscallblock();
diff --git a/src/pkg/runtime/lock_sema.c b/src/pkg/runtime/lock_sema.c
index ff8fdfd..a4274e6 100644
--- a/src/pkg/runtime/lock_sema.c
+++ b/src/pkg/runtime/lock_sema.c
@@ -39,15 +39,15 @@
 	uintptr v;
 	uint32 i, spin;
 
-	if(m->locks++ < 0)
+	if(g->m->locks++ < 0)
 		runtime·throw("runtime·lock: lock count");
 
 	// Speculative grab for lock.
 	if(runtime·casp((void**)&l->key, nil, (void*)LOCKED))
 		return;
 
-	if(m->waitsema == 0)
-		m->waitsema = runtime·semacreate();
+	if(g->m->waitsema == 0)
+		g->m->waitsema = runtime·semacreate();
 
 	// On uniprocessor's, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
@@ -73,8 +73,8 @@
 			// for this lock, chained through m->nextwaitm.
 			// Queue this M.
 			for(;;) {
-				m->nextwaitm = (void*)(v&~LOCKED);
-				if(runtime·casp((void**)&l->key, (void*)v, (void*)((uintptr)m|LOCKED)))
+				g->m->nextwaitm = (void*)(v&~LOCKED);
+				if(runtime·casp((void**)&l->key, (void*)v, (void*)((uintptr)g->m|LOCKED)))
 					break;
 				v = (uintptr)runtime·atomicloadp((void**)&l->key);
 				if((v&LOCKED) == 0)
@@ -112,9 +112,9 @@
 		}
 	}
 
-	if(--m->locks < 0)
+	if(--g->m->locks < 0)
 		runtime·throw("runtime·unlock: lock count");
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 }
 
@@ -150,20 +150,20 @@
 void
 runtime·notesleep(Note *n)
 {
-	if(g != m->g0)
+	if(g != g->m->g0)
 		runtime·throw("notesleep not on g0");
 
-	if(m->waitsema == 0)
-		m->waitsema = runtime·semacreate();
-	if(!runtime·casp((void**)&n->key, nil, m)) {  // must be LOCKED (got wakeup)
+	if(g->m->waitsema == 0)
+		g->m->waitsema = runtime·semacreate();
+	if(!runtime·casp((void**)&n->key, nil, g->m)) {  // must be LOCKED (got wakeup)
 		if(n->key != LOCKED)
 			runtime·throw("notesleep - waitm out of sync");
 		return;
 	}
 	// Queued.  Sleep.
-	m->blocked = true;
+	g->m->blocked = true;
 	runtime·semasleep(-1);
-	m->blocked = false;
+	g->m->blocked = false;
 }
 
 #pragma textflag NOSPLIT
@@ -175,7 +175,7 @@
 	// does not count against our nosplit stack sequence.
 
 	// Register for wakeup on n->waitm.
-	if(!runtime·casp((void**)&n->key, nil, m)) {  // must be LOCKED (got wakeup already)
+	if(!runtime·casp((void**)&n->key, nil, g->m)) {  // must be LOCKED (got wakeup already)
 		if(n->key != LOCKED)
 			runtime·throw("notetsleep - waitm out of sync");
 		return true;
@@ -183,23 +183,23 @@
 
 	if(ns < 0) {
 		// Queued.  Sleep.
-		m->blocked = true;
+		g->m->blocked = true;
 		runtime·semasleep(-1);
-		m->blocked = false;
+		g->m->blocked = false;
 		return true;
 	}
 
 	deadline = runtime·nanotime() + ns;
 	for(;;) {
 		// Registered.  Sleep.
-		m->blocked = true;
+		g->m->blocked = true;
 		if(runtime·semasleep(ns) >= 0) {
-			m->blocked = false;
+			g->m->blocked = false;
 			// Acquired semaphore, semawakeup unregistered us.
 			// Done.
 			return true;
 		}
-		m->blocked = false;
+		g->m->blocked = false;
 
 		// Interrupted or timed out.  Still registered.  Semaphore not acquired.
 		ns = deadline - runtime·nanotime();
@@ -214,17 +214,17 @@
 	// try to grant us the semaphore when we don't expect it.
 	for(;;) {
 		mp = runtime·atomicloadp((void**)&n->key);
-		if(mp == m) {
+		if(mp == g->m) {
 			// No wakeup yet; unregister if possible.
 			if(runtime·casp((void**)&n->key, mp, nil))
 				return false;
 		} else if(mp == (M*)LOCKED) {
 			// Wakeup happened so semaphore is available.
 			// Grab it to avoid getting out of sync.
-			m->blocked = true;
+			g->m->blocked = true;
 			if(runtime·semasleep(-1) < 0)
 				runtime·throw("runtime: unable to acquire - semaphore out of sync");
-			m->blocked = false;
+			g->m->blocked = false;
 			return true;
 		} else
 			runtime·throw("runtime: unexpected waitm - semaphore out of sync");
@@ -236,11 +236,11 @@
 {
 	bool res;
 
-	if(g != m->g0 && !m->gcing)
+	if(g != g->m->g0 && !g->m->gcing)
 		runtime·throw("notetsleep not on g0");
 
-	if(m->waitsema == 0)
-		m->waitsema = runtime·semacreate();
+	if(g->m->waitsema == 0)
+		g->m->waitsema = runtime·semacreate();
 
 	res = notetsleep(n, ns, 0, nil);
 	return res;
@@ -253,11 +253,11 @@
 {
 	bool res;
 
-	if(g == m->g0)
+	if(g == g->m->g0)
 		runtime·throw("notetsleepg on g0");
 
-	if(m->waitsema == 0)
-		m->waitsema = runtime·semacreate();
+	if(g->m->waitsema == 0)
+		g->m->waitsema = runtime·semacreate();
 
 	runtime·entersyscallblock();
 	res = notetsleep(n, ns, 0, nil);
diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index 7b7e350..0b56d1f 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -53,17 +53,17 @@
 		// have distinct values.
 		return &runtime·zerobase;
 	}
-	if(m->mallocing)
+	if(g->m->mallocing)
 		runtime·throw("malloc/free - deadlock");
 	// Disable preemption during settype.
 	// We can not use m->mallocing for this, because settype calls mallocgc.
-	m->locks++;
-	m->mallocing = 1;
+	g->m->locks++;
+	g->m->mallocing = 1;
 
 	if(DebugTypeAtBlockEnd)
 		size += sizeof(uintptr);
 
-	c = m->mcache;
+	c = g->m->mcache;
 	if(!runtime·debug.efence && size <= MaxSmallSize) {
 		if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
 			// Tiny allocator.
@@ -112,9 +112,9 @@
 					v = (MLink*)tiny;
 					c->tiny += size1;
 					c->tinysize -= size1;
-					m->mallocing = 0;
-					m->locks--;
-					if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+					g->m->mallocing = 0;
+					g->m->locks--;
+					if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 						g->stackguard0 = StackPreempt;
 					return v;
 				}
@@ -178,7 +178,7 @@
 	if(DebugTypeAtBlockEnd)
 		*(uintptr*)((uintptr)v+size-sizeof(uintptr)) = typ;
 
-	m->mallocing = 0;
+	g->m->mallocing = 0;
 	// TODO: save type even if FlagNoScan?  Potentially expensive but might help
 	// heap profiling/tracing.
 	if(UseSpanType && !(flag & FlagNoScan) && typ != 0)
@@ -197,8 +197,8 @@
 			profilealloc(v, size);
 	}
 
-	m->locks--;
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 
 	if(!(flag & FlagNoInvokeGC) && mstats.heap_alloc >= mstats.next_gc)
@@ -239,7 +239,7 @@
 	int32 next;
 	MCache *c;
 
-	c = m->mcache;
+	c = g->m->mcache;
 	rate = runtime·MemProfileRate;
 	if(size < rate) {
 		// pick next profile time
@@ -279,9 +279,9 @@
 	// If you change this also change mgc0.c:/^sweep,
 	// which has a copy of the guts of free.
 
-	if(m->mallocing)
+	if(g->m->mallocing)
 		runtime·throw("malloc/free - deadlock");
-	m->mallocing = 1;
+	g->m->mallocing = 1;
 
 	if(!runtime·mlookup(v, nil, nil, &s)) {
 		runtime·printf("free %p: not an allocated block\n", v);
@@ -304,7 +304,7 @@
 	if(s->specials != nil)
 		runtime·freeallspecials(s, v, size);
 
-	c = m->mcache;
+	c = g->m->mcache;
 	if(sizeclass == 0) {
 		// Large object.
 		s->needzero = 1;
@@ -354,7 +354,7 @@
 			runtime·MCache_Free(c, v, sizeclass, size);
 		}
 	}
-	m->mallocing = 0;
+	g->m->mallocing = 0;
 }
 
 int32
@@ -364,11 +364,11 @@
 	byte *p;
 	MSpan *s;
 
-	m->mcache->local_nlookup++;
-	if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) {
+	g->m->mcache->local_nlookup++;
+	if (sizeof(void*) == 4 && g->m->mcache->local_nlookup >= (1<<30)) {
 		// purge cache stats to prevent overflow
 		runtime·lock(&runtime·mheap);
-		runtime·purgecachedstats(m->mcache);
+		runtime·purgecachedstats(g->m->mcache);
 		runtime·unlock(&runtime·mheap);
 	}
 
@@ -569,7 +569,7 @@
 
 	// Initialize the rest of the allocator.	
 	runtime·MHeap_Init(&runtime·mheap);
-	m->mcache = runtime·allocmcache();
+	g->m->mcache = runtime·allocmcache();
 
 	// See if it works.
 	runtime·free(runtime·malloc(TinySize));
diff --git a/src/pkg/runtime/mcache.c b/src/pkg/runtime/mcache.c
index 26e3db2..13437a5 100644
--- a/src/pkg/runtime/mcache.c
+++ b/src/pkg/runtime/mcache.c
@@ -57,7 +57,7 @@
 	MCacheList *l;
 	MSpan *s;
 
-	m->locks++;
+	g->m->locks++;
 	// Return the current cached span to the central lists.
 	s = c->alloc[sizeclass];
 	if(s->freelist != nil)
@@ -83,7 +83,7 @@
 		runtime·throw("empty span");
 	}
 	c->alloc[sizeclass] = s;
-	m->locks--;
+	g->m->locks--;
 	return s;
 }
 
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 50d9fea..51c765e 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -757,7 +757,7 @@
 	}
 
 	// Initialize sbuf
-	scanbuffers = &bufferList[m->helpgc];
+	scanbuffers = &bufferList[g->m->helpgc];
 
 	sbuf.ptr.begin = sbuf.ptr.pos = &scanbuffers->ptrtarget[0];
 	sbuf.ptr.end = sbuf.ptr.begin + nelem(scanbuffers->ptrtarget);
@@ -1389,13 +1389,13 @@
 		if(work.nwait == work.nproc)
 			return nil;
 		if(i < 10) {
-			m->gcstats.nprocyield++;
+			g->m->gcstats.nprocyield++;
 			runtime·procyield(20);
 		} else if(i < 20) {
-			m->gcstats.nosyield++;
+			g->m->gcstats.nosyield++;
 			runtime·osyield();
 		} else {
-			m->gcstats.nsleep++;
+			g->m->gcstats.nsleep++;
 			runtime·usleep(100);
 		}
 	}
@@ -1413,8 +1413,8 @@
 	b->nobj -= n;
 	b1->nobj = n;
 	runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-	m->gcstats.nhandoff++;
-	m->gcstats.nhandoffcnt += n;
+	g->m->gcstats.nhandoff++;
+	g->m->gcstats.nhandoffcnt += n;
 
 	// Put b on full list - let first half of b get stolen.
 	runtime·lfstackpush(&work.full, &b->node);
@@ -1487,7 +1487,7 @@
 					if(precise && (p < (byte*)PageSize || (uintptr)p == PoisonGC || (uintptr)p == PoisonStack)) {
 						// Looks like a junk value in a pointer slot.
 						// Liveness analysis wrong?
-						m->traceback = 2;
+						g->m->traceback = 2;
 						runtime·printf("bad pointer in frame %s at %p: %p\n", runtime·funcname(f), scanp, p);
 						runtime·throw("bad pointer in scanbitvector");
 					}
@@ -1533,7 +1533,7 @@
 					if(Debug > 2)
 						runtime·printf("frame %s @%p: slice %p/%D/%D\n", runtime·funcname(f), p, ((Slice*)p)->array, (int64)((Slice*)p)->len, (int64)((Slice*)p)->cap);
 					if(((Slice*)p)->cap < ((Slice*)p)->len) {
-						m->traceback = 2;
+						g->m->traceback = 2;
 						runtime·printf("bad slice in frame %s at %p: %p/%p/%p\n", runtime·funcname(f), p, ((byte**)p)[0], ((byte**)p)[1], ((byte**)p)[2]);
 						runtime·throw("slice capacity smaller than length");
 					}
@@ -1757,7 +1757,7 @@
 	// Caller must disable preemption.
 	// Otherwise when this function returns the span can become unswept again
 	// (if GC is triggered on another goroutine).
-	if(m->locks == 0 && m->mallocing == 0 && g != m->g0)
+	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
 		runtime·throw("MSpan_EnsureSwept: m is not locked");
 
 	sg = runtime·mheap.sweepgen;
@@ -1794,7 +1794,7 @@
 
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
-	if(m->locks == 0 && m->mallocing == 0 && g != m->g0)
+	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
 		runtime·throw("MSpan_Sweep: m is not locked");
 	sweepgen = runtime·mheap.sweepgen;
 	if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
@@ -1815,7 +1815,7 @@
 	res = false;
 	nfree = 0;
 	end = &head;
-	c = m->mcache;
+	c = g->m->mcache;
 	sweepgenset = false;
 
 	// mark any free objects in this span so we don't collect them
@@ -2002,13 +2002,13 @@
 
 	// increment locks to ensure that the goroutine is not preempted
 	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
-	m->locks++;
+	g->m->locks++;
 	sg = runtime·mheap.sweepgen;
 	for(;;) {
 		idx = runtime·xadd(&sweep.spanidx, 1) - 1;
 		if(idx >= sweep.nspan) {
 			runtime·mheap.sweepdone = true;
-			m->locks--;
+			g->m->locks--;
 			return -1;
 		}
 		s = sweep.spans[idx];
@@ -2023,7 +2023,7 @@
 		npages = s->npages;
 		if(!runtime·MSpan_Sweep(s))
 			npages = 0;
-		m->locks--;
+		g->m->locks--;
 		return npages;
 	}
 }
@@ -2107,7 +2107,7 @@
 {
 	uint32 nproc;
 
-	m->traceback = 2;
+	g->m->traceback = 2;
 	gchelperstart();
 
 	// parallel mark for over gc roots
@@ -2116,11 +2116,11 @@
 	// help other threads scan secondary blocks
 	scanblock(nil, true);
 
-	bufferList[m->helpgc].busy = 0;
+	bufferList[g->m->helpgc].busy = 0;
 	nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
 	if(runtime·xadd(&work.ndone, +1) == nproc-1)
 		runtime·notewakeup(&work.alldone);
-	m->traceback = 0;
+	g->m->traceback = 0;
 }
 
 static void
@@ -2282,7 +2282,7 @@
 	// problems, don't bother trying to run gc
 	// while holding a lock.  The next mallocgc
 	// without a lock will do the gc instead.
-	if(!mstats.enablegc || g == m->g0 || m->locks > 0 || runtime·panicking)
+	if(!mstats.enablegc || g == g->m->g0 || g->m->locks > 0 || runtime·panicking)
 		return;
 
 	if(gcpercent == GcpercentUnknown) {	// first time through
@@ -2305,7 +2305,7 @@
 	// Ok, we're doing it!  Stop everybody else
 	a.start_time = runtime·nanotime();
 	a.eagersweep = force >= 2;
-	m->gcing = 1;
+	g->m->gcing = 1;
 	runtime·stoptheworld();
 	
 	clearpools();
@@ -2326,11 +2326,11 @@
 	}
 
 	// all done
-	m->gcing = 0;
-	m->locks++;
+	g->m->gcing = 0;
+	g->m->locks++;
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
-	m->locks--;
+	g->m->locks--;
 
 	// now that gc is done, kick off finalizer thread if needed
 	if(!ConcurrentSweep) {
@@ -2360,17 +2360,17 @@
 	if(runtime·debug.allocfreetrace)
 		runtime·tracegc();
 
-	m->traceback = 2;
+	g->m->traceback = 2;
 	t0 = args->start_time;
 	work.tstart = args->start_time; 
 
 	if(CollectStats)
 		runtime·memclr((byte*)&gcstats, sizeof(gcstats));
 
-	m->locks++;	// disable gc during mallocs in parforalloc
+	g->m->locks++;	// disable gc during mallocs in parforalloc
 	if(work.markfor == nil)
 		work.markfor = runtime·parforalloc(MaxGcproc);
-	m->locks--;
+	g->m->locks--;
 
 	if(itabtype == nil) {
 		// get C pointer to the Go type "itab"
@@ -2407,7 +2407,7 @@
 	if(runtime·debug.gctrace)
 		t3 = runtime·nanotime();
 
-	bufferList[m->helpgc].busy = 0;
+	bufferList[g->m->helpgc].busy = 0;
 	if(work.nproc > 1)
 		runtime·notesleep(&work.alldone);
 
@@ -2515,7 +2515,7 @@
 		runtime·shrinkstack(runtime·allg[i]);
 
 	runtime·MProf_GC();
-	m->traceback = 0;
+	g->m->traceback = 0;
 }
 
 extern uintptr runtime·sizeof_C_MStats;
@@ -2528,17 +2528,17 @@
 	// one goroutine at a time, and there might be
 	// a pending garbage collection already calling it.
 	runtime·semacquire(&runtime·worldsema, false);
-	m->gcing = 1;
+	g->m->gcing = 1;
 	runtime·stoptheworld();
 	runtime·updatememstats(nil);
 	// Size of the trailing by_size array differs between Go and C,
 	// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
 	runtime·memcopy(runtime·sizeof_C_MStats, stats, &mstats);
-	m->gcing = 0;
-	m->locks++;
+	g->m->gcing = 0;
+	g->m->locks++;
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
-	m->locks--;
+	g->m->locks--;
 }
 
 void
@@ -2590,11 +2590,11 @@
 static void
 gchelperstart(void)
 {
-	if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
+	if(g->m->helpgc < 0 || g->m->helpgc >= MaxGcproc)
 		runtime·throw("gchelperstart: bad m->helpgc");
-	if(runtime·xchg(&bufferList[m->helpgc].busy, 1))
+	if(runtime·xchg(&bufferList[g->m->helpgc].busy, 1))
 		runtime·throw("gchelperstart: already busy");
-	if(g != m->g0)
+	if(g != g->m->g0)
 		runtime·throw("gchelper not running on g0 stack");
 }
 
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index 7e83eb2..961b32e 100644
--- a/src/pkg/runtime/mheap.c
+++ b/src/pkg/runtime/mheap.c
@@ -173,8 +173,8 @@
 	MSpan *s;
 
 	runtime·lock(h);
-	mstats.heap_alloc += m->mcache->local_cachealloc;
-	m->mcache->local_cachealloc = 0;
+	mstats.heap_alloc += g->m->mcache->local_cachealloc;
+	g->m->mcache->local_cachealloc = 0;
 	s = MHeap_AllocLocked(h, npage, sizeclass);
 	if(s != nil) {
 		mstats.heap_inuse += npage<<PageShift;
@@ -384,8 +384,8 @@
 runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct)
 {
 	runtime·lock(h);
-	mstats.heap_alloc += m->mcache->local_cachealloc;
-	m->mcache->local_cachealloc = 0;
+	mstats.heap_alloc += g->m->mcache->local_cachealloc;
+	g->m->mcache->local_cachealloc = 0;
 	mstats.heap_inuse -= s->npages<<PageShift;
 	if(acct) {
 		mstats.heap_alloc -= s->npages<<PageShift;
@@ -656,7 +656,7 @@
 
 	// Ensure that the span is swept.
 	// GC accesses specials list w/o locks. And it's just much safer.
-	m->locks++;
+	g->m->locks++;
 	runtime·MSpan_EnsureSwept(span);
 
 	offset = (uintptr)p - (span->start << PageShift);
@@ -669,7 +669,7 @@
 	while((x = *t) != nil) {
 		if(offset == x->offset && kind == x->kind) {
 			runtime·unlock(&span->specialLock);
-			m->locks--;
+			g->m->locks--;
 			return false; // already exists
 		}
 		if(offset < x->offset || (offset == x->offset && kind < x->kind))
@@ -681,7 +681,7 @@
 	s->next = x;
 	*t = s;
 	runtime·unlock(&span->specialLock);
-	m->locks--;
+	g->m->locks--;
 	return true;
 }
 
@@ -701,7 +701,7 @@
 
 	// Ensure that the span is swept.
 	// GC accesses specials list w/o locks. And it's just much safer.
-	m->locks++;
+	g->m->locks++;
 	runtime·MSpan_EnsureSwept(span);
 
 	offset = (uintptr)p - (span->start << PageShift);
@@ -714,13 +714,13 @@
 		if(offset == s->offset && kind == s->kind) {
 			*t = s->next;
 			runtime·unlock(&span->specialLock);
-			m->locks--;
+			g->m->locks--;
 			return s;
 		}
 		t = &s->next;
 	}
 	runtime·unlock(&span->specialLock);
-	m->locks--;
+	g->m->locks--;
 	return nil;
 }
 
diff --git a/src/pkg/runtime/mprof.goc b/src/pkg/runtime/mprof.goc
index 9c23a16..0aea545 100644
--- a/src/pkg/runtime/mprof.goc
+++ b/src/pkg/runtime/mprof.goc
@@ -387,7 +387,7 @@
 
 	if(all) {
 		runtime·semacquire(&runtime·worldsema, false);
-		m->gcing = 1;
+		g->m->gcing = 1;
 		runtime·stoptheworld();
 	}
 
@@ -406,7 +406,7 @@
 	}
 	
 	if(all) {
-		m->gcing = 0;
+		g->m->gcing = 0;
 		runtime·semrelease(&runtime·worldsema);
 		runtime·starttheworld();
 	}
@@ -434,7 +434,7 @@
 	n = runtime·gcount();
 	if(n <= b.len) {
 		runtime·semacquire(&runtime·worldsema, false);
-		m->gcing = 1;
+		g->m->gcing = 1;
 		runtime·stoptheworld();
 
 		n = runtime·gcount();
@@ -450,7 +450,7 @@
 			}
 		}
 	
-		m->gcing = 0;
+		g->m->gcing = 0;
 		runtime·semrelease(&runtime·worldsema);
 		runtime·starttheworld();
 	}
@@ -480,22 +480,22 @@
 	Type *type;
 
 	runtime·lock(&tracelock);
-	m->traceback = 2;
+	g->m->traceback = 2;
 	type = (Type*)(typ & ~3);
 	name = typeinfoname(typ & 3);
 	if(type == nil)
 		runtime·printf("tracealloc(%p, %p, %s)\n", p, size, name);
 	else	
 		runtime·printf("tracealloc(%p, %p, %s of %S)\n", p, size, name, *type->string);
-	if(m->curg == nil || g == m->curg) {
+	if(g->m->curg == nil || g == g->m->curg) {
 		runtime·goroutineheader(g);
 		runtime·traceback((uintptr)runtime·getcallerpc(&p), (uintptr)runtime·getcallersp(&p), 0, g);
 	} else {
-		runtime·goroutineheader(m->curg);
-		runtime·traceback(~(uintptr)0, ~(uintptr)0, 0, m->curg);
+		runtime·goroutineheader(g->m->curg);
+		runtime·traceback(~(uintptr)0, ~(uintptr)0, 0, g->m->curg);
 	}
 	runtime·printf("\n");
-	m->traceback = 0;
+	g->m->traceback = 0;
 	runtime·unlock(&tracelock);
 }
 
@@ -503,12 +503,12 @@
 runtime·tracefree(void *p, uintptr size)
 {
 	runtime·lock(&tracelock);
-	m->traceback = 2;
+	g->m->traceback = 2;
 	runtime·printf("tracefree(%p, %p)\n", p, size);
 	runtime·goroutineheader(g);
 	runtime·traceback((uintptr)runtime·getcallerpc(&p), (uintptr)runtime·getcallersp(&p), 0, g);
 	runtime·printf("\n");
-	m->traceback = 0;
+	g->m->traceback = 0;
 	runtime·unlock(&tracelock);
 }
 
@@ -516,12 +516,12 @@
 runtime·tracegc(void)
 {
 	runtime·lock(&tracelock);
-	m->traceback = 2;
+	g->m->traceback = 2;
 	runtime·printf("tracegc()\n");
 	// running on m->g0 stack; show all non-g0 goroutines
 	runtime·tracebackothers(g);
 	runtime·printf("end tracegc\n");
 	runtime·printf("\n");
-	m->traceback = 0;
+	g->m->traceback = 0;
 	runtime·unlock(&tracelock);
 }
diff --git a/src/pkg/runtime/netpoll_solaris.c b/src/pkg/runtime/netpoll_solaris.c
index a2631a8..569aee5 100644
--- a/src/pkg/runtime/netpoll_solaris.c
+++ b/src/pkg/runtime/netpoll_solaris.c
@@ -79,7 +79,7 @@
 extern uintptr libc·port_dissociate;
 extern uintptr libc·port_getn;
 
-#define errno (*m->perrno)
+#define errno (*g->m->perrno)
 
 int32
 runtime·fcntl(int32 fd, int32 cmd, uintptr arg)
diff --git a/src/pkg/runtime/netpoll_windows.c b/src/pkg/runtime/netpoll_windows.c
index f3cd15c..4528292 100644
--- a/src/pkg/runtime/netpoll_windows.c
+++ b/src/pkg/runtime/netpoll_windows.c
@@ -102,16 +102,16 @@
 		if(n < 8)
 			n = 8;
 		if(block)
-			m->blocked = true;
+			g->m->blocked = true;
 		if(runtime·stdcall(runtime·GetQueuedCompletionStatusEx, 6, iocphandle, entries, (uintptr)n, &n, (uintptr)wait, (uintptr)0) == 0) {
-			m->blocked = false;
+			g->m->blocked = false;
 			errno = runtime·getlasterror();
 			if(!block && errno == WAIT_TIMEOUT)
 				return nil;
 			runtime·printf("netpoll: GetQueuedCompletionStatusEx failed (errno=%d)\n", errno);
 			runtime·throw("netpoll: GetQueuedCompletionStatusEx failed");
 		}
-		m->blocked = false;
+		g->m->blocked = false;
 		for(i = 0; i < n; i++) {
 			op = entries[i].op;
 			errno = 0;
@@ -125,9 +125,9 @@
 		errno = 0;
 		qty = 0;
 		if(block)
-			m->blocked = true;
+			g->m->blocked = true;
 		if(runtime·stdcall(runtime·GetQueuedCompletionStatus, 5, iocphandle, &qty, &key, &op, (uintptr)wait) == 0) {
-			m->blocked = false;
+			g->m->blocked = false;
 			errno = runtime·getlasterror();
 			if(!block && errno == WAIT_TIMEOUT)
 				return nil;
@@ -137,7 +137,7 @@
 			}
 			// dequeued failed IO packet, so report that
 		}
-		m->blocked = false;
+		g->m->blocked = false;
 		handlecompletion(&gp, op, errno, qty);
 	}
 	if(block && gp == nil)
diff --git a/src/pkg/runtime/os_darwin.c b/src/pkg/runtime/os_darwin.c
index 33a2df95..c660fb8 100644
--- a/src/pkg/runtime/os_darwin.c
+++ b/src/pkg/runtime/os_darwin.c
@@ -119,6 +119,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -127,7 +128,7 @@
 runtime·minit(void)
 {
 	// Initialize signal handling.
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 
 	runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil);
 }
@@ -202,9 +203,9 @@
 	uint32 port;
 	CodeMsg *c;
 
-	if((port = m->machport) == 0){
+	if((port = g->m->machport) == 0){
 		port = runtime·mach_reply_port();
-		m->machport = port;
+		g->m->machport = port;
 	}
 
 	h->msgh_bits |= MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE);
@@ -405,14 +406,14 @@
 
 	if(ns >= 0) {
 		secs = runtime·timediv(ns, 1000000000, &nsecs);
-		r = runtime·mach_semaphore_timedwait(m->waitsema, secs, nsecs);
+		r = runtime·mach_semaphore_timedwait(g->m->waitsema, secs, nsecs);
 		if(r == KERN_ABORTED || r == KERN_OPERATION_TIMED_OUT)
 			return -1;
 		if(r != 0)
 			macherror(r, "semaphore_wait");
 		return 0;
 	}
-	while((r = runtime·mach_semaphore_wait(m->waitsema)) != 0) {
+	while((r = runtime·mach_semaphore_wait(g->m->waitsema)) != 0) {
 		if(r == KERN_ABORTED)	// interrupted
 			continue;
 		macherror(r, "semaphore_wait");
diff --git a/src/pkg/runtime/os_dragonfly.c b/src/pkg/runtime/os_dragonfly.c
index e7fd2cc..b8c967a 100644
--- a/src/pkg/runtime/os_dragonfly.c
+++ b/src/pkg/runtime/os_dragonfly.c
@@ -148,6 +148,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -156,7 +157,7 @@
 runtime·minit(void)
 {
 	// Initialize signal handling
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·sigprocmask(&sigset_none, nil);
 }
 
diff --git a/src/pkg/runtime/os_freebsd.c b/src/pkg/runtime/os_freebsd.c
index 02b1347..9299dbd 100644
--- a/src/pkg/runtime/os_freebsd.c
+++ b/src/pkg/runtime/os_freebsd.c
@@ -156,6 +156,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -164,7 +165,7 @@
 runtime·minit(void)
 {
 	// Initialize signal handling
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·sigprocmask(&sigset_none, nil);
 }
 
diff --git a/src/pkg/runtime/os_linux.c b/src/pkg/runtime/os_linux.c
index 8a94524..31cbdb0 100644
--- a/src/pkg/runtime/os_linux.c
+++ b/src/pkg/runtime/os_linux.c
@@ -196,6 +196,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -204,7 +205,7 @@
 runtime·minit(void)
 {
 	// Initialize signal handling.
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·rtsigprocmask(SIG_SETMASK, &sigset_none, nil, sizeof(Sigset));
 }
 
diff --git a/src/pkg/runtime/os_nacl.c b/src/pkg/runtime/os_nacl.c
index 3196e2c..b3e0fc6 100644
--- a/src/pkg/runtime/os_nacl.c
+++ b/src/pkg/runtime/os_nacl.c
@@ -20,6 +20,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -30,7 +31,7 @@
 	int32 ret;
 
 	// Initialize signal handling
-	ret = runtime·nacl_exception_stack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	ret = runtime·nacl_exception_stack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	if(ret < 0)
 		runtime·printf("runtime: nacl_exception_stack: error %d\n", -ret);
 
@@ -54,7 +55,7 @@
 runtime·osinit(void)
 {
 	runtime·ncpu = 1;
-	m->procid = 2;
+	g->m->procid = 2;
 //runtime·nacl_exception_handler(runtime·sigtramp, nil);
 }
 
@@ -126,7 +127,7 @@
 		runtime·printf("nacl_cond_create: error %d\n", -cond);
 		runtime·throw("semacreate");
 	}
-	m->waitsemalock = mu;
+	g->m->waitsemalock = mu;
 	return cond; // assigned to m->waitsema
 }
 
@@ -136,20 +137,20 @@
 {
 	int32 ret;
 	
-	ret = runtime·nacl_mutex_lock(m->waitsemalock);
+	ret = runtime·nacl_mutex_lock(g->m->waitsemalock);
 	if(ret < 0) {
 		//runtime·printf("nacl_mutex_lock: error %d\n", -ret);
 		runtime·throw("semasleep");
 	}
-	if(m->waitsemacount > 0) {
-		m->waitsemacount = 0;
-		runtime·nacl_mutex_unlock(m->waitsemalock);
+	if(g->m->waitsemacount > 0) {
+		g->m->waitsemacount = 0;
+		runtime·nacl_mutex_unlock(g->m->waitsemalock);
 		return 0;
 	}
 
-	while(m->waitsemacount == 0) {
+	while(g->m->waitsemacount == 0) {
 		if(ns < 0) {
-			ret = runtime·nacl_cond_wait(m->waitsema, m->waitsemalock);
+			ret = runtime·nacl_cond_wait(g->m->waitsema, g->m->waitsemalock);
 			if(ret < 0) {
 				//runtime·printf("nacl_cond_wait: error %d\n", -ret);
 				runtime·throw("semasleep");
@@ -159,9 +160,9 @@
 			
 			ns += runtime·nanotime();
 			ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec);
-			ret = runtime·nacl_cond_timed_wait_abs(m->waitsema, m->waitsemalock, &ts);
+			ret = runtime·nacl_cond_timed_wait_abs(g->m->waitsema, g->m->waitsemalock, &ts);
 			if(ret == -ETIMEDOUT) {
-				runtime·nacl_mutex_unlock(m->waitsemalock);
+				runtime·nacl_mutex_unlock(g->m->waitsemalock);
 				return -1;
 			}
 			if(ret < 0) {
@@ -171,8 +172,8 @@
 		}
 	}
 			
-	m->waitsemacount = 0;
-	runtime·nacl_mutex_unlock(m->waitsemalock);
+	g->m->waitsemacount = 0;
+	runtime·nacl_mutex_unlock(g->m->waitsemalock);
 	return 0;
 }
 
@@ -275,4 +276,4 @@
 int8 runtime·nacl_irt_thread_v0_1_str[] = "nacl-irt-thread-0.1";
 void *runtime·nacl_irt_thread_v0_1[3]; // thread_create, thread_exit, thread_nice
 int32 runtime·nacl_irt_thread_v0_1_size = sizeof(runtime·nacl_irt_thread_v0_1);
-*/
\ No newline at end of file
+*/
diff --git a/src/pkg/runtime/os_netbsd.c b/src/pkg/runtime/os_netbsd.c
index 93229bf..0889181 100644
--- a/src/pkg/runtime/os_netbsd.c
+++ b/src/pkg/runtime/os_netbsd.c
@@ -70,12 +70,12 @@
 	Timespec ts;
 
 	// spin-mutex lock
-	while(runtime·xchg(&m->waitsemalock, 1))
+	while(runtime·xchg(&g->m->waitsemalock, 1))
 		runtime·osyield();
 
 	for(;;) {
 		// lock held
-		if(m->waitsemacount == 0) {
+		if(g->m->waitsemacount == 0) {
 			// sleep until semaphore != 0 or timeout.
 			// thrsleep unlocks m->waitsemalock.
 			if(ns < 0) {
@@ -92,8 +92,8 @@
 				// the NetBSD kernel does not appear to provide
 				// a mechanism for unlocking the userspace
 				// mutex once the thread is actually parked.
-				runtime·atomicstore(&m->waitsemalock, 0);
-				runtime·lwp_park(nil, 0, &m->waitsemacount, nil);
+				runtime·atomicstore(&g->m->waitsemalock, 0);
+				runtime·lwp_park(nil, 0, &g->m->waitsemacount, nil);
 			} else {
 				ns = ns + runtime·nanotime();
 				// NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system.
@@ -101,20 +101,20 @@
 				ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec);
 				// TODO(jsing) - potential deadlock!
 				// See above for details.
-				runtime·atomicstore(&m->waitsemalock, 0);
-				runtime·lwp_park(&ts, 0, &m->waitsemacount, nil);
+				runtime·atomicstore(&g->m->waitsemalock, 0);
+				runtime·lwp_park(&ts, 0, &g->m->waitsemacount, nil);
 			}
 			// reacquire lock
-			while(runtime·xchg(&m->waitsemalock, 1))
+			while(runtime·xchg(&g->m->waitsemalock, 1))
 				runtime·osyield();
 		}
 
 		// lock held (again)
-		if(m->waitsemacount != 0) {
+		if(g->m->waitsemacount != 0) {
 			// semaphore is available.
-			m->waitsemacount--;
+			g->m->waitsemacount--;
 			// spin-mutex unlock
-			runtime·atomicstore(&m->waitsemalock, 0);
+			runtime·atomicstore(&g->m->waitsemalock, 0);
 			return 0;  // semaphore acquired
 		}
 
@@ -127,7 +127,7 @@
 
 	// lock held but giving up
 	// spin-mutex unlock
-	runtime·atomicstore(&m->waitsemalock, 0);
+	runtime·atomicstore(&g->m->waitsemalock, 0);
 	return -1;
 }
 
@@ -214,6 +214,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -221,10 +222,10 @@
 void
 runtime·minit(void)
 {
-	m->procid = runtime·lwp_self();
+	g->m->procid = runtime·lwp_self();
 
 	// Initialize signal handling
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil);
 }
 
diff --git a/src/pkg/runtime/os_openbsd.c b/src/pkg/runtime/os_openbsd.c
index 08a290a..2200915 100644
--- a/src/pkg/runtime/os_openbsd.c
+++ b/src/pkg/runtime/os_openbsd.c
@@ -67,34 +67,34 @@
 	Timespec ts;
 
 	// spin-mutex lock
-	while(runtime·xchg(&m->waitsemalock, 1))
+	while(runtime·xchg(&g->m->waitsemalock, 1))
 		runtime·osyield();
 
 	for(;;) {
 		// lock held
-		if(m->waitsemacount == 0) {
+		if(g->m->waitsemacount == 0) {
 			// sleep until semaphore != 0 or timeout.
 			// thrsleep unlocks m->waitsemalock.
 			if(ns < 0)
-				runtime·thrsleep(&m->waitsemacount, 0, nil, &m->waitsemalock, nil);
+				runtime·thrsleep(&g->m->waitsemacount, 0, nil, &g->m->waitsemalock, nil);
 			else {
 				ns += runtime·nanotime();
 				// NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system.
 				ts.tv_nsec = 0;
 				ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec);
-				runtime·thrsleep(&m->waitsemacount, CLOCK_MONOTONIC, &ts, &m->waitsemalock, nil);
+				runtime·thrsleep(&g->m->waitsemacount, CLOCK_MONOTONIC, &ts, &g->m->waitsemalock, nil);
 			}
 			// reacquire lock
-			while(runtime·xchg(&m->waitsemalock, 1))
+			while(runtime·xchg(&g->m->waitsemalock, 1))
 				runtime·osyield();
 		}
 
 		// lock held (again)
-		if(m->waitsemacount != 0) {
+		if(g->m->waitsemacount != 0) {
 			// semaphore is available.
-			m->waitsemacount--;
+			g->m->waitsemacount--;
 			// spin-mutex unlock
-			runtime·atomicstore(&m->waitsemalock, 0);
+			runtime·atomicstore(&g->m->waitsemalock, 0);
 			return 0;  // semaphore acquired
 		}
 
@@ -107,7 +107,7 @@
 
 	// lock held but giving up
 	// spin-mutex unlock
-	runtime·atomicstore(&m->waitsemalock, 0);
+	runtime·atomicstore(&g->m->waitsemalock, 0);
 	return -1;
 }
 
@@ -193,6 +193,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -201,7 +202,7 @@
 runtime·minit(void)
 {
 	// Initialize signal handling
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·sigprocmask(SIG_SETMASK, sigset_none);
 }
 
diff --git a/src/pkg/runtime/os_plan9.c b/src/pkg/runtime/os_plan9.c
index 14d4fae..61f315a 100644
--- a/src/pkg/runtime/os_plan9.c
+++ b/src/pkg/runtime/os_plan9.c
@@ -88,7 +88,7 @@
 runtime·osinit(void)
 {
 	runtime·ncpu = getproccount();
-	m->procid = getpid();
+	g->m->procid = getpid();
 	runtime·notify(runtime·sigtramp);
 }
 
@@ -285,13 +285,13 @@
 		ms = runtime·timediv(ns, 1000000, nil);
 		if(ms == 0)
 			ms = 1;
-		ret = runtime·plan9_tsemacquire(&m->waitsemacount, ms);
+		ret = runtime·plan9_tsemacquire(&g->m->waitsemacount, ms);
 		if(ret == 1)
 			return 0;  // success
 		return -1;  // timeout or interrupted
 	}
 
-	while(runtime·plan9_semacquire(&m->waitsemacount, 1) < 0) {
+	while(runtime·plan9_semacquire(&g->m->waitsemacount, 1) < 0) {
 		/* interrupted; try again (c.f. lock_sema.c) */
 	}
 	return 0;  // success
@@ -360,7 +360,7 @@
 	switch(g->sig) {
 	case SIGRFAULT:
 	case SIGWFAULT:
-		p = runtime·strstr((byte*)m->notesig, (byte*)"addr=")+5;
+		p = runtime·strstr((byte*)g->m->notesig, (byte*)"addr=")+5;
 		g->sigcode1 = atolwhex(p);
 		if(g->sigcode1 < 0x1000 || g->paniconfault) {
 			if(g->sigpc == 0)
@@ -373,7 +373,7 @@
 	case SIGTRAP:
 		if(g->paniconfault)
 			runtime·panicstring("invalid memory address or nil pointer dereference");
-		runtime·throw(m->notesig);
+		runtime·throw(g->m->notesig);
 		break;
 	case SIGINTDIV:
 		runtime·panicstring("integer divide by zero");
@@ -382,7 +382,7 @@
 		runtime·panicstring("floating point error");
 		break;
 	default:
-		runtime·panicstring(m->notesig);
+		runtime·panicstring(g->m->notesig);
 		break;
 	}
 }
diff --git a/src/pkg/runtime/os_plan9_386.c b/src/pkg/runtime/os_plan9_386.c
index 80d711f..3490862 100644
--- a/src/pkg/runtime/os_plan9_386.c
+++ b/src/pkg/runtime/os_plan9_386.c
@@ -73,7 +73,7 @@
 	if(flags & SigPanic) {
 		// Copy the error string from sigtramp's stack into m->notesig so
 		// we can reliably access it from the panic routines.
-		runtime·memmove(m->notesig, note, len+1);
+		runtime·memmove(g->m->notesig, note, len+1);
 
 		gp->sig = sig;
 		gp->sigpc = ureg->pc;
@@ -104,8 +104,8 @@
 		return NCONT;
 
 Throw:
-	m->throwing = 1;
-	m->caughtsig = gp;
+	g->m->throwing = 1;
+	g->m->caughtsig = gp;
 	runtime·startpanic();
 
 	runtime·printf("%s\n", note);
@@ -146,5 +146,5 @@
 {
 	// TODO: Enable profiling interrupts.
 	
-	m->profilehz = hz;
+	g->m->profilehz = hz;
 }
diff --git a/src/pkg/runtime/os_plan9_amd64.c b/src/pkg/runtime/os_plan9_amd64.c
index a4e5ba8..6b0f8ae 100644
--- a/src/pkg/runtime/os_plan9_amd64.c
+++ b/src/pkg/runtime/os_plan9_amd64.c
@@ -81,7 +81,7 @@
 	if(flags & SigPanic) {
 		// Copy the error string from sigtramp's stack into m->notesig so
 		// we can reliably access it from the panic routines.
-		runtime·memmove(m->notesig, note, len+1);
+		runtime·memmove(g->m->notesig, note, len+1);
 
 		gp->sig = sig;
 		gp->sigpc = ureg->ip;
@@ -112,8 +112,8 @@
 		return NCONT;
 
 Throw:
-	m->throwing = 1;
-	m->caughtsig = gp;
+	g->m->throwing = 1;
+	g->m->caughtsig = gp;
 	runtime·startpanic();
 
 	runtime·printf("%s\n", note);
@@ -154,5 +154,5 @@
 {
 	// TODO: Enable profiling interrupts.
 	
-	m->profilehz = hz;
+	g->m->profilehz = hz;
 }
diff --git a/src/pkg/runtime/os_solaris.c b/src/pkg/runtime/os_solaris.c
index c6bbea3..4ef17f9 100644
--- a/src/pkg/runtime/os_solaris.c
+++ b/src/pkg/runtime/os_solaris.c
@@ -102,14 +102,14 @@
 uintptr
 runtime·sysvicall6(uintptr fn, int32 count, ...)
 {
-	runtime·memclr((byte*)&m->scratch, sizeof(m->scratch));
-	m->libcall.fn = (void*)fn;
-	m->libcall.n = (uintptr)count;
+	runtime·memclr((byte*)&g->m->scratch, sizeof(g->m->scratch));
+	g->m->libcall.fn = (void*)fn;
+	g->m->libcall.n = (uintptr)count;
 	for(;count; count--)
-		m->scratch.v[count - 1] = *((uintptr*)&count + count);
-	m->libcall.args = (uintptr*)&m->scratch.v[0];
-	runtime·asmcgocall(runtime·asmsysvicall6, &m->libcall);
-	return m->libcall.r1;
+		g->m->scratch.v[count - 1] = *((uintptr*)&count + count);
+	g->m->libcall.args = (uintptr*)&g->m->scratch.v[0];
+	runtime·asmcgocall(runtime·asmsysvicall6, &g->m->libcall);
+	return g->m->libcall.r1;
 }
 
 static int32
@@ -187,6 +187,7 @@
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	mp->gsignal->m = mp;
 }
 
 // Called to initialize a new m (including the bootstrap m).
@@ -196,7 +197,7 @@
 {
 	runtime·asmcgocall(runtime·miniterrno, (void *)libc·___errno);
 	// Initialize signal handling
-	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
+	runtime·signalstack((byte*)g->m->gsignal->stackguard - StackGuard, 32*1024);
 	runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil);
 }
 
@@ -337,13 +338,13 @@
 	// Call libc's malloc rather than runtime·malloc.  This will
 	// allocate space on the C heap.  We can't call runtime·malloc
 	// here because it could cause a deadlock.
-	m->libcall.fn = (void*)libc·malloc;
-	m->libcall.n = 1;
-	runtime·memclr((byte*)&m->scratch, sizeof(m->scratch));
-	m->scratch.v[0] = (uintptr)sizeof(*sem);
-	m->libcall.args = (uintptr*)&m->scratch;
-	runtime·asmcgocall(runtime·asmsysvicall6, &m->libcall);
-	sem = (void*)m->libcall.r1;
+	g->m->libcall.fn = (void*)libc·malloc;
+	g->m->libcall.n = 1;
+	runtime·memclr((byte*)&g->m->scratch, sizeof(g->m->scratch));
+	g->m->scratch.v[0] = (uintptr)sizeof(*sem);
+	g->m->libcall.args = (uintptr*)&g->m->scratch;
+	runtime·asmcgocall(runtime·asmsysvicall6, &g->m->libcall);
+	sem = (void*)g->m->libcall.r1;
 	if(runtime·sem_init(sem, 0, 0) != 0)
 		runtime·throw("sem_init");
 	return (uintptr)sem;
@@ -353,6 +354,9 @@
 int32
 runtime·semasleep(int64 ns)
 {
+	M *m;
+
+	m = g->m;
 	if(ns >= 0) {
 		m->ts.tv_sec = ns / 1000000000LL;
 		m->ts.tv_nsec = ns % 1000000000LL;
diff --git a/src/pkg/runtime/os_windows.c b/src/pkg/runtime/os_windows.c
index 0dd44ed..159af04 100644
--- a/src/pkg/runtime/os_windows.c
+++ b/src/pkg/runtime/os_windows.c
@@ -202,7 +202,7 @@
 		if(ns == 0)
 			ns = 1;
 	}
-	if(runtime·stdcall(runtime·WaitForSingleObject, 2, m->waitsema, (uintptr)ns) != 0)
+	if(runtime·stdcall(runtime·WaitForSingleObject, 2, g->m->waitsema, (uintptr)ns) != 0)
 		return -1;  // timeout
 	return 0;
 }
@@ -256,7 +256,7 @@
 	runtime·stdcall(runtime·DuplicateHandle, 7,
 		(uintptr)-1, (uintptr)-2, (uintptr)-1, &thandle,
 		(uintptr)0, (uintptr)0, (uintptr)DUPLICATE_SAME_ACCESS);
-	runtime·atomicstorep(&m->thread, thandle);
+	runtime·atomicstorep(&g->m->thread, thandle);
 }
 
 // Called from dropm to undo the effect of an minit.
@@ -295,20 +295,20 @@
 void *
 runtime·stdcall(void *fn, int32 count, ...)
 {
-	m->libcall.fn = fn;
-	m->libcall.n = count;
-	m->libcall.args = (uintptr*)&count + 1;
-	if(m->profilehz != 0) {
+	g->m->libcall.fn = fn;
+	g->m->libcall.n = count;
+	g->m->libcall.args = (uintptr*)&count + 1;
+	if(g->m->profilehz != 0) {
 		// leave pc/sp for cpu profiler
-		m->libcallg = g;
-		m->libcallpc = (uintptr)runtime·getcallerpc(&fn);
+		g->m->libcallg = g;
+		g->m->libcallpc = (uintptr)runtime·getcallerpc(&fn);
 		// sp must be the last, because once async cpu profiler finds
 		// all three values to be non-zero, it will use them
-		m->libcallsp = (uintptr)runtime·getcallersp(&fn);
+		g->m->libcallsp = (uintptr)runtime·getcallersp(&fn);
 	}
-	runtime·asmcgocall(runtime·asmstdcall, &m->libcall);
-	m->libcallsp = 0;
-	return (void*)m->libcall.r1;
+	runtime·asmcgocall(runtime·asmstdcall, &g->m->libcall);
+	g->m->libcallsp = 0;
+	return (void*)g->m->libcall.r1;
 }
 
 extern void runtime·usleep1(uint32);
@@ -484,7 +484,7 @@
 	}
 	runtime·stdcall(runtime·SetWaitableTimer, 6,
 		profiletimer, &due, (uintptr)ms, nil, nil, nil);
-	runtime·atomicstore((uint32*)&m->profilehz, hz);
+	runtime·atomicstore((uint32*)&g->m->profilehz, hz);
 }
 
 void
diff --git a/src/pkg/runtime/os_windows_386.c b/src/pkg/runtime/os_windows_386.c
index c36a001..7ee7ded 100644
--- a/src/pkg/runtime/os_windows_386.c
+++ b/src/pkg/runtime/os_windows_386.c
@@ -100,9 +100,9 @@
 		info->ExceptionInformation[0], info->ExceptionInformation[1], r->Eip);
 
 	runtime·printf("PC=%x\n", r->Eip);
-	if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) {
+	if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) {
 		runtime·printf("signal arrived during cgo execution\n");
-		gp = m->lockedg;
+		gp = g->m->lockedg;
 	}
 	runtime·printf("\n");
 
diff --git a/src/pkg/runtime/os_windows_amd64.c b/src/pkg/runtime/os_windows_amd64.c
index 7fb973c..a7acf1d 100644
--- a/src/pkg/runtime/os_windows_amd64.c
+++ b/src/pkg/runtime/os_windows_amd64.c
@@ -106,9 +106,9 @@
 
 
 	runtime·printf("PC=%X\n", r->Rip);
-	if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) {
+	if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) {
 		runtime·printf("signal arrived during cgo execution\n");
-		gp = m->lockedg;
+		gp = g->m->lockedg;
 	}
 	runtime·printf("\n");
 
diff --git a/src/pkg/runtime/panic.c b/src/pkg/runtime/panic.c
index f577b37..8225df7 100644
--- a/src/pkg/runtime/panic.c
+++ b/src/pkg/runtime/panic.c
@@ -34,7 +34,7 @@
 	d = nil;
 	sc = DEFERCLASS(siz);
 	if(sc < nelem(p->deferpool)) {
-		p = m->p;
+		p = g->m->p;
 		d = p->deferpool[sc];
 		if(d)
 			p->deferpool[sc] = d->link;
@@ -63,7 +63,7 @@
 		return;
 	sc = DEFERCLASS(d->siz);
 	if(sc < nelem(p->deferpool)) {
-		p = m->p;
+		p = g->m->p;
 		d->link = p->deferpool[sc];
 		p->deferpool[sc] = d;
 		// No need to wipe out pointers in argp/pc/fn/args,
@@ -134,13 +134,13 @@
 	// Do not allow preemption here, because the garbage collector
 	// won't know the form of the arguments until the jmpdefer can
 	// flip the PC over to fn.
-	m->locks++;
+	g->m->locks++;
 	runtime·memmove(argp, d->args, d->siz);
 	fn = d->fn;
 	g->defer = d->link;
 	freedefer(d);
-	m->locks--;
-	if(m->locks == 0 && g->preempt)
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)
 		g->stackguard0 = StackPreempt;
 	runtime·jmpdefer(fn, argp);
 }
@@ -385,12 +385,12 @@
 {
 	if(runtime·mheap.cachealloc.size == 0) { // very early
 		runtime·printf("runtime: panic before malloc heap initialized\n");
-		m->mallocing = 1; // tell rest of panic not to try to malloc
-	} else if(m->mcache == nil) // can happen if called from signal handler or throw
-		m->mcache = runtime·allocmcache();
-	switch(m->dying) {
+		g->m->mallocing = 1; // tell rest of panic not to try to malloc
+	} else if(g->m->mcache == nil) // can happen if called from signal handler or throw
+		g->m->mcache = runtime·allocmcache();
+	switch(g->m->dying) {
 	case 0:
-		m->dying = 1;
+		g->m->dying = 1;
 		if(g != nil)
 			g->writebuf = nil;
 		runtime·xadd(&runtime·panicking, 1);
@@ -402,14 +402,14 @@
 	case 1:
 		// Something failed while panicing, probably the print of the
 		// argument to panic().  Just print a stack trace and exit.
-		m->dying = 2;
+		g->m->dying = 2;
 		runtime·printf("panic during panic\n");
 		runtime·dopanic(0);
 		runtime·exit(3);
 	case 2:
 		// This is a genuine bug in the runtime, we couldn't even
 		// print the stack trace successfully.
-		m->dying = 3;
+		g->m->dying = 3;
 		runtime·printf("stack trace unavailable\n");
 		runtime·exit(4);
 	default:
@@ -430,11 +430,11 @@
 			g->sig, g->sigcode0, g->sigcode1, g->sigpc);
 
 	if((t = runtime·gotraceback(&crash)) > 0){
-		if(g != m->g0) {
+		if(g != g->m->g0) {
 			runtime·printf("\n");
 			runtime·goroutineheader(g);
 			runtime·traceback((uintptr)runtime·getcallerpc(&unused), (uintptr)runtime·getcallersp(&unused), 0, g);
-		} else if(t >= 2 || m->throwing > 0) {
+		} else if(t >= 2 || g->m->throwing > 0) {
 			runtime·printf("\nruntime stack:\n");
 			runtime·traceback((uintptr)runtime·getcallerpc(&unused), (uintptr)runtime·getcallersp(&unused), 0, g);
 		}
@@ -489,9 +489,12 @@
 bool
 runtime·canpanic(G *gp)
 {
-	byte g;
+	M *m;
 
-	USED(&g);  // don't use global g, it points to gsignal
+	// Note that g is m->gsignal, different from gp.
+	// Note also that g->m can change at preemption, so m can go stale
+	// if this function ever makes a function call.
+	m = g->m;
 
 	// Is it okay for gp to panic instead of crashing the program?
 	// Yes, as long as it is running Go code, not runtime code,
@@ -512,8 +515,8 @@
 void
 runtime·throw(int8 *s)
 {
-	if(m->throwing == 0)
-		m->throwing = 1;
+	if(g->m->throwing == 0)
+		g->m->throwing = 1;
 	runtime·startpanic();
 	runtime·printf("fatal error: %s\n", s);
 	runtime·dopanic(0);
@@ -531,20 +534,20 @@
 	// It increments m->locks to avoid preemption.
 	// If we're panicking, the software floating point frames
 	// will be unwound, so decrement m->locks as they would.
-	if(m->softfloat) {
-		m->locks--;
-		m->softfloat = 0;
+	if(g->m->softfloat) {
+		g->m->locks--;
+		g->m->softfloat = 0;
 	}
 
-	if(m->mallocing) {
+	if(g->m->mallocing) {
 		runtime·printf("panic: %s\n", s);
 		runtime·throw("panic during malloc");
 	}
-	if(m->gcing) {
+	if(g->m->gcing) {
 		runtime·printf("panic: %s\n", s);
 		runtime·throw("panic during gc");
 	}
-	if(m->locks) {
+	if(g->m->locks) {
 		runtime·printf("panic: %s\n", s);
 		runtime·throw("panic holding locks");
 	}
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index b812672..7467e9f 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -153,7 +153,7 @@
 
 	runtime·symtabinit();
 	runtime·mallocinit();
-	mcommoninit(m);
+	mcommoninit(g->m);
 	
 	// Initialize the itable value for newErrorCString,
 	// so that the next time it gets called, possibly
@@ -236,7 +236,7 @@
 	d.special = true;
 	g->defer = &d;
 
-	if(m != &runtime·m0)
+	if(g->m != &runtime·m0)
 		runtime·throw("runtime·main not on m0");
 	runtime·newproc1(&scavenger, nil, 0, 0, runtime·main);
 	main·init();
@@ -313,7 +313,7 @@
 	traceback = runtime·gotraceback(nil);
 	
 	// Show the current goroutine first, if we haven't already.
-	if((gp = m->curg) != nil && gp != me) {
+	if((gp = g->m->curg) != nil && gp != me) {
 		runtime·printf("\n");
 		runtime·goroutineheader(gp);
 		runtime·traceback(~(uintptr)0, ~(uintptr)0, 0, gp);
@@ -322,7 +322,7 @@
 	runtime·lock(&allglock);
 	for(i = 0; i < runtime·allglen; i++) {
 		gp = runtime·allg[i];
-		if(gp == me || gp == m->curg || gp->status == Gdead)
+		if(gp == me || gp == g->m->curg || gp->status == Gdead)
 			continue;
 		if(gp->issystem && traceback < 2)
 			continue;
@@ -352,7 +352,7 @@
 {
 	// If there is no mcache runtime·callers() will crash,
 	// and we are most likely in sysmon thread so the stack is senseless anyway.
-	if(m->mcache)
+	if(g->m->mcache)
 		runtime·callers(1, mp->createstack, nelem(mp->createstack));
 
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
@@ -362,7 +362,7 @@
 	checkmcount();
 	runtime·mpreinit(mp);
 
-	// Add to runtime·allm so garbage collector doesn't free m
+	// Add to runtime·allm so garbage collector doesn't free mp
 	// when it is just in a register or thread-local storage.
 	mp->alllink = runtime·allm;
 	// runtime·NumCgoCall() iterates over allm w/o schedlock,
@@ -376,17 +376,17 @@
 runtime·ready(G *gp)
 {
 	// Mark runnable.
-	m->locks++;  // disable preemption because it can be holding p in a local var
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
 	if(gp->status != Gwaiting) {
 		runtime·printf("goroutine %D has status %d\n", gp->goid, gp->status);
 		runtime·throw("bad g->status in ready");
 	}
 	gp->status = Grunnable;
-	runqput(m->p, gp);
+	runqput(g->m->p, gp);
 	if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0)  // TODO: fast atomic
 		wakep();
-	m->locks--;
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 }
 
@@ -434,7 +434,7 @@
 	runtime·lock(&runtime·sched);
 	pos = 0;
 	for(n = 1; n < nproc; n++) {  // one M is currently running
-		if(runtime·allp[pos]->mcache == m->mcache)
+		if(runtime·allp[pos]->mcache == g->m->mcache)
 			pos++;
 		mp = mget();
 		if(mp == nil)
@@ -488,7 +488,7 @@
 	runtime·atomicstore((uint32*)&runtime·sched.gcwaiting, 1);
 	preemptall();
 	// stop current P
-	m->p->status = Pgcstop;
+	g->m->p->status = Pgcstop;
 	runtime·sched.stopwait--;
 	// try to retake all P's in Psyscall status
 	for(i = 0; i < runtime·gomaxprocs; i++) {
@@ -528,7 +528,7 @@
 static void
 mhelpgc(void)
 {
-	m->helpgc = -1;
+	g->m->helpgc = -1;
 }
 
 void
@@ -539,7 +539,7 @@
 	G *gp;
 	bool add;
 
-	m->locks++;  // disable preemption because it can be holding p in a local var
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
 	gp = runtime·netpoll(false);  // non-blocking
 	injectglist(gp);
 	add = needaddgcproc();
@@ -596,8 +596,8 @@
 		// the maximum number of procs.
 		newm(mhelpgc, nil);
 	}
-	m->locks--;
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 }
 
@@ -605,32 +605,32 @@
 void
 runtime·mstart(void)
 {
-	if(g != m->g0)
+	if(g != g->m->g0)
 		runtime·throw("bad runtime·mstart");
 
 	// Record top of stack for use by mcall.
 	// Once we call schedule we're never coming back,
 	// so other calls can reuse this stack space.
-	runtime·gosave(&m->g0->sched);
-	m->g0->sched.pc = (uintptr)-1;  // make sure it is never used
-	m->g0->stackguard = m->g0->stackguard0;  // cgo sets only stackguard0, copy it to stackguard
+	runtime·gosave(&g->m->g0->sched);
+	g->m->g0->sched.pc = (uintptr)-1;  // make sure it is never used
+	g->m->g0->stackguard = g->m->g0->stackguard0;  // cgo sets only stackguard0, copy it to stackguard
 	runtime·asminit();
 	runtime·minit();
 
 	// Install signal handlers; after minit so that minit can
 	// prepare the thread to be able to handle the signals.
-	if(m == &runtime·m0)
+	if(g->m == &runtime·m0)
 		runtime·initsig();
 	
-	if(m->mstartfn)
-		m->mstartfn();
+	if(g->m->mstartfn)
+		g->m->mstartfn();
 
-	if(m->helpgc) {
-		m->helpgc = 0;
+	if(g->m->helpgc) {
+		g->m->helpgc = 0;
 		stopm();
-	} else if(m != &runtime·m0) {
-		acquirep(m->nextp);
-		m->nextp = nil;
+	} else if(g->m != &runtime·m0) {
+		acquirep(g->m->nextp);
+		g->m->nextp = nil;
 	}
 	schedule();
 
@@ -647,7 +647,6 @@
 typedef struct CgoThreadStart CgoThreadStart;
 struct CgoThreadStart
 {
-	M *m;
 	G *g;
 	uintptr *tls;
 	void (*fn)(void);
@@ -661,8 +660,8 @@
 	M *mp;
 	static Type *mtype;  // The Go type M
 
-	m->locks++;  // disable GC because it can be called from sysmon
-	if(m->p == nil)
+	g->m->locks++;  // disable GC because it can be called from sysmon
+	if(g->m->p == nil)
 		acquirep(p);  // temporarily borrow p for mallocs in this function
 	if(mtype == nil) {
 		Eface e;
@@ -679,11 +678,12 @@
 		mp->g0 = runtime·malg(-1);
 	else
 		mp->g0 = runtime·malg(8192);
+	mp->g0->m = mp;
 
-	if(p == m->p)
+	if(p == g->m->p)
 		releasep();
-	m->locks--;
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 
 	return mp;
@@ -770,12 +770,12 @@
 	mp->needextram = mp->schedlink == nil;
 	unlockextra(mp->schedlink);
 
-	// Install m and g (= m->g0) and set the stack bounds
+	// Install g (= m->g0) and set the stack bounds
 	// to match the current stack. We don't actually know
 	// how big the stack is, like we don't know how big any
 	// scheduling stack is, but we assume there's at least 32 kB,
 	// which is more than enough for us.
-	runtime·setmg(mp, mp->g0);
+	runtime·setg(mp->g0);
 	g->stackbase = (uintptr)(&x + 1024);
 	g->stackguard = (uintptr)(&x - 32*1024);
 	g->stackguard0 = g->stackguard;
@@ -810,6 +810,7 @@
 	gp->syscallstack = gp->stackbase;
 	gp->syscallguard = gp->stackguard;
 	gp->status = Gsyscall;
+	gp->m = mp;
 	mp->curg = gp;
 	mp->locked = LockInternal;
 	mp->lockedg = gp;
@@ -859,8 +860,8 @@
 
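-	// Clear m and g, and return m to the extra list.
-	// After the call to setmg we can only call nosplit functions.
+	// Clear g, and return m to the extra list.
+	// After the call to setg we can only call nosplit functions.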
-	mp = m;
-	runtime·setmg(nil, nil);
+	mp = g->m;
+	runtime·setg(nil);
 
 	mnext = lockextra(true);
 	mp->schedlink = mnext;
@@ -925,7 +926,6 @@
 
 		if(_cgo_thread_start == nil)
 			runtime·throw("_cgo_thread_start missing");
-		ts.m = mp;
 		ts.g = mp->g0;
 		ts.tls = mp->tls;
 		ts.fn = runtime·mstart;
@@ -940,35 +940,35 @@
 static void
 stopm(void)
 {
-	if(m->locks)
+	if(g->m->locks)
 		runtime·throw("stopm holding locks");
-	if(m->p)
+	if(g->m->p)
 		runtime·throw("stopm holding p");
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		runtime·xadd(&runtime·sched.nmspinning, -1);
 	}
 
 retry:
 	runtime·lock(&runtime·sched);
-	mput(m);
+	mput(g->m);
 	runtime·unlock(&runtime·sched);
-	runtime·notesleep(&m->park);
-	runtime·noteclear(&m->park);
-	if(m->helpgc) {
+	runtime·notesleep(&g->m->park);
+	runtime·noteclear(&g->m->park);
+	if(g->m->helpgc) {
 		runtime·gchelper();
-		m->helpgc = 0;
-		m->mcache = nil;
+		g->m->helpgc = 0;
+		g->m->mcache = nil;
 		goto retry;
 	}
-	acquirep(m->nextp);
-	m->nextp = nil;
+	acquirep(g->m->nextp);
+	g->m->nextp = nil;
 }
 
 static void
 mspinning(void)
 {
-	m->spinning = true;
+	g->m->spinning = true;
 }
 
 // Schedules some M to run the p (creates an M if necessary).
@@ -1065,21 +1065,21 @@
 {
 	P *p;
 
-	if(m->lockedg == nil || m->lockedg->lockedm != m)
+	if(g->m->lockedg == nil || g->m->lockedg->lockedm != g->m)
 		runtime·throw("stoplockedm: inconsistent locking");
-	if(m->p) {
+	if(g->m->p) {
 		// Schedule another M to run this p.
 		p = releasep();
 		handoffp(p);
 	}
 	incidlelocked(1);
 	// Wait until another thread schedules lockedg again.
-	runtime·notesleep(&m->park);
-	runtime·noteclear(&m->park);
-	if(m->lockedg->status != Grunnable)
+	runtime·notesleep(&g->m->park);
+	runtime·noteclear(&g->m->park);
+	if(g->m->lockedg->status != Grunnable)
 		runtime·throw("stoplockedm: not runnable");
-	acquirep(m->nextp);
-	m->nextp = nil;
+	acquirep(g->m->nextp);
+	g->m->nextp = nil;
 }
 
 // Schedules the locked m to run the locked gp.
@@ -1090,7 +1090,7 @@
 	P *p;
 
 	mp = gp->lockedm;
-	if(mp == m)
+	if(mp == g->m)
 		runtime·throw("startlockedm: locked to me");
 	if(mp->nextp)
 		runtime·throw("startlockedm: m has p");
@@ -1111,8 +1111,8 @@
 
 	if(!runtime·sched.gcwaiting)
 		runtime·throw("gcstopm: not waiting for gc");
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		runtime·xadd(&runtime·sched.nmspinning, -1);
 	}
 	p = releasep();
@@ -1139,13 +1139,13 @@
 	gp->waitsince = 0;
 	gp->preempt = false;
 	gp->stackguard0 = gp->stackguard;
-	m->p->schedtick++;
-	m->curg = gp;
-	gp->m = m;
+	g->m->p->schedtick++;
+	g->m->curg = gp;
+	gp->m = g->m;
 
 	// Check whether the profiler needs to be turned on or off.
 	hz = runtime·sched.profilehz;
-	if(m->profilehz != hz)
+	if(g->m->profilehz != hz)
 		runtime·resetcpuprofiler(hz);
 
 	runtime·gogo(&gp->sched);
@@ -1168,13 +1168,13 @@
 	if(runtime·fingwait && runtime·fingwake && (gp = runtime·wakefing()) != nil)
 		runtime·ready(gp);
 	// local runq
-	gp = runqget(m->p);
+	gp = runqget(g->m->p);
 	if(gp)
 		return gp;
 	// global runq
 	if(runtime·sched.runqsize) {
 		runtime·lock(&runtime·sched);
-		gp = globrunqget(m->p, 0);
+		gp = globrunqget(g->m->p, 0);
 		runtime·unlock(&runtime·sched);
 		if(gp)
 			return gp;
@@ -1189,10 +1189,10 @@
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if(!m->spinning && 2 * runtime·atomicload(&runtime·sched.nmspinning) >= runtime·gomaxprocs - runtime·atomicload(&runtime·sched.npidle))  // TODO: fast atomic
+	if(!g->m->spinning && 2 * runtime·atomicload(&runtime·sched.nmspinning) >= runtime·gomaxprocs - runtime·atomicload(&runtime·sched.npidle))  // TODO: fast atomic
 		goto stop;
-	if(!m->spinning) {
-		m->spinning = true;
+	if(!g->m->spinning) {
+		g->m->spinning = true;
 		runtime·xadd(&runtime·sched.nmspinning, 1);
 	}
 	// random steal from other P's
@@ -1200,10 +1200,10 @@
 		if(runtime·sched.gcwaiting)
 			goto top;
 		p = runtime·allp[runtime·fastrand1()%runtime·gomaxprocs];
-		if(p == m->p)
+		if(p == g->m->p)
 			gp = runqget(p);
 		else
-			gp = runqsteal(m->p, p);
+			gp = runqsteal(g->m->p, p);
 		if(gp)
 			return gp;
 	}
@@ -1215,15 +1215,15 @@
 		goto top;
 	}
 	if(runtime·sched.runqsize) {
-		gp = globrunqget(m->p, 0);
+		gp = globrunqget(g->m->p, 0);
 		runtime·unlock(&runtime·sched);
 		return gp;
 	}
 	p = releasep();
 	pidleput(p);
 	runtime·unlock(&runtime·sched);
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		runtime·xadd(&runtime·sched.nmspinning, -1);
 	}
 	// check all runqueues once again
@@ -1242,9 +1242,9 @@
 	}
 	// poll network
 	if(runtime·xchg64(&runtime·sched.lastpoll, 0) != 0) {
-		if(m->p)
+		if(g->m->p)
 			runtime·throw("findrunnable: netpoll with p");
-		if(m->spinning)
+		if(g->m->spinning)
 			runtime·throw("findrunnable: netpoll with spinning");
 		gp = runtime·netpoll(true);  // block until new work is available
 		runtime·atomicstore64(&runtime·sched.lastpoll, runtime·nanotime());
@@ -1270,8 +1270,8 @@
 {
 	int32 nmspinning;
 
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		nmspinning = runtime·xadd(&runtime·sched.nmspinning, -1);
 		if(nmspinning < 0)
 			runtime·throw("findrunnable: negative nmspinning");
@@ -1315,7 +1315,7 @@
 	G *gp;
 	uint32 tick;
 
-	if(m->locks)
+	if(g->m->locks)
 		runtime·throw("schedule: holding locks");
 
 top:
@@ -1328,19 +1328,19 @@
 	// Check the global runnable queue once in a while to ensure fairness.
 	// Otherwise two goroutines can completely occupy the local runqueue
 	// by constantly respawning each other.
-	tick = m->p->schedtick;
+	tick = g->m->p->schedtick;
 	// This is a fancy way to say tick%61==0,
 	// it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
 	if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime·sched.runqsize > 0) {
 		runtime·lock(&runtime·sched);
-		gp = globrunqget(m->p, 1);
+		gp = globrunqget(g->m->p, 1);
 		runtime·unlock(&runtime·sched);
 		if(gp)
 			resetspinning();
 	}
 	if(gp == nil) {
-		gp = runqget(m->p);
-		if(gp && m->spinning)
+		gp = runqget(g->m->p);
+		if(gp && g->m->spinning)
 			runtime·throw("schedule: spinning with local work");
 	}
 	if(gp == nil) {
@@ -1365,8 +1365,8 @@
 {
 	if(g->status != Grunning)
 		runtime·throw("bad g status");
-	m->waitlock = lock;
-	m->waitunlockf = unlockf;
+	g->m->waitlock = lock;
+	g->m->waitunlockf = unlockf;
 	g->waitreason = reason;
 	runtime·mcall(park0);
 }
@@ -1395,17 +1395,17 @@
 
 	gp->status = Gwaiting;
 	gp->m = nil;
-	m->curg = nil;
-	if(m->waitunlockf) {
-		ok = m->waitunlockf(gp, m->waitlock);
-		m->waitunlockf = nil;
-		m->waitlock = nil;
+	g->m->curg = nil;
+	if(g->m->waitunlockf) {
+		ok = g->m->waitunlockf(gp, g->m->waitlock);
+		g->m->waitunlockf = nil;
+		g->m->waitlock = nil;
 		if(!ok) {
 			gp->status = Grunnable;
 			execute(gp);  // Schedule it back, never returns.
 		}
 	}
-	if(m->lockedg) {
+	if(g->m->lockedg) {
 		stoplockedm();
 		execute(gp);  // Never returns.
 	}
@@ -1427,11 +1427,11 @@
 {
 	gp->status = Grunnable;
 	gp->m = nil;
-	m->curg = nil;
+	g->m->curg = nil;
 	runtime·lock(&runtime·sched);
 	globrunqput(gp);
 	runtime·unlock(&runtime·sched);
-	if(m->lockedg) {
+	if(g->m->lockedg) {
 		stoplockedm();
 		execute(gp);  // Never returns.
 	}
@@ -1467,15 +1467,15 @@
 	gp->writebuf = nil;
 	gp->waitreason = nil;
 	gp->param = nil;
-	m->curg = nil;
-	m->lockedg = nil;
-	if(m->locked & ~LockExternal) {
-		runtime·printf("invalid m->locked = %d\n", m->locked);
+	g->m->curg = nil;
+	g->m->lockedg = nil;
+	if(g->m->locked & ~LockExternal) {
+		runtime·printf("invalid m->locked = %d\n", g->m->locked);
 		runtime·throw("internal lockOSThread error");
 	}	
-	m->locked = 0;
+	g->m->locked = 0;
 	runtime·unwindstack(gp, nil);
-	gfput(m->p, gp);
+	gfput(g->m->p, gp);
 	schedule();
 }
 
@@ -1505,7 +1505,7 @@
 {
 	// Disable preemption because during this function g is in Gsyscall status,
 	// but can have inconsistent g->sched, do not let GC observe it.
-	m->locks++;
+	g->m->locks++;
 
 	// Leave SP around for GC and traceback.
 	save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy));
@@ -1530,12 +1530,12 @@
 		save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy));
 	}
 
-	m->mcache = nil;
-	m->p->m = nil;
-	runtime·atomicstore(&m->p->status, Psyscall);
+	g->m->mcache = nil;
+	g->m->p->m = nil;
+	runtime·atomicstore(&g->m->p->status, Psyscall);
 	if(runtime·sched.gcwaiting) {
 		runtime·lock(&runtime·sched);
-		if (runtime·sched.stopwait > 0 && runtime·cas(&m->p->status, Psyscall, Pgcstop)) {
+		if (runtime·sched.stopwait > 0 && runtime·cas(&g->m->p->status, Psyscall, Pgcstop)) {
 			if(--runtime·sched.stopwait == 0)
 				runtime·notewakeup(&runtime·sched.stopnote);
 		}
@@ -1547,7 +1547,7 @@
 	// We set stackguard to StackPreempt so that first split stack check calls morestack.
 	// Morestack detects this case and throws.
 	g->stackguard0 = StackPreempt;
-	m->locks--;
+	g->m->locks--;
 }
 
 // The same as runtime·entersyscall(), but with a hint that the syscall is blocking.
@@ -1557,7 +1557,7 @@
 {
 	P *p;
 
-	m->locks++;  // see comment in entersyscall
+	g->m->locks++;  // see comment in entersyscall
 
 	// Leave SP around for GC and traceback.
 	save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy));
@@ -1581,7 +1581,7 @@
 	save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy));
 
 	g->stackguard0 = StackPreempt;  // see comment in entersyscall
-	m->locks--;
+	g->m->locks--;
 }
 
 // The goroutine g exited its system call.
@@ -1592,7 +1592,7 @@
 void
 runtime·exitsyscall(void)
 {
-	m->locks++;  // see comment in entersyscall
+	g->m->locks++;  // see comment in entersyscall
 
 	if(g->isbackground)  // do not consider blocked scavenger for deadlock detection
 		incidlelocked(-1);
@@ -1600,13 +1600,13 @@
 	g->waitsince = 0;
 	if(exitsyscallfast()) {
 		// There's a cpu for us, so we can run.
-		m->p->syscalltick++;
+		g->m->p->syscalltick++;
 		g->status = Grunning;
 		// Garbage collector isn't running (since we are),
 		// so okay to clear gcstack and gcsp.
 		g->syscallstack = (uintptr)nil;
 		g->syscallsp = (uintptr)nil;
-		m->locks--;
+		g->m->locks--;
 		if(g->preempt) {
 			// restore the preemption request in case we've cleared it in newstack
 			g->stackguard0 = StackPreempt;
@@ -1617,7 +1617,7 @@
 		return;
 	}
 
-	m->locks--;
+	g->m->locks--;
 
 	// Call the scheduler.
 	runtime·mcall(exitsyscall0);
@@ -1630,7 +1630,7 @@
 	// is not running.
 	g->syscallstack = (uintptr)nil;
 	g->syscallsp = (uintptr)nil;
-	m->p->syscalltick++;
+	g->m->p->syscalltick++;
 }
 
 #pragma textflag NOSPLIT
@@ -1641,19 +1641,19 @@
 
 	// Freezetheworld sets stopwait but does not retake P's.
 	if(runtime·sched.stopwait) {
-		m->p = nil;
+		g->m->p = nil;
 		return false;
 	}
 
 	// Try to re-acquire the last P.
-	if(m->p && m->p->status == Psyscall && runtime·cas(&m->p->status, Psyscall, Prunning)) {
+	if(g->m->p && g->m->p->status == Psyscall && runtime·cas(&g->m->p->status, Psyscall, Prunning)) {
 		// There's a cpu for us, so we can run.
-		m->mcache = m->p->mcache;
-		m->p->m = m;
+		g->m->mcache = g->m->p->mcache;
+		g->m->p->m = g->m;
 		return true;
 	}
 	// Try to get any other idle P.
-	m->p = nil;
+	g->m->p = nil;
 	if(runtime·sched.pidle) {
 		runtime·lock(&runtime·sched);
 		p = pidleget();
@@ -1679,7 +1679,7 @@
 
 	gp->status = Grunnable;
 	gp->m = nil;
-	m->curg = nil;
+	g->m->curg = nil;
 	runtime·lock(&runtime·sched);
 	p = pidleget();
 	if(p == nil)
@@ -1693,7 +1693,7 @@
 		acquirep(p);
 		execute(gp);  // Never returns.
 	}
-	if(m->lockedg) {
+	if(g->m->lockedg) {
 		// Wait until another thread schedules gp and so m again.
 		stoplockedm();
 		execute(gp);  // Never returns.
@@ -1709,15 +1709,15 @@
 {
 	// Fork can hang if preempted with signals frequently enough (see issue 5517).
 	// Ensure that we stay on the same M where we disable profiling.
-	m->locks++;
-	if(m->profilehz != 0)
+	g->m->locks++;
+	if(g->m->profilehz != 0)
 		runtime·resetcpuprofiler(0);
 
 	// This function is called before fork in syscall package.
 	// Code between fork and exec must not allocate memory nor even try to grow stack.
 	// Here we spoil g->stackguard to reliably detect any attempts to grow stack.
 	// runtime_AfterFork will undo this in parent process, but not in child.
-	m->forkstackguard = g->stackguard;
+	g->m->forkstackguard = g->stackguard;
 	g->stackguard0 = StackPreempt-1;
 	g->stackguard = StackPreempt-1;
 }
@@ -1730,14 +1730,14 @@
 	int32 hz;
 
 	// See the comment in runtime_BeforeFork.
-	g->stackguard0 = m->forkstackguard;
-	g->stackguard = m->forkstackguard;
-	m->forkstackguard = 0;
+	g->stackguard0 = g->m->forkstackguard;
+	g->stackguard = g->m->forkstackguard;
+	g->m->forkstackguard = 0;
 
 	hz = runtime·sched.profilehz;
 	if(hz != 0)
 		runtime·resetcpuprofiler(hz);
-	m->locks--;
+	g->m->locks--;
 }
 
 // Hook used by runtime·malg to call runtime·stackalloc on the
@@ -1772,7 +1772,7 @@
 	newg = allocg();
 	if(stacksize >= 0) {
 		stacksize = runtime·round2(StackSystem + stacksize);
-		if(g == m->g0) {
+		if(g == g->m->g0) {
 			// running on scheduler stack already.
 			stk = runtime·stackalloc(newg, stacksize);
 		} else {
@@ -1825,10 +1825,10 @@
 
 //runtime·printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
 	if(fn == nil) {
-		m->throwing = -1;  // do not dump full stacks
+		g->m->throwing = -1;  // do not dump full stacks
 		runtime·throw("go of nil func value");
 	}
-	m->locks++;  // disable preemption because it can be holding p in a local var
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
 	siz = narg + nret;
 	siz = (siz+7) & ~7;
 
@@ -1839,7 +1839,7 @@
 	if(siz > StackMin - 1024)
 		runtime·throw("runtime.newproc: function arguments too large for new goroutine");
 
-	p = m->p;
+	p = g->m->p;
 	if((newg = gfget(p)) != nil) {
 		if(newg->stackguard - StackGuard != newg->stack0)
 			runtime·throw("invalid stack in newg");
@@ -1876,8 +1876,8 @@
 
 	if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0 && fn->fn != runtime·main)  // TODO: fast atomic
 		wakep();
-	m->locks--;
-	if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+	g->m->locks--;
+	if(g->m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
 		g->stackguard0 = StackPreempt;
 	return newg;
 }
@@ -1976,7 +1976,7 @@
 
 		if(gp->stack0 == 0) {
 			// Stack was deallocated in gfput.  Allocate a new one.
-			if(g == m->g0) {
+			if(g == g->m->g0) {
 				stk = runtime·stackalloc(gp, FixedStack);
 			} else {
 				gp->stacksize = FixedStack;
@@ -2041,10 +2041,10 @@
 	runtime·unlock(&runtime·sched);
 
 	runtime·semacquire(&runtime·worldsema, false);
-	m->gcing = 1;
+	g->m->gcing = 1;
 	runtime·stoptheworld();
 	newprocs = n;
-	m->gcing = 0;
+	g->m->gcing = 0;
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
 
@@ -2058,21 +2058,21 @@
 static void
 lockOSThread(void)
 {
-	m->lockedg = g;
-	g->lockedm = m;
+	g->m->lockedg = g;
+	g->lockedm = g->m;
 }
 
 void
 runtime·LockOSThread(void)
 {
-	m->locked |= LockExternal;
+	g->m->locked |= LockExternal;
 	lockOSThread();
 }
 
 void
 runtime·lockOSThread(void)
 {
-	m->locked += LockInternal;
+	g->m->locked += LockInternal;
 	lockOSThread();
 }
 
@@ -2084,32 +2084,32 @@
 static void
 unlockOSThread(void)
 {
-	if(m->locked != 0)
+	if(g->m->locked != 0)
 		return;
-	m->lockedg = nil;
+	g->m->lockedg = nil;
 	g->lockedm = nil;
 }
 
 void
 runtime·UnlockOSThread(void)
 {
-	m->locked &= ~LockExternal;
+	g->m->locked &= ~LockExternal;
 	unlockOSThread();
 }
 
 void
 runtime·unlockOSThread(void)
 {
-	if(m->locked < LockInternal)
+	if(g->m->locked < LockInternal)
 		runtime·throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
-	m->locked -= LockInternal;
+	g->m->locked -= LockInternal;
 	unlockOSThread();
 }
 
 bool
 runtime·lockedOSThread(void)
 {
-	return g->lockedm != nil && m->lockedg != nil;
+	return g->lockedm != nil && g->m->lockedg != nil;
 }
 
 int32
@@ -2329,7 +2329,7 @@
 
 	// Disable preemption, otherwise we can be rescheduled to another thread
 	// that has profiling enabled.
-	m->locks++;
+	g->m->locks++;
 
 	// Stop profiler on this thread so that it is safe to lock prof.
 	// if a profiling signal came in while we had prof locked,
@@ -2347,7 +2347,7 @@
 	if(hz != 0)
 		runtime·resetcpuprofiler(hz);
 
-	m->locks--;
+	g->m->locks--;
 }
 
 // Change number of processors.  The world is stopped, sched is locked.
@@ -2373,7 +2373,7 @@
 		}
 		if(p->mcache == nil) {
 			if(old==0 && i==0)
-				p->mcache = m->mcache;  // bootstrap
+				p->mcache = g->m->mcache;  // bootstrap
 			else
 				p->mcache = runtime·allocmcache();
 		}
@@ -2424,10 +2424,10 @@
 		// can't free P itself because it can be referenced by an M in syscall
 	}
 
-	if(m->p)
-		m->p->m = nil;
-	m->p = nil;
-	m->mcache = nil;
+	if(g->m->p)
+		g->m->p->m = nil;
+	g->m->p = nil;
+	g->m->mcache = nil;
 	p = runtime·allp[0];
 	p->m = nil;
 	p->status = Pidle;
@@ -2444,15 +2444,15 @@
 static void
 acquirep(P *p)
 {
-	if(m->p || m->mcache)
+	if(g->m->p || g->m->mcache)
 		runtime·throw("acquirep: already in go");
 	if(p->m || p->status != Pidle) {
 		runtime·printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
 		runtime·throw("acquirep: invalid p state");
 	}
-	m->mcache = p->mcache;
-	m->p = p;
-	p->m = m;
+	g->m->mcache = p->mcache;
+	g->m->p = p;
+	p->m = g->m;
 	p->status = Prunning;
 }
 
@@ -2462,16 +2462,16 @@
 {
 	P *p;
 
-	if(m->p == nil || m->mcache == nil)
+	if(g->m->p == nil || g->m->mcache == nil)
 		runtime·throw("releasep: invalid arg");
-	p = m->p;
-	if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
+	p = g->m->p;
+	if(p->m != g->m || p->mcache != g->m->mcache || p->status != Prunning) {
 		runtime·printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
-			m, m->p, p->m, m->mcache, p->mcache, p->status);
+			g->m, g->m->p, p->m, g->m->mcache, p->mcache, p->status);
 		runtime·throw("releasep: invalid p state");
 	}
-	m->p = nil;
-	m->mcache = nil;
+	g->m->p = nil;
+	g->m->mcache = nil;
 	p->m = nil;
 	p->status = Pidle;
 	return p;
@@ -2529,7 +2529,7 @@
 	runtime·unlock(&allglock);
 	if(grunning == 0)  // possible if main goroutine calls runtime·Goexit()
 		runtime·throw("no goroutines (main called runtime.Goexit) - deadlock!");
-	m->throwing = -1;  // do not dump full stacks
+	g->m->throwing = -1;  // do not dump full stacks
 	runtime·throw("all goroutines are asleep - deadlock!");
 }
 
@@ -2700,7 +2700,7 @@
 	G *gp;
 
 	mp = p->m;
-	if(mp == nil || mp == m)
+	if(mp == nil || mp == g->m)
 		return false;
 	gp = mp->curg;
 	if(gp == nil || gp == mp->g0)
@@ -2783,7 +2783,7 @@
 			" locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
 			mp->id, id1, id2,
 			mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
-			mp->spinning, m->blocked, id3);
+			mp->spinning, mp->blocked, id3);
 	}
 	runtime·lock(&allglock);
 	for(gi = 0; gi < runtime·allglen; gi++) {
diff --git a/src/pkg/runtime/race_amd64.s b/src/pkg/runtime/race_amd64.s
index d60cf89..210f5d4 100644
--- a/src/pkg/runtime/race_amd64.s
+++ b/src/pkg/runtime/race_amd64.s
@@ -192,8 +192,8 @@
 // Switches SP to g0 stack and calls (AX). Arguments already set.
 TEXT	racecall<>(SB), NOSPLIT, $0-0
 	get_tls(R12)
-	MOVQ	m(R12), R13
 	MOVQ	g(R12), R14
+	MOVQ	g_m(R14), R13
 	// Switch to g0 stack.
 	MOVQ	SP, R12		// callee-saved, preserved across the CALL
 	MOVQ	m_g0(R13), R10
@@ -222,14 +222,16 @@
 	PUSHQ	R15
 	// Set g = g0.
 	get_tls(R12)
-	MOVQ	m(R12), R13
+	MOVQ	g(R12), R13
+	MOVQ	g_m(R13), R13
 	MOVQ	m_g0(R13), R14
 	MOVQ	R14, g(R12)	// g = m->g0
 	MOVQ	RARG0, 0(SP)	// func arg
 	CALL	runtime·racesymbolize(SB)
 	// All registers are smashed after Go code, reload.
 	get_tls(R12)
-	MOVQ	m(R12), R13
+	MOVQ	g(R12), R13
+	MOVQ	g_m(R13), R13
 	MOVQ	m_curg(R13), R14
 	MOVQ	R14, g(R12)	// g = m->curg
 	// Restore callee-saved registers.
diff --git a/src/pkg/runtime/runtime.c b/src/pkg/runtime/runtime.c
index 3a4f719..26dbbbd3 100644
--- a/src/pkg/runtime/runtime.c
+++ b/src/pkg/runtime/runtime.c
@@ -32,8 +32,8 @@
 
 	if(crash != nil)
 		*crash = false;
-	if(m->traceback != 0)
-		return m->traceback;
+	if(g->m->traceback != 0)
+		return g->m->traceback;
 	x = runtime·atomicload(&traceback_cache);
 	if(x == ~(uint32)0) {
 		p = runtime·getenv("GOTRACEBACK");
@@ -286,11 +286,11 @@
 {
 	uint32 x;
 
-	x = m->fastrand;
+	x = g->m->fastrand;
 	x += x;
 	if(x & 0x80000000L)
 		x ^= 0x88888eefUL;
-	m->fastrand = x;
+	g->m->fastrand = x;
 	return x;
 }
 
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 5115503..0f630ab 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -102,14 +102,13 @@
  * local storage indexed by a pseudo-register TLS. See zasmhdr in
  * src/cmd/dist/buildruntime.c for details, and be aware that the linker may
  * make further OS-specific changes to the compiler's output. For example,
- * 6l/linux rewrites 0(TLS) as -16(FS).
+ * 6l/linux rewrites 0(TLS) as -8(FS).
  *
  * Every C file linked into a Go program must include runtime.h so that the
  * C compiler (6c, 8c, etc.) knows to avoid other uses of these dedicated
  * registers. The Go compiler (6g, 8g, etc.) knows to avoid them.
  */
 extern	register	G*	g;
-extern	register	M*	m;
 
 /*
  * defined constants
@@ -907,7 +906,7 @@
 void*	runtime·atomicloadp(void* volatile*);
 void	runtime·atomicstorep(void* volatile*, void*);
 
-void	runtime·setmg(M*, G*);
+void	runtime·setg(G*);
 void	runtime·newextram(void);
 void	runtime·exit(int32);
 void	runtime·breakpoint(void);
diff --git a/src/pkg/runtime/runtime1.goc b/src/pkg/runtime/runtime1.goc
index c6f6b62..a95a4f9 100644
--- a/src/pkg/runtime/runtime1.goc
+++ b/src/pkg/runtime/runtime1.goc
@@ -117,12 +117,12 @@
 func sync·runtime_procPin() (p int) {
 	M *mp;
 
-	mp = m;
+	mp = g->m;
 	// Disable preemption.
 	mp->locks++;
 	p = mp->p->id;
 }
 
 func sync·runtime_procUnpin() {
-	m->locks--;
+	g->m->locks--;
 }
diff --git a/src/pkg/runtime/signal_386.c b/src/pkg/runtime/signal_386.c
index 70fcc6a..d55e304 100644
--- a/src/pkg/runtime/signal_386.c
+++ b/src/pkg/runtime/signal_386.c
@@ -39,7 +39,7 @@
 	bool crash;
 
 	if(sig == SIGPROF) {
-		runtime·sigprof((byte*)SIG_EIP(info, ctxt), (byte*)SIG_ESP(info, ctxt), nil, gp, m);
+		runtime·sigprof((byte*)SIG_EIP(info, ctxt), (byte*)SIG_ESP(info, ctxt), nil, gp, g->m);
 		return;
 	}
 
@@ -91,8 +91,8 @@
 	if(!(t->flags & SigThrow))
 		return;
 
-	m->throwing = 1;
-	m->caughtsig = gp;
+	g->m->throwing = 1;
+	g->m->caughtsig = gp;
 	runtime·startpanic();
 
 	if(sig < 0 || sig >= NSIG)
@@ -101,10 +101,10 @@
 		runtime·printf("%s\n", runtime·sigtab[sig].name);
 
 	runtime·printf("PC=%x\n", SIG_EIP(info, ctxt));
-	if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) {
+	if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) {
 		runtime·printf("signal arrived during cgo execution\n");
-		gp = m->lockedg;
-	}	
+		gp = g->m->lockedg;
+	}
 	runtime·printf("\n");
 
 	if(runtime·gotraceback(&crash)){
diff --git a/src/pkg/runtime/signal_amd64x.c b/src/pkg/runtime/signal_amd64x.c
index 04026f3..44e68ce 100644
--- a/src/pkg/runtime/signal_amd64x.c
+++ b/src/pkg/runtime/signal_amd64x.c
@@ -48,7 +48,7 @@
 	bool crash;
 
 	if(sig == SIGPROF) {
-		runtime·sigprof((byte*)SIG_RIP(info, ctxt), (byte*)SIG_RSP(info, ctxt), nil, gp, m);
+		runtime·sigprof((byte*)SIG_RIP(info, ctxt), (byte*)SIG_RSP(info, ctxt), nil, gp, g->m);
 		return;
 	}
 
@@ -125,8 +125,8 @@
 	if(!(t->flags & SigThrow))
 		return;
 
-	m->throwing = 1;
-	m->caughtsig = gp;
+	g->m->throwing = 1;
+	g->m->caughtsig = gp;
 	runtime·startpanic();
 
 	if(sig < 0 || sig >= NSIG)
@@ -135,9 +135,9 @@
 		runtime·printf("%s\n", runtime·sigtab[sig].name);
 
 	runtime·printf("PC=%X\n", SIG_RIP(info, ctxt));
-	if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) {
+	if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) {
 		runtime·printf("signal arrived during cgo execution\n");
-		gp = m->lockedg;
+		gp = g->m->lockedg;
 	}
 	runtime·printf("\n");
 
diff --git a/src/pkg/runtime/signal_arm.c b/src/pkg/runtime/signal_arm.c
index 9b2a43d..1e86368 100644
--- a/src/pkg/runtime/signal_arm.c
+++ b/src/pkg/runtime/signal_arm.c
@@ -46,7 +46,7 @@
 	bool crash;
 
 	if(sig == SIGPROF) {
-		runtime·sigprof((uint8*)SIG_PC(info, ctxt), (uint8*)SIG_SP(info, ctxt), (uint8*)SIG_LR(info, ctxt), gp, m);
+		runtime·sigprof((uint8*)SIG_PC(info, ctxt), (uint8*)SIG_SP(info, ctxt), (uint8*)SIG_LR(info, ctxt), gp, g->m);
 		return;
 	}
 
@@ -76,7 +76,7 @@
 			SIG_LR(info, ctxt) = gp->sigpc;
 		// In case we are panicking from external C code
 		SIG_R10(info, ctxt) = (uintptr)gp;
-		SIG_R9(info, ctxt) = (uintptr)m;
+		SIG_R9(info, ctxt) = (uintptr)g->m;
 		SIG_PC(info, ctxt) = (uintptr)runtime·sigpanic;
 		return;
 	}
@@ -89,8 +89,8 @@
 	if(!(t->flags & SigThrow))
 		return;
 
-	m->throwing = 1;
-	m->caughtsig = gp;
+	g->m->throwing = 1;
+	g->m->caughtsig = gp;
 	if(runtime·panicking)	// traceback already printed
 		runtime·exit(2);
 	runtime·panicking = 1;
@@ -101,9 +101,9 @@
 		runtime·printf("%s\n", runtime·sigtab[sig].name);
 
 	runtime·printf("PC=%x\n", SIG_PC(info, ctxt));
-	if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) {
+	if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) {
 		runtime·printf("signal arrived during cgo execution\n");
-		gp = m->lockedg;
+		gp = g->m->lockedg;
 	}
 	runtime·printf("\n");
 
diff --git a/src/pkg/runtime/signal_unix.c b/src/pkg/runtime/signal_unix.c
index 246a1eb..4d582d3 100644
--- a/src/pkg/runtime/signal_unix.c
+++ b/src/pkg/runtime/signal_unix.c
@@ -89,7 +89,7 @@
 		it.it_value = it.it_interval;
 		runtime·setitimer(ITIMER_PROF, &it, nil);
 	}
-	m->profilehz = hz;
+	g->m->profilehz = hz;
 }
 
 void
diff --git a/src/pkg/runtime/softfloat_arm.c b/src/pkg/runtime/softfloat_arm.c
index 29a52bd..41ce8bd 100644
--- a/src/pkg/runtime/softfloat_arm.c
+++ b/src/pkg/runtime/softfloat_arm.c
@@ -32,20 +32,20 @@
 static void
 putf(uint32 reg, uint32 val)
 {
-	m->freglo[reg] = val;
+	g->m->freglo[reg] = val;
 }
 
 static void
 putd(uint32 reg, uint64 val)
 {
-	m->freglo[reg] = (uint32)val;
-	m->freghi[reg] = (uint32)(val>>32);
+	g->m->freglo[reg] = (uint32)val;
+	g->m->freghi[reg] = (uint32)(val>>32);
 }
 
 static uint64
 getd(uint32 reg)
 {
-	return (uint64)m->freglo[reg] | ((uint64)m->freghi[reg]<<32);
+	return (uint64)g->m->freglo[reg] | ((uint64)g->m->freghi[reg]<<32);
 }
 
 static void
@@ -53,7 +53,7 @@
 {
 	uint32 i;
 	for (i = 0; i < 16; i++) {
-		runtime·printf("\tf%d:\t%X %X\n", i, m->freghi[i], m->freglo[i]);
+		runtime·printf("\tf%d:\t%X %X\n", i, g->m->freghi[i], g->m->freglo[i]);
 	}
 }
 
@@ -111,7 +111,11 @@
 	int64 sval;
 	bool nan, ok;
 	int32 cmp;
+	M *m;
 
+	// m is locked in vlop_arm.s, so g->m cannot change during this function call,
+	// so caching it in a local variable is safe.
+	m = g->m;
 	i = *pc;
 
 	if(trace)
diff --git a/src/pkg/runtime/stack.c b/src/pkg/runtime/stack.c
index 1680f00..a070421 100644
--- a/src/pkg/runtime/stack.c
+++ b/src/pkg/runtime/stack.c
@@ -53,15 +53,15 @@
 		for(i = 0; i < StackCacheBatch-1; i++)
 			n->batch[i] = (byte*)n + (i+1)*FixedStack;
 	}
-	pos = m->stackcachepos;
+	pos = g->m->stackcachepos;
 	for(i = 0; i < StackCacheBatch-1; i++) {
-		m->stackcache[pos] = n->batch[i];
+		g->m->stackcache[pos] = n->batch[i];
 		pos = (pos + 1) % StackCacheSize;
 	}
-	m->stackcache[pos] = n;
+	g->m->stackcache[pos] = n;
 	pos = (pos + 1) % StackCacheSize;
-	m->stackcachepos = pos;
-	m->stackcachecnt += StackCacheBatch;
+	g->m->stackcachepos = pos;
+	g->m->stackcachecnt += StackCacheBatch;
 }
 
 static void
@@ -70,14 +70,14 @@
 	StackCacheNode *n;
 	uint32 i, pos;
 
-	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
-	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (g->m->stackcachepos - g->m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)g->m->stackcache[pos];
 	pos = (pos + 1) % StackCacheSize;
 	for(i = 0; i < StackCacheBatch-1; i++) {
-		n->batch[i] = m->stackcache[pos];
+		n->batch[i] = g->m->stackcache[pos];
 		pos = (pos + 1) % StackCacheSize;
 	}
-	m->stackcachecnt -= StackCacheBatch;
+	g->m->stackcachecnt -= StackCacheBatch;
 	runtime·lock(&stackcachemu);
 	n->next = stackcache;
 	stackcache = n;
@@ -95,7 +95,7 @@
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
 	// Doing so would cause a deadlock (issue 1547).
-	if(g != m->g0)
+	if(g != g->m->g0)
 		runtime·throw("stackalloc not on scheduler stack");
 	if((n & (n-1)) != 0)
 		runtime·throw("stack size not a power of 2");
@@ -115,19 +115,19 @@
 	// (assuming that inside malloc all the stack frames are small,
 	// so that we do not deadlock).
 	malloced = true;
-	if(n == FixedStack || m->mallocing) {
+	if(n == FixedStack || g->m->mallocing) {
 		if(n != FixedStack) {
 			runtime·printf("stackalloc: in malloc, size=%d want %d\n", FixedStack, n);
 			runtime·throw("stackalloc");
 		}
-		if(m->stackcachecnt == 0)
+		if(g->m->stackcachecnt == 0)
 			stackcacherefill();
-		pos = m->stackcachepos;
+		pos = g->m->stackcachepos;
 		pos = (pos - 1) % StackCacheSize;
-		v = m->stackcache[pos];
-		m->stackcachepos = pos;
-		m->stackcachecnt--;
-		m->stackinuse++;
+		v = g->m->stackcache[pos];
+		g->m->stackcachepos = pos;
+		g->m->stackcachecnt--;
+		g->m->stackinuse++;
 		malloced = false;
 	} else
 		v = runtime·mallocgc(n, 0, FlagNoProfiling|FlagNoGC|FlagNoZero|FlagNoInvokeGC);
@@ -161,13 +161,13 @@
 	}
 	if(n != FixedStack)
 		runtime·throw("stackfree: bad fixed size");
-	if(m->stackcachecnt == StackCacheSize)
+	if(g->m->stackcachecnt == StackCacheSize)
 		stackcacherelease();
-	pos = m->stackcachepos;
-	m->stackcache[pos] = v;
-	m->stackcachepos = (pos + 1) % StackCacheSize;
-	m->stackcachecnt++;
-	m->stackinuse--;
+	pos = g->m->stackcachepos;
+	g->m->stackcache[pos] = v;
+	g->m->stackcachepos = (pos + 1) % StackCacheSize;
+	g->m->stackcachecnt++;
+	g->m->stackinuse--;
 }
 
 // Called from runtime·lessstack when returning from a function which
@@ -184,7 +184,7 @@
 	int64 goid;
 	int32 oldstatus;
 
-	gp = m->curg;
+	gp = g->m->curg;
 	top = (Stktop*)gp->stackbase;
 	old = (byte*)gp->stackguard - StackGuard;
 	sp = (byte*)top;
@@ -192,7 +192,7 @@
 
 	if(StackDebug >= 1) {
 		runtime·printf("runtime: oldstack gobuf={pc:%p sp:%p lr:%p} cret=%p argsize=%p\n",
-			top->gobuf.pc, top->gobuf.sp, top->gobuf.lr, (uintptr)m->cret, (uintptr)argsize);
+			top->gobuf.pc, top->gobuf.sp, top->gobuf.lr, (uintptr)g->m->cret, (uintptr)argsize);
 	}
 
 	// gp->status is usually Grunning, but it could be Gsyscall if a stack overflow
@@ -200,8 +200,8 @@
 	oldstatus = gp->status;
 	
 	gp->sched = top->gobuf;
-	gp->sched.ret = m->cret;
-	m->cret = 0; // drop reference
+	gp->sched.ret = g->m->cret;
+	g->m->cret = 0; // drop reference
 	gp->status = Gwaiting;
 	gp->waitreason = "stack unsplit";
 
@@ -416,7 +416,7 @@
 			if(f != nil && (byte*)0 < p && (p < (byte*)PageSize || (uintptr)p == PoisonGC || (uintptr)p == PoisonStack)) {
 				// Looks like a junk value in a pointer slot.
 				// Live analysis wrong?
-				m->traceback = 2;
+				g->m->traceback = 2;
 				runtime·printf("runtime: bad pointer in frame %s at %p: %p\n", runtime·funcname(f), &scanp[i], p);
 				runtime·throw("bad pointer!");
 			}
@@ -675,28 +675,28 @@
 	void *moreargp;
 	bool newstackcall;
 
-	if(m->forkstackguard)
+	if(g->m->forkstackguard)
 		runtime·throw("split stack after fork");
-	if(m->morebuf.g != m->curg) {
+	if(g->m->morebuf.g != g->m->curg) {
 		runtime·printf("runtime: newstack called from g=%p\n"
 			"\tm=%p m->curg=%p m->g0=%p m->gsignal=%p\n",
-			m->morebuf.g, m, m->curg, m->g0, m->gsignal);
+			g->m->morebuf.g, g->m, g->m->curg, g->m->g0, g->m->gsignal);
 		runtime·throw("runtime: wrong goroutine in newstack");
 	}
 
 	// gp->status is usually Grunning, but it could be Gsyscall if a stack overflow
 	// happens during a function call inside entersyscall.
-	gp = m->curg;
+	gp = g->m->curg;
 	oldstatus = gp->status;
 
-	framesize = m->moreframesize;
-	argsize = m->moreargsize;
-	moreargp = m->moreargp;
-	m->moreargp = nil;
-	morebuf = m->morebuf;
-	m->morebuf.pc = (uintptr)nil;
-	m->morebuf.lr = (uintptr)nil;
-	m->morebuf.sp = (uintptr)nil;
+	framesize = g->m->moreframesize;
+	argsize = g->m->moreargsize;
+	moreargp = g->m->moreargp;
+	g->m->moreargp = nil;
+	morebuf = g->m->morebuf;
+	g->m->morebuf.pc = (uintptr)nil;
+	g->m->morebuf.lr = (uintptr)nil;
+	g->m->morebuf.sp = (uintptr)nil;
 	gp->status = Gwaiting;
 	gp->waitreason = "stack growth";
 	newstackcall = framesize==1;
@@ -717,7 +717,7 @@
 			"\tmorebuf={pc:%p sp:%p lr:%p}\n"
 			"\tsched={pc:%p sp:%p lr:%p ctxt:%p}\n",
 			(uintptr)framesize, (uintptr)argsize, sp, gp->stackguard - StackGuard, gp->stackbase,
-			m->morebuf.pc, m->morebuf.sp, m->morebuf.lr,
+			morebuf.pc, morebuf.sp, morebuf.lr,
 			gp->sched.pc, gp->sched.sp, gp->sched.lr, gp->sched.ctxt);
 	}
 	if(sp < gp->stackguard - StackGuard) {
@@ -731,15 +731,15 @@
 	}
 
 	if(gp->stackguard0 == (uintptr)StackPreempt) {
-		if(gp == m->g0)
+		if(gp == g->m->g0)
 			runtime·throw("runtime: preempt g0");
-		if(oldstatus == Grunning && m->p == nil && m->locks == 0)
+		if(oldstatus == Grunning && g->m->p == nil && g->m->locks == 0)
 			runtime·throw("runtime: g is running but p is not");
-		if(oldstatus == Gsyscall && m->locks == 0)
+		if(oldstatus == Gsyscall && g->m->locks == 0)
 			runtime·throw("runtime: stack growth during syscall");
 		// Be conservative about where we preempt.
 		// We are interested in preempting user Go code, not runtime code.
-		if(oldstatus != Grunning || m->locks || m->mallocing || m->gcing || m->p->status != Prunning) {
+		if(oldstatus != Grunning || g->m->locks || g->m->mallocing || g->m->gcing || g->m->p->status != Prunning) {
 			// Let the goroutine keep running for now.
 			// gp->preempt is set, so it will be preempted next time.
 			gp->stackguard0 = gp->stackguard;
@@ -839,9 +839,9 @@
 	runtime·memclr((byte*)&label, sizeof label);
 	label.sp = sp;
 	label.pc = (uintptr)runtime·lessstack;
-	label.g = m->curg;
+	label.g = g->m->curg;
 	if(newstackcall)
-		runtime·gostartcallfn(&label, (FuncVal*)m->cret);
+		runtime·gostartcallfn(&label, (FuncVal*)g->m->cret);
 	else {
 		runtime·gostartcall(&label, (void(*)(void))gp->sched.pc, gp->sched.ctxt);
 		gp->sched.ctxt = nil;
diff --git a/src/pkg/runtime/symtab.goc b/src/pkg/runtime/symtab.goc
index 15e1d28..961b78d 100644
--- a/src/pkg/runtime/symtab.goc
+++ b/src/pkg/runtime/symtab.goc
@@ -316,7 +316,7 @@
 	static int32 traceback = -1;
 	String name;
 
-	if(m->throwing > 0 && gp != nil && (gp == m->curg || gp == m->caughtsig))
+	if(g->m->throwing > 0 && gp != nil && (gp == g->m->curg || gp == g->m->caughtsig))
 		return 1;
 	if(traceback < 0)
 		traceback = runtime·gotraceback(nil);
diff --git a/src/pkg/runtime/sys_darwin_386.s b/src/pkg/runtime/sys_darwin_386.s
index bfaaa00..a702d9b 100644
--- a/src/pkg/runtime/sys_darwin_386.s
+++ b/src/pkg/runtime/sys_darwin_386.s
@@ -236,9 +236,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$40
 	get_tls(CX)
 	
-	// check that m exists
-	MOVL	m(CX), BP
-	CMPL	BP, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	sig+8(FP), BX
 	MOVL	BX, 0(SP)
@@ -247,10 +247,10 @@
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 
 	// g = m->gsignal
+	MOVL	g_m(DI), BP
 	MOVL	m_gsignal(BP), BP
 	MOVL	BP, g(CX)
 
@@ -362,7 +362,7 @@
 	// Now segment is established.  Initialize m, g.
 	get_tls(BP)
 	MOVL	AX, g(BP)
-	MOVL	DX, m(BP)
+	MOVL	DX, g_m(AX)
 	MOVL	BX, m_procid(DX)	// m->procid = thread port (for debuggers)
 	CALL	runtime·stackcheck(SB)		// smashes AX
 	CALL	CX	// fn()
diff --git a/src/pkg/runtime/sys_darwin_amd64.s b/src/pkg/runtime/sys_darwin_amd64.s
index a0c81b5..23995db7 100644
--- a/src/pkg/runtime/sys_darwin_amd64.s
+++ b/src/pkg/runtime/sys_darwin_amd64.s
@@ -196,9 +196,9 @@
 	MOVQ	R8, 32(SP)	// save ucontext
 	MOVQ	SI, 40(SP)	// save infostyle
 
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVL	DX, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -206,10 +206,10 @@
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 48(SP)
 
 	// g = m->gsignal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 
@@ -325,10 +325,10 @@
 	POPQ	DX
 
 	get_tls(BX)
-	MOVQ	CX, m(BX)
 	MOVQ	SI, m_procid(CX)	// thread port is m->procid
 	MOVQ	m_g0(CX), AX
 	MOVQ	AX, g(BX)
+	MOVQ	CX, g_m(AX)
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
 	CALL	DX	// fn
 	CALL	runtime·exit1(SB)
diff --git a/src/pkg/runtime/sys_dragonfly_386.s b/src/pkg/runtime/sys_dragonfly_386.s
index 20e6999..0b8d219 100644
--- a/src/pkg/runtime/sys_dragonfly_386.s
+++ b/src/pkg/runtime/sys_dragonfly_386.s
@@ -42,7 +42,7 @@
 	
 	// Now segment is established.  Initialize m, g.
 	get_tls(CX)
-	MOVL	BX, m(CX)
+	MOVL	BX, g_m(DX)
 	MOVL	DX, g(CX)
 
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
@@ -201,9 +201,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$44
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
@@ -212,10 +212,10 @@
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 	
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
diff --git a/src/pkg/runtime/sys_dragonfly_amd64.s b/src/pkg/runtime/sys_dragonfly_amd64.s
index d70d2e8..25d2be3 100644
--- a/src/pkg/runtime/sys_dragonfly_amd64.s
+++ b/src/pkg/runtime/sys_dragonfly_amd64.s
@@ -43,8 +43,8 @@
 
 	// set up m, g
 	get_tls(CX)
-	MOVQ	R13, m(CX)
 	MOVQ	m_g0(R13), DI
+	MOVQ	R13, g_m(DI)
 	MOVQ	DI, g(CX)
 
 	CALL	runtime·stackcheck(SB)
@@ -163,9 +163,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 	get_tls(BX)
 
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -173,10 +173,10 @@
 	RET
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 40(SP)
 	
 	// g = m->signal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 	
diff --git a/src/pkg/runtime/sys_freebsd_386.s b/src/pkg/runtime/sys_freebsd_386.s
index 4c97eec..d2ce25f 100644
--- a/src/pkg/runtime/sys_freebsd_386.s
+++ b/src/pkg/runtime/sys_freebsd_386.s
@@ -37,7 +37,7 @@
 	get_tls(CX)
 	MOVL	BX, g(CX)
 	
-	MOVL	AX, m(CX)
+	MOVL	AX, g_m(BX)
 	CALL	runtime·stackcheck(SB)		// smashes AX
 	CALL	runtime·mstart(SB)
 
@@ -183,9 +183,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$44
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
@@ -194,10 +194,10 @@
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 	
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
diff --git a/src/pkg/runtime/sys_freebsd_amd64.s b/src/pkg/runtime/sys_freebsd_amd64.s
index 4c5b325..2c6e335 100644
--- a/src/pkg/runtime/sys_freebsd_amd64.s
+++ b/src/pkg/runtime/sys_freebsd_amd64.s
@@ -60,8 +60,8 @@
 
 	// set up m, g
 	get_tls(CX)
-	MOVQ	R13, m(CX)
 	MOVQ	m_g0(R13), DI
+	MOVQ	R13, g_m(DI)
 	MOVQ	DI, g(CX)
 
 	CALL	runtime·stackcheck(SB)
@@ -184,9 +184,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 	get_tls(BX)
 
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -194,10 +194,10 @@
 	RET
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 40(SP)
 	
 	// g = m->signal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 	
diff --git a/src/pkg/runtime/sys_freebsd_arm.s b/src/pkg/runtime/sys_freebsd_arm.s
index 3ec95a6..dbb2583 100644
--- a/src/pkg/runtime/sys_freebsd_arm.s
+++ b/src/pkg/runtime/sys_freebsd_arm.s
@@ -58,10 +58,9 @@
 	RET
 
 TEXT runtime·thr_start(SB),NOSPLIT,$0
-	MOVW R0, m
-
 	// set up g
-	MOVW m_g0(m), g
+	MOVW m_g0(R0), g
+	MOVW R0, g_m(g)
 	BL runtime·emptyfunc(SB) // fault if stack check is wrong
 	BL runtime·mstart(SB)
 
@@ -196,14 +195,14 @@
 
 TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	// this might be called in external code context,
-	// where g and m are not set.
-	// first save R0, because runtime·load_gm will clobber it
+	// where g is not set.
+	// first save R0, because runtime·load_g will clobber it
 	MOVW	R0, 4(R13) // signum
 	MOVB	runtime·iscgo(SB), R0
 	CMP 	$0, R0
-	BL.NE	runtime·load_gm(SB)
+	BL.NE	runtime·load_g(SB)
 
-	CMP $0, m
+	CMP $0, g
 	BNE 4(PC)
 	// signal number is already prepared in 4(R13)
 	MOVW $runtime·badsignal(SB), R11
@@ -215,7 +214,8 @@
 	MOVW g, 20(R13)
 
 	// g = m->signal
-	MOVW m_gsignal(m), g
+	MOVW g_m(g), R8
+	MOVW m_gsignal(R8), g
 
 	// R0 is already saved
 	MOVW R1, 8(R13) // info
diff --git a/src/pkg/runtime/sys_linux_386.s b/src/pkg/runtime/sys_linux_386.s
index b7896f1..3a8371c 100644
--- a/src/pkg/runtime/sys_linux_386.s
+++ b/src/pkg/runtime/sys_linux_386.s
@@ -166,9 +166,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$44
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	sig+0(FP), BX
 	MOVL	BX, 0(SP)
@@ -177,11 +177,10 @@
 	RET
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 
 	// g = m->gsignal
-	MOVL	m(CX), BX
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
@@ -324,7 +323,7 @@
 	// Now segment is established.  Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
-	MOVL	BX, m(AX)
+	MOVL	BX, g_m(DX)
 
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
 	MOVL	0(DX), DX	// paranoia; check they are not nil
diff --git a/src/pkg/runtime/sys_linux_amd64.s b/src/pkg/runtime/sys_linux_amd64.s
index b340c4f..03b9d6a 100644
--- a/src/pkg/runtime/sys_linux_amd64.s
+++ b/src/pkg/runtime/sys_linux_amd64.s
@@ -184,9 +184,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 	get_tls(BX)
 
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -194,10 +194,10 @@
 	RET
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 40(SP)
 
 	// g = m->gsignal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 
@@ -301,7 +301,7 @@
 
 	// In child, set up new stack
 	get_tls(CX)
-	MOVQ	R8, m(CX)
+	MOVQ	R8, g_m(R9)
 	MOVQ	R9, g(CX)
 	CALL	runtime·stackcheck(SB)
 
diff --git a/src/pkg/runtime/sys_linux_arm.s b/src/pkg/runtime/sys_linux_arm.s
index c537a87..8bfc72b 100644
--- a/src/pkg/runtime/sys_linux_arm.s
+++ b/src/pkg/runtime/sys_linux_arm.s
@@ -244,11 +244,12 @@
 	BEQ	2(PC)
 	BL	runtime·abort(SB)
 
-	MOVW	0(R13), m
 	MOVW	4(R13), g
+	MOVW	0(R13), R8
+	MOVW	R8, g_m(g)
 
 	// paranoia; check they are not nil
-	MOVW	0(m), R0
+	MOVW	0(R8), R0
 	MOVW	0(g), R0
 
 	BL	runtime·emptyfunc(SB)	// fault if stack check is wrong
@@ -256,7 +257,8 @@
 	// Initialize m->procid to Linux tid
 	MOVW	$SYS_gettid, R7
 	SWI	$0
-	MOVW	R0, m_procid(m)
+	MOVW	g_m(g), R8
+	MOVW	R0, m_procid(R8)
 
 	// Call fn
 	MOVW	8(R13), R0
@@ -285,14 +287,14 @@
 
 TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	// this might be called in external code context,
-	// where g and m are not set.
-	// first save R0, because runtime·load_gm will clobber it
+	// where g is not set.
+	// first save R0, because runtime·load_g will clobber it
 	MOVW	R0, 4(R13)
 	MOVB	runtime·iscgo(SB), R0
 	CMP 	$0, R0
-	BL.NE	runtime·load_gm(SB)
+	BL.NE	runtime·load_g(SB)
 
-	CMP 	$0, m
+	CMP 	$0, g
 	BNE 	4(PC)
 	// signal number is already prepared in 4(R13)
 	MOVW  	$runtime·badsignal(SB), R11
@@ -304,7 +306,8 @@
 	MOVW	g, 20(R13)
 
 	// g = m->gsignal
-	MOVW	m_gsignal(m), g
+	MOVW	g_m(g), R8
+	MOVW	m_gsignal(R8), g
 
 	// copy arguments for call to sighandler
 	// R0 is already saved above
diff --git a/src/pkg/runtime/sys_nacl_386.s b/src/pkg/runtime/sys_nacl_386.s
index 42ba0e0..50dca31 100644
--- a/src/pkg/runtime/sys_nacl_386.s
+++ b/src/pkg/runtime/sys_nacl_386.s
@@ -165,21 +165,21 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$0
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	$11, BX
-	MOVL	BX, 0(SP)
+	MOVL	$0, 0(SP)
 	MOVL	$runtime·badsignal(SB), AX
 	CALL	AX
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 	
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 	
diff --git a/src/pkg/runtime/sys_nacl_amd64p32.s b/src/pkg/runtime/sys_nacl_amd64p32.s
index 43c1723..d4e32ff 100644
--- a/src/pkg/runtime/sys_nacl_amd64p32.s
+++ b/src/pkg/runtime/sys_nacl_amd64p32.s
@@ -261,18 +261,18 @@
 	MOVL (16*4+5*8)(AX), AX
 	MOVL	AX, TLS
 
-	// check that m exists
+	// check that g exists
 	get_tls(CX)
-	MOVL	m(CX), BX
+	MOVL	g(CX), DI
 	
-	CMPL	BX, $0
-	JEQ	nom
+	CMPL	DI, $0
+	JEQ	nog
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 	
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
@@ -359,7 +359,7 @@
 	MOVL	0, AX
 	RET
 
-nom:
+nog:
 	MOVL	0, AX
 	RET
 
diff --git a/src/pkg/runtime/sys_netbsd_386.s b/src/pkg/runtime/sys_netbsd_386.s
index 05de55e..4a78cb9 100644
--- a/src/pkg/runtime/sys_netbsd_386.s
+++ b/src/pkg/runtime/sys_netbsd_386.s
@@ -194,9 +194,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$44
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
@@ -205,10 +205,10 @@
 	RET
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
@@ -257,7 +257,7 @@
 	// Now segment is established.  Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
-	MOVL	BX, m(AX)
+	MOVL	BX, g_m(DX)
 
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
 	MOVL	0(DX), DX		// paranoia; check they are not nil
diff --git a/src/pkg/runtime/sys_netbsd_amd64.s b/src/pkg/runtime/sys_netbsd_amd64.s
index fcbced5..13b1cdc 100644
--- a/src/pkg/runtime/sys_netbsd_amd64.s
+++ b/src/pkg/runtime/sys_netbsd_amd64.s
@@ -28,7 +28,7 @@
 
 	// Set up new stack.
 	get_tls(CX)
-	MOVQ	R8, m(CX)
+	MOVQ	R8, g_m(R9)
 	MOVQ	R9, g(CX)
 	CALL	runtime·stackcheck(SB)
 
@@ -213,9 +213,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 	get_tls(BX)
 
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -223,10 +223,10 @@
 	RET
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 40(SP)
 
 	// g = m->signal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 
diff --git a/src/pkg/runtime/sys_netbsd_arm.s b/src/pkg/runtime/sys_netbsd_arm.s
index b2eb74e..acf01cf 100644
--- a/src/pkg/runtime/sys_netbsd_arm.s
+++ b/src/pkg/runtime/sys_netbsd_arm.s
@@ -80,7 +80,7 @@
 	RET
 
 TEXT runtime·lwp_tramp(SB),NOSPLIT,$0
-	MOVW R0, m
+	MOVW R0, g_m(R1)
 	MOVW R1, g
 
 	BL runtime·emptyfunc(SB) // fault if stack check is wrong
@@ -200,14 +200,14 @@
 
 TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	// this might be called in external code context,
-	// where g and m are not set.
-	// first save R0, because runtime·load_gm will clobber it
+	// where g is not set.
+	// first save R0, because runtime·load_g will clobber it
 	MOVW	R0, 4(R13) // signum
 	MOVB	runtime·iscgo(SB), R0
 	CMP 	$0, R0
-	BL.NE	runtime·load_gm(SB)
+	BL.NE	runtime·load_g(SB)
 
-	CMP $0, m
+	CMP $0, g
 	BNE 4(PC)
 	// signal number is already prepared in 4(R13)
 	MOVW $runtime·badsignal(SB), R11
@@ -219,7 +219,8 @@
 	MOVW g, 20(R13)
 
 	// g = m->signal
-	MOVW m_gsignal(m), g
+	MOVW g_m(g), R8
+	MOVW m_gsignal(R8), g
 
 	// R0 is already saved
 	MOVW R1, 8(R13) // info
diff --git a/src/pkg/runtime/sys_openbsd_386.s b/src/pkg/runtime/sys_openbsd_386.s
index 8f0da5c..a94c4e4 100644
--- a/src/pkg/runtime/sys_openbsd_386.s
+++ b/src/pkg/runtime/sys_openbsd_386.s
@@ -174,9 +174,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$44
 	get_tls(CX)
 
-	// check that m exists
-	MOVL	m(CX), BX
-	CMPL	BX, $0
+	// check that g exists
+	MOVL	g(CX), DI
+	CMPL	DI, $0
 	JNE	6(PC)
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
@@ -185,10 +185,10 @@
 	JMP 	sigtramp_ret
 
 	// save g
-	MOVL	g(CX), DI
 	MOVL	DI, 20(SP)
 	
 	// g = m->gsignal
+	MOVL	g_m(DI), BX
 	MOVL	m_gsignal(BX), BX
 	MOVL	BX, g(CX)
 
@@ -278,7 +278,7 @@
 	// Now segment is established.  Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
-	MOVL	BX, m(AX)
+	MOVL	BX, g_m(DX)
 
 	CALL	runtime·stackcheck(SB)	// smashes AX, CX
 	MOVL	0(DX), DX		// paranoia; check they are not nil
diff --git a/src/pkg/runtime/sys_openbsd_amd64.s b/src/pkg/runtime/sys_openbsd_amd64.s
index b2a6182..dac90ea 100644
--- a/src/pkg/runtime/sys_openbsd_amd64.s
+++ b/src/pkg/runtime/sys_openbsd_amd64.s
@@ -40,7 +40,7 @@
 
 	// In child, set up new stack.
 	get_tls(CX)
-	MOVQ	R8, m(CX)
+	MOVQ	R8, g_m(R9)
 	MOVQ	R9, g(CX)
 	CALL	runtime·stackcheck(SB)
 
@@ -204,9 +204,9 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 	get_tls(BX)
 	
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	5(PC)
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -214,10 +214,10 @@
 	RET
 
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 40(SP)
 	
 	// g = m->signal
+	MOVQ	g_m(R10), BP
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 	
diff --git a/src/pkg/runtime/sys_plan9_386.s b/src/pkg/runtime/sys_plan9_386.s
index 143cd2e..5a652ab 100644
--- a/src/pkg/runtime/sys_plan9_386.s
+++ b/src/pkg/runtime/sys_plan9_386.s
@@ -98,7 +98,7 @@
 	// Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
-	MOVL	BX, m(AX)
+	MOVL	BX, g_m(DX)
 
 	// Initialize procid from TOS struct.
 	// TODO: Be explicit and insert a new MOVL _tos(SB), AX here.
@@ -123,8 +123,8 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$0
 	get_tls(AX)
 
-	// check that m exists
-	MOVL	m(AX), BX
+	// check that g exists
+	MOVL	g(AX), BX
 	CMPL	BX, $0
 	JNE	3(PC)
 	CALL	runtime·badsignal2(SB) // will exit
@@ -135,6 +135,7 @@
 	MOVL	note+8(SP), DX
 
 	// change stack
+	MOVL	g_m(BX), BX
 	MOVL	m_gsignal(BX), BP
 	MOVL	g_stackbase(BP), BP
 	MOVL	BP, SP
@@ -181,7 +182,8 @@
 // See ../syscall/asm_plan9_386.s:/·Syscall/
 TEXT runtime·errstr(SB),NOSPLIT,$0
 	get_tls(AX)
-	MOVL	m(AX), BX
+	MOVL	g(AX), BX
+	MOVL	g_m(BX), BX
 	MOVL	m_errstr(BX), CX
 	MOVL	CX, 4(SP)
 	MOVL	$ERRMAX, 8(SP)
diff --git a/src/pkg/runtime/sys_plan9_amd64.s b/src/pkg/runtime/sys_plan9_amd64.s
index e60459c..7e8e593 100644
--- a/src/pkg/runtime/sys_plan9_amd64.s
+++ b/src/pkg/runtime/sys_plan9_amd64.s
@@ -133,7 +133,7 @@
 	// Initialize m, g.
 	get_tls(AX)
 	MOVQ	DX, g(AX)
-	MOVQ	BX, m(AX)
+	MOVQ	BX, g_m(DX)
 
 	// Initialize AX from pid in TLS.
 	MOVQ	0(FS), AX
@@ -156,8 +156,8 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$0
 	get_tls(AX)
 
-	// check that m exists
-	MOVQ	m(AX), BX
+	// check that g exists
+	MOVQ	g(AX), BX
 	CMPQ	BX, $0
 	JNE	3(PC)
 	CALL	runtime·badsignal2(SB) // will exit
@@ -168,6 +168,7 @@
 	MOVQ	note+16(SP), DX
 
 	// change stack
+	MOVQ	g_m(BX), BX
 	MOVQ	m_gsignal(BX), R10
 	MOVQ	g_stackbase(R10), BP
 	MOVQ	BP, SP
@@ -218,7 +219,8 @@
 // See ../syscall/asm_plan9_386.s:/·Syscall/
 TEXT runtime·errstr(SB),NOSPLIT,$0
 	get_tls(AX)
-	MOVQ	m(AX), BX
+	MOVQ	g(AX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	m_errstr(BX), CX
 	MOVQ	CX, 8(SP)
 	MOVQ	$ERRMAX, 16(SP)
diff --git a/src/pkg/runtime/sys_solaris_amd64.s b/src/pkg/runtime/sys_solaris_amd64.s
index 2151769..dd34e24 100644
--- a/src/pkg/runtime/sys_solaris_amd64.s
+++ b/src/pkg/runtime/sys_solaris_amd64.s
@@ -22,7 +22,8 @@
 	// asmcgocall will put first argument into DI.
 	CALL	DI	// SysV ABI so returns in AX
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	AX,	m_perrno(BX)
 	RET
 
@@ -73,7 +74,8 @@
 	MOVQ	libcall_n(DI), R10
 
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	m_perrno(BX), DX
 	CMPQ	DX, $0
 	JEQ	skiperrno1
@@ -100,7 +102,8 @@
 	MOVQ	DX, libcall_r2(DI)
 
 	get_tls(CX)
-	MOVQ	m(CX), BX
+	MOVQ	g(CX), BX
+	MOVQ	g_m(BX), BX
 	MOVQ	m_perrno(BX), AX
 	CMPQ	AX, $0
 	JEQ	skiperrno2
@@ -118,7 +121,7 @@
 	// Make TLS entries point at g and m.
 	get_tls(BX)
 	MOVQ	DX, g(BX)
-	MOVQ	DI, m(BX)
+	MOVQ	DI, g_m(DX)
 
 	// Layout new m scheduler stack on os stack.
 	MOVQ	SP, AX
@@ -154,9 +157,9 @@
 	MOVQ	R15, 72(SP)
 
 	get_tls(BX)
-	// check that m exists
-	MOVQ	m(BX), BP
-	CMPQ	BP, $0
+	// check that g exists
+	MOVQ	g(BX), R10
+	CMPQ	R10, $0
 	JNE	allgood
 	MOVQ	DI, 0(SP)
 	MOVQ	$runtime·badsignal(SB), AX
@@ -165,13 +168,13 @@
 
 allgood:
 	// save g
-	MOVQ	g(BX), R10
 	MOVQ	R10, 80(SP)
 
 	// Save m->libcall and m->scratch. We need to do this because we
 	// might get interrupted by a signal in runtime·asmcgocall.
 
 	// save m->libcall 
+	MOVQ	g_m(R10), BP
 	LEAQ	m_libcall(BP), R11
 	MOVQ	libcall_fn(R11), R10
 	MOVQ	R10, 88(SP)
@@ -217,7 +220,8 @@
 	CALL	runtime·sighandler(SB)
 
 	get_tls(BX)
-	MOVQ	m(BX), BP
+	MOVQ	g(BX), BP
+	MOVQ	g_m(BP), BP
 	// restore libcall
 	LEAQ	m_libcall(BP), R11
 	MOVQ	88(SP), R10
diff --git a/src/pkg/runtime/sys_windows_386.s b/src/pkg/runtime/sys_windows_386.s
index e0c0631..dc6fc39 100644
--- a/src/pkg/runtime/sys_windows_386.s
+++ b/src/pkg/runtime/sys_windows_386.s
@@ -88,11 +88,10 @@
 
 	// fetch g
 	get_tls(DX)
-	MOVL	m(DX), AX
-	CMPL	AX, $0
+	MOVL	g(DX), DX
+	CMPL	DX, $0
 	JNE	2(PC)
 	CALL	runtime·badsignal2(SB)
-	MOVL	g(DX), DX
 	// call sighandler(ExceptionRecord*, Context*, G*)
 	MOVL	BX, 0(SP)
 	MOVL	CX, 4(SP)
@@ -142,7 +141,6 @@
 
 	LEAL	m_tls(SP), CX
 	MOVL	CX, 0x14(FS)
-	MOVL	SP, m(CX)
 	MOVL	SP, BX
 	SUBL	$g_end, SP		// space for G
 	MOVL	SP, g(CX)
@@ -151,6 +149,8 @@
 	MOVL	SP, 0(SP)
 	MOVL	$g_end, 4(SP)
 	CALL	runtime·memclr(SB)	// smashes AX,BX,CX
+	LEAL	g_end(SP), BX
+	MOVL	BX, g_m(SP)
 	LEAL	-4096(SP), CX
 	MOVL	CX, g_stackguard(SP)
 	MOVL	DX, g_stackbase(SP)
@@ -260,7 +260,7 @@
 	// Set up tls.
 	LEAL	m_tls(CX), SI
 	MOVL	SI, 0x14(FS)
-	MOVL	CX, m(SI)
+	MOVL	CX, g_m(DX)
 	MOVL	DX, g(SI)
 
 	// Someday the convention will be D is always cleared.
@@ -308,7 +308,8 @@
 	CALL	AX
 	RET
 
-	MOVL	m(CX), BP
+	MOVL	g(CX), BP
+	MOVL	g_m(BP), BP
 
 	// leave pc/sp for cpu profiler
 	MOVL	(SP), SI
@@ -337,7 +338,8 @@
 
 usleep1_ret:
 	get_tls(CX)
-	MOVL	m(CX), BP
+	MOVL	g(CX), BP
+	MOVL	g_m(BP), BP
 	MOVL	$0, m_libcallsp(BP)
 	RET
 
diff --git a/src/pkg/runtime/sys_windows_amd64.s b/src/pkg/runtime/sys_windows_amd64.s
index 9484590..c3db2c1 100644
--- a/src/pkg/runtime/sys_windows_amd64.s
+++ b/src/pkg/runtime/sys_windows_amd64.s
@@ -120,11 +120,10 @@
 
 	// fetch g
 	get_tls(DX)
-	MOVQ	m(DX), AX
-	CMPQ	AX, $0
+	MOVQ	g(DX), DX
+	CMPQ	DX, $0
 	JNE	2(PC)
 	CALL	runtime·badsignal2(SB)
-	MOVQ	g(DX), DX
 	// call sighandler(ExceptionRecord*, Context*, G*)
 	MOVQ	BX, 0(SP)
 	MOVQ	CX, 8(SP)
@@ -176,7 +175,6 @@
 
 	LEAQ	m_tls(SP), CX
 	MOVQ	CX, 0x28(GS)
-	MOVQ	SP, m(CX)
 	MOVQ	SP, BX
 	SUBQ	$g_end, SP		// space for G
 	MOVQ	SP, g(CX)
@@ -185,6 +183,9 @@
 	MOVQ	SP, 0(SP)
 	MOVQ	$g_end, 8(SP)
 	CALL	runtime·memclr(SB)	// smashes AX,BX,CX
+	LEAQ	g_end(SP), BX
+	MOVQ	BX, g_m(SP)
+
 	LEAQ	-8192(SP), CX
 	MOVQ	CX, g_stackguard(SP)
 	MOVQ	DX, g_stackbase(SP)
@@ -297,7 +298,7 @@
 	// Set up tls.
 	LEAQ	m_tls(CX), SI
 	MOVQ	SI, 0x28(GS)
-	MOVQ	CX, m(SI)
+	MOVQ	CX, g_m(DX)
 	MOVQ	DX, g(SI)
 
 	// Someday the convention will be D is always cleared.
@@ -328,7 +329,8 @@
 	CALL	AX
 	RET
 
-	MOVQ	m(R15), R13
+	MOVQ	g(R15), R13
+	MOVQ	g_m(R13), R13
 
 	// leave pc/sp for cpu profiler
 	MOVQ	(SP), R12
diff --git a/src/pkg/runtime/traceback_arm.c b/src/pkg/runtime/traceback_arm.c
index d15244c..3595002 100644
--- a/src/pkg/runtime/traceback_arm.c
+++ b/src/pkg/runtime/traceback_arm.c
@@ -236,7 +236,7 @@
 				runtime·printf("\t%S:%d", file, line);
 				if(frame.pc > f->entry)
 					runtime·printf(" +%p", (uintptr)(frame.pc - f->entry));
-				if(m->throwing > 0 && gp == m->curg || gotraceback >= 2)
+				if(g->m->throwing > 0 && gp == g->m->curg || gotraceback >= 2)
 					runtime·printf(" fp=%p sp=%p", frame.fp, frame.sp);
 				runtime·printf("\n");
 				nprint++;
diff --git a/src/pkg/runtime/traceback_x86.c b/src/pkg/runtime/traceback_x86.c
index 851504f..f21469b 100644
--- a/src/pkg/runtime/traceback_x86.c
+++ b/src/pkg/runtime/traceback_x86.c
@@ -277,7 +277,7 @@
 				runtime·printf("\t%S:%d", file, line);
 				if(frame.pc > f->entry)
 					runtime·printf(" +%p", (uintptr)(frame.pc - f->entry));
-				if(m->throwing > 0 && gp == m->curg || gotraceback >= 2)
+				if(g->m->throwing > 0 && gp == g->m->curg || gotraceback >= 2)
 					runtime·printf(" fp=%p sp=%p", frame.fp, frame.sp);
 				runtime·printf("\n");
 				nprint++;
diff --git a/src/pkg/runtime/vlop_arm.s b/src/pkg/runtime/vlop_arm.s
index 80f516e..c336406 100644
--- a/src/pkg/runtime/vlop_arm.s
+++ b/src/pkg/runtime/vlop_arm.s
@@ -72,30 +72,25 @@
 	// registers into G, but they do not need to be kept at the 
 	// usual places a goroutine reschedules (at function calls),
 	// so it would be a waste of 132 bytes per G.
-	MOVW	m_locks(m), R1
+	MOVW	g_m(g), R8
+	MOVW	m_locks(R8), R1
 	ADD	$1, R1
-	MOVW	R1, m_locks(m)
+	MOVW	R1, m_locks(R8)
 	MOVW	$1, R1
-	MOVW	R1, m_softfloat(m)
+	MOVW	R1, m_softfloat(R8)
 	BL	runtime·_sfloat2(SB)
-	MOVW	m_locks(m), R1
+	MOVW	g_m(g), R8
+	MOVW	m_locks(R8), R1
 	SUB	$1, R1
-	MOVW	R1, m_locks(m)
+	MOVW	R1, m_locks(R8)
 	MOVW	$0, R1
-	MOVW	R1, m_softfloat(m)
+	MOVW	R1, m_softfloat(R8)
 	MOVW	R0, 0(R13)
 	MOVW	64(R13), R1
 	WORD	$0xe128f001	// msr cpsr_f, r1
 	MOVW	$12(R13), R0
-	// Restore R1-R8 and R11-R12, but ignore the saved R9 (m) and R10 (g).
-	// Both are maintained by the runtime and always have correct values,
-	// so there is no need to restore old values here.
-	// The g should not have changed, but m may have, if we were preempted
-	// and restarted on a different thread, in which case restoring the old
-	// value is incorrect and will cause serious confusion in the runtime.
-	MOVM.IA.W	(R0), [R1-R8]
-	MOVW	$52(R13), R0
-	MOVM.IA.W	(R0), [R11-R12]
+	// Restore R1-R12 in one load (R9 is no longer reserved for m), then R0.
+	MOVM.IA.W	(R0), [R1-R12]
 	MOVW	8(R13), R0
 	RET