all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previously reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043
diff --git a/src/pkg/runtime/cgo/asm_arm.s b/src/pkg/runtime/cgo/asm_arm.s
index 850b1c6..b989ab9 100644
--- a/src/pkg/runtime/cgo/asm_arm.s
+++ b/src/pkg/runtime/cgo/asm_arm.s
@@ -14,11 +14,11 @@
 	 *  push 2 args for fn (R1 and R2).
 	 * Also note that at procedure entry in 5c/5g world, 4(R13) will be the
 	 *  first arg, so we must push another dummy reg (R0) for 0(R13).
-	 *  Additionally, runtime·load_gm will clobber R0, so we need to save R0
+	 *  Additionally, runtime·load_g will clobber R0, so we need to save R0
 	 *  nevertheless.
 	 */
-	MOVM.WP	[R0, R1, R2, R4, R5, R6, R7, R8, m, g, R11, R12, R14], (R13)
-	BL	runtime·load_gm(SB)
+	MOVM.WP	[R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+	BL	runtime·load_g(SB)
 	MOVW	PC, R14
 	MOVW	0(R13), PC
-	MOVM.IAW	(R13), [R0, R1, R2, R4, R5, R6, R7, R8, m, g, R11, R12, PC]
+	MOVM.IAW	(R13), [R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, PC]
diff --git a/src/pkg/runtime/cgo/callbacks.c b/src/pkg/runtime/cgo/callbacks.c
index e91c8bf..5a4889c 100644
--- a/src/pkg/runtime/cgo/callbacks.c
+++ b/src/pkg/runtime/cgo/callbacks.c
@@ -40,9 +40,9 @@
 
 	ret = runtime·mal(len);
 	c = runtime·mal(sizeof(*c));
-	c->next = m->cgomal;
+	c->next = g->m->cgomal;
 	c->alloc = ret;
-	m->cgomal = c;
+	g->m->cgomal = c;
 	FLUSH(&ret);
 }
 
diff --git a/src/pkg/runtime/cgo/gcc_arm.S b/src/pkg/runtime/cgo/gcc_arm.S
index 17e98d9..336f8ca 100644
--- a/src/pkg/runtime/cgo/gcc_arm.S
+++ b/src/pkg/runtime/cgo/gcc_arm.S
@@ -19,20 +19,19 @@
 .arch armv5t
 
 /*
- * void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void *m, void *g), void *m, void *g)
+ * void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
  * Calling into the 5c tool chain, where all registers are caller save.
  * Called from standard ARM EABI, where r4-r11 are callee-save, so they
  * must be saved explicitly.
  */
-.globl EXT(crosscall_arm2)
-EXT(crosscall_arm2):
+.globl EXT(crosscall_arm1)
+EXT(crosscall_arm1):
 	push {r4, r5, r6, r7, r8, r9, r10, r11, ip, lr}
 	mov r4, r0
 	mov r5, r1
 	mov r0, r2
-	mov r1, r3
-	blx r5 // setmg(m, g) 
+	blx r5 // setg(g)
 	blx r4 // fn() 
 	pop {r4, r5, r6, r7, r8, r9, r10, r11, ip, pc}
 
diff --git a/src/pkg/runtime/cgo/gcc_darwin_386.c b/src/pkg/runtime/cgo/gcc_darwin_386.c
index ad9fb5a..d1ef31e 100644
--- a/src/pkg/runtime/cgo/gcc_darwin_386.c
+++ b/src/pkg/runtime/cgo/gcc_darwin_386.c
@@ -8,46 +8,44 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static pthread_key_t k1, k2;
+static pthread_key_t k1;
 
 #define magic1 (0x23581321U)
 
 static void
 inittls(void)
 {
-	uint32 x, y;
+	uint32 x;
 	pthread_key_t tofree[128], k;
 	int i, ntofree;
-	int havek1, havek2;
 
 	/*
-	 * Allocate thread-local storage slots for m, g.
+	 * Allocate thread-local storage slot for g.
 	 * The key numbers start at 0x100, and we expect to be
 	 * one of the early calls to pthread_key_create, so we
-	 * should be able to get pretty low numbers.
+	 * should be able to get a pretty low number.
 	 *
 	 * In Darwin/386 pthreads, %gs points at the thread
 	 * structure, and each key is an index into the thread-local
 	 * storage array that begins at offset 0x48 within in that structure.
 	 * It may happen that we are not quite the first function to try
 	 * to allocate thread-local storage keys, so instead of depending
-	 * on getting 0x100 and 0x101, we try for 0x108 and 0x109,
-	 * allocating keys until we get the ones we want and then freeing
-	 * the ones we didn't want.
+	 * on getting 0x100, we try for 0x108, allocating keys until
+	 * we get the one we want and then freeing the ones we didn't want.
 	 *
-	 * Thus the final offsets to use in %gs references are
-	 * 0x48+4*0x108 = 0x468 and 0x48+4*0x109 = 0x46c.
+	 * Thus the final offset to use in %gs references is
+	 * 0x48+4*0x108 = 0x468.
 	 *
-	 * The linker and runtime hard-code these constant offsets
-	 * from %gs where we expect to find m and g.
-	 * Known to ../../../cmd/8l/obj.c:/468
+	 * The linker and runtime hard-code this constant offset
+	 * from %gs where we expect to find g.
+	 * Known to ../../../liblink/sym.c:/468
 	 * and to ../sys_darwin_386.s:/468
 	 *
 	 * This is truly disgusting and a bit fragile, but taking care
 	 * of it here protects the rest of the system from damage.
 	 * The alternative would be to use a global variable that
 	 * held the offset and refer to that variable each time we
-	 * need a %gs variable (m or g).  That approach would
+	 * need a %gs variable (g).  That approach would
 	 * require an extra instruction and memory reference in
 	 * every stack growth prolog and would also require
 	 * rewriting the code that 8c generates for extern registers.
@@ -63,39 +61,32 @@
 	 * storage until we find a key that writes to the memory location
 	 * we want.  Then keep that key.
 	 */
-	havek1 = 0;
-	havek2 = 0;
 	ntofree = 0;
-	while(!havek1 || !havek2) {
+	for(;;) {
 		if(pthread_key_create(&k, nil) < 0) {
 			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
 			abort();
 		}
 		pthread_setspecific(k, (void*)magic1);
 		asm volatile("movl %%gs:0x468, %0" : "=r"(x));
-		asm volatile("movl %%gs:0x46c, %0" : "=r"(y));
-		if(x == magic1) {
-			havek1 = 1;
-			k1 = k;
-		} else if(y == magic1) {
-			havek2 = 1;
-			k2 = k;
-		} else {
-			if(ntofree >= nelem(tofree)) {
-				fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
-				fprintf(stderr, "\ttried");
-				for(i=0; i<ntofree; i++)
-					fprintf(stderr, " %#x", (unsigned)tofree[i]);
-				fprintf(stderr, "\n");
-				abort();
-			}
-			tofree[ntofree++] = k;
-		}
 		pthread_setspecific(k, 0);
+		if(x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
 	}
 
 	/*
-	 * We got the keys we wanted.  Free the others.
+	 * We got the key we wanted.  Free the others.
 	 */
 	for(i=0; i<ntofree; i++)
 		pthread_key_delete(tofree[i]);
@@ -158,7 +149,6 @@
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096;
 
 	pthread_setspecific(k1, (void*)ts.g);
-	pthread_setspecific(k2, (void*)ts.m);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_darwin_amd64.c b/src/pkg/runtime/cgo/gcc_darwin_amd64.c
index 65d3816..358a281 100644
--- a/src/pkg/runtime/cgo/gcc_darwin_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_darwin_amd64.c
@@ -8,64 +8,56 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static pthread_key_t k1, k2;
+static pthread_key_t k1;
 
 #define magic1 (0x23581321345589ULL)
 
 static void
 inittls(void)
 {
-	uint64 x, y;
+	uint64 x;
 	pthread_key_t tofree[128], k;
 	int i, ntofree;
-	int havek1, havek2;
 
 	/*
 	 * Same logic, code as darwin_386.c:/inittls, except that words
 	 * are 8 bytes long now, and the thread-local storage starts
-	 * at 0x60 on Leopard / Snow Leopard. So the offsets are
-	 * 0x60+8*0x108 = 0x8a0 and 0x60+8*0x109 = 0x8a8.
+	 * at 0x60 on Leopard / Snow Leopard. So the offset is
+	 * 0x60+8*0x108 = 0x8a0.
 	 *
-	 * The linker and runtime hard-code these constant offsets
-	 * from %gs where we expect to find m and g.
-	 * Known to ../../../cmd/6l/obj.c:/8a0
+	 * The linker and runtime hard-code this constant offset
+	 * from %gs where we expect to find g.
+	 * Known to ../../../liblink/sym.c:/8a0
 	 * and to ../sys_darwin_amd64.s:/8a0
 	 *
 	 * As disgusting as on the 386; same justification.
 	 */
-	havek1 = 0;
-	havek2 = 0;
 	ntofree = 0;
-	while(!havek1 || !havek2) {
+	for(;;) {
 		if(pthread_key_create(&k, nil) < 0) {
 			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
 			abort();
 		}
 		pthread_setspecific(k, (void*)magic1);
 		asm volatile("movq %%gs:0x8a0, %0" : "=r"(x));
-		asm volatile("movq %%gs:0x8a8, %0" : "=r"(y));
-		if(x == magic1) {
-			havek1 = 1;
-			k1 = k;
-		} else if(y == magic1) {
-			havek2 = 1;
-			k2 = k;
-		} else {
-			if(ntofree >= nelem(tofree)) {
-				fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
-				fprintf(stderr, "\ttried");
-				for(i=0; i<ntofree; i++)
-					fprintf(stderr, " %#x", (unsigned)tofree[i]);
-				fprintf(stderr, "\n");
-				abort();
-			}
-			tofree[ntofree++] = k;
-		}
 		pthread_setspecific(k, 0);
+		if(x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
 	}
 
 	/*
-	 * We got the keys we wanted.  Free the others.
+	 * We got the key we wanted.  Free the others.
 	 */
 	for(i=0; i<ntofree; i++)
 		pthread_key_delete(tofree[i]);
@@ -128,7 +120,6 @@
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096;
 
 	pthread_setspecific(k1, (void*)ts.g);
-	pthread_setspecific(k2, (void*)ts.m);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_dragonfly_386.c b/src/pkg/runtime/cgo/gcc_dragonfly_386.c
index 695c166..6af61ac 100644
--- a/src/pkg/runtime/cgo/gcc_dragonfly_386.c
+++ b/src/pkg/runtime/cgo/gcc_dragonfly_386.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c b/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
index a46c121..a29d522 100644
--- a/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_dragonfly_amd64.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_386.c b/src/pkg/runtime/cgo/gcc_freebsd_386.c
index 695c166..6af61ac 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_386.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
index a46c121..a29d522 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c
@@ -10,15 +10,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_freebsd_arm.c b/src/pkg/runtime/cgo/gcc_freebsd_arm.c
index 6175e1d..16530f0 100644
--- a/src/pkg/runtime/cgo/gcc_freebsd_arm.c
+++ b/src/pkg/runtime/cgo/gcc_freebsd_arm.c
@@ -21,15 +21,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -67,7 +67,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void *g, void *m);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -84,6 +84,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_linux_386.c b/src/pkg/runtime/cgo/gcc_linux_386.c
index 0a46c9b..5b282c9 100644
--- a/src/pkg/runtime/cgo/gcc_linux_386.c
+++ b/src/pkg/runtime/cgo/gcc_linux_386.c
@@ -8,15 +8,15 @@
 #include "libcgo.h"
 
 static void *threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -73,7 +73,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_linux_amd64.c b/src/pkg/runtime/cgo/gcc_linux_amd64.c
index c530183b..19ca580 100644
--- a/src/pkg/runtime/cgo/gcc_linux_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_linux_amd64.c
@@ -8,15 +8,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G* g, void (*setmg)(void*, void*))
+x_cgo_init(G* g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -68,7 +68,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_linux_arm.c b/src/pkg/runtime/cgo/gcc_linux_arm.c
index 0325681..3b108fe 100644
--- a/src/pkg/runtime/cgo/gcc_linux_arm.c
+++ b/src/pkg/runtime/cgo/gcc_linux_arm.c
@@ -9,15 +9,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -55,7 +55,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void*, void*);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -72,6 +72,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_386.c b/src/pkg/runtime/cgo/gcc_netbsd_386.c
index 28690cc..a2b7ef3 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_386.c
@@ -9,15 +9,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -69,7 +69,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_amd64.c b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
index 6e0482d..ccd08b7 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c
@@ -9,15 +9,15 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -70,7 +70,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_netbsd_arm.c b/src/pkg/runtime/cgo/gcc_netbsd_arm.c
index ba2ae25..5c0603d 100644
--- a/src/pkg/runtime/cgo/gcc_netbsd_arm.c
+++ b/src/pkg/runtime/cgo/gcc_netbsd_arm.c
@@ -10,15 +10,15 @@
 
 static void *threadentry(void*);
 
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -51,7 +51,7 @@
 	}
 }
 
-extern void crosscall_arm2(void (*fn)(void), void (*setmg_gcc)(void*, void*), void *g, void *m);
+extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
 static void*
 threadentry(void *v)
 {
@@ -68,6 +68,6 @@
 	 */
 	ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2;
 
-	crosscall_arm2(ts.fn, setmg_gcc, (void*)ts.m, (void*)ts.g);
+	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/pkg/runtime/cgo/gcc_openbsd_386.c b/src/pkg/runtime/cgo/gcc_openbsd_386.c
index e682c37..48b4bc7 100644
--- a/src/pkg/runtime/cgo/gcc_openbsd_386.c
+++ b/src/pkg/runtime/cgo/gcc_openbsd_386.c
@@ -11,7 +11,7 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 // TCB_SIZE is sizeof(struct thread_control_block),
 // as defined in /usr/src/lib/librthread/tcb.h
@@ -83,13 +83,13 @@
 }
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 	void *handle;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -158,7 +158,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_386(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_openbsd_amd64.c b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
index 64d29a9..5f0d3bb 100644
--- a/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c
@@ -11,7 +11,7 @@
 #include "libcgo.h"
 
 static void* threadentry(void*);
-static void (*setmg_gcc)(void*, void*);
+static void (*setg_gcc)(void*);
 
 // TCB_SIZE is sizeof(struct thread_control_block),
 // as defined in /usr/src/lib/librthread/tcb.h
@@ -83,13 +83,13 @@
 }
 
 void
-x_cgo_init(G *g, void (*setmg)(void*, void*))
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 	void *handle;
 
-	setmg_gcc = setmg;
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stackguard = (uintptr)&attr - size + 4096;
@@ -159,7 +159,7 @@
 	/*
 	 * Set specific keys.
 	 */
-	setmg_gcc((void*)ts.m, (void*)ts.g);
+	setg_gcc((void*)ts.g);
 
 	crosscall_amd64(ts.fn);
 	return nil;
diff --git a/src/pkg/runtime/cgo/gcc_windows_386.c b/src/pkg/runtime/cgo/gcc_windows_386.c
index cdc8664..0935b74 100644
--- a/src/pkg/runtime/cgo/gcc_windows_386.c
+++ b/src/pkg/runtime/cgo/gcc_windows_386.c
@@ -54,8 +54,7 @@
 		"movl %0, %%fs:0x14\n"	// MOVL tls0, 0x14(FS)
 		"movl %%fs:0x14, %%eax\n"	// MOVL 0x14(FS), tmp
 		"movl %1, 0(%%eax)\n"	// MOVL g, 0(FS)
-		"movl %2, 4(%%eax)\n"	// MOVL m, 4(FS)
-		:: "r"(ts.tls), "r"(ts.g), "r"(ts.m) : "%eax"
+		:: "r"(ts.tls), "r"(ts.g) : "%eax"
 	);
 	
 	crosscall_386(ts.fn);
diff --git a/src/pkg/runtime/cgo/gcc_windows_amd64.c b/src/pkg/runtime/cgo/gcc_windows_amd64.c
index d8dd69b..4a2540a 100644
--- a/src/pkg/runtime/cgo/gcc_windows_amd64.c
+++ b/src/pkg/runtime/cgo/gcc_windows_amd64.c
@@ -54,8 +54,7 @@
 	  "movq %0, %%gs:0x28\n"	// MOVL tls0, 0x28(GS)
 	  "movq %%gs:0x28, %%rax\n" // MOVQ 0x28(GS), tmp
 	  "movq %1, 0(%%rax)\n" // MOVQ g, 0(GS)
-	  "movq %2, 8(%%rax)\n" // MOVQ m, 8(GS)
-	  :: "r"(ts.tls), "r"(ts.g), "r"(ts.m) : "%rax"
+	  :: "r"(ts.tls), "r"(ts.g) : "%rax"
 	);
 
 	crosscall_amd64(ts.fn);
diff --git a/src/pkg/runtime/cgo/libcgo.h b/src/pkg/runtime/cgo/libcgo.h
index 65ea3f3..251fb4c 100644
--- a/src/pkg/runtime/cgo/libcgo.h
+++ b/src/pkg/runtime/cgo/libcgo.h
@@ -32,7 +32,6 @@
 typedef struct ThreadStart ThreadStart;
 struct ThreadStart
 {
-	uintptr m;
 	G *g;
 	uintptr *tls;
 	void (*fn)(void);