| // Copyright 2009 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "runtime.h" |
| #include "arch_GOARCH.h" |
| #include "malloc.h" |
| #include "stack.h" |
| #include "race.h" |
| #include "type.h" |
| |
| bool runtime·iscgo; |
| |
| static void schedule(G*); |
| |
| typedef struct Sched Sched; |
| |
| M runtime·m0; |
| G runtime·g0; // idle goroutine for m0 |
| |
| static int32 debug = 0; |
| |
| int32 runtime·gcwaiting; |
| |
| G* runtime·allg; |
| G* runtime·lastg; |
| M* runtime·allm; |
| M* runtime·extram; |
| |
| int8* runtime·goos; |
| int32 runtime·ncpu; |
| |
| // Go scheduler |
| // |
| // The go scheduler's job is to match ready-to-run goroutines (`g's) |
| // with waiting-for-work schedulers (`m's). If there are ready g's |
| // and no waiting m's, ready() will start a new m running in a new |
| // OS thread, so that all ready g's can run simultaneously, up to a limit. |
| // For now, m's never go away. |
| // |
| // By default, Go keeps only one kernel thread (m) running user code |
| // at a time; other threads may be blocked in the operating system. |
| // Setting the environment variable $GOMAXPROCS or calling |
| // runtime.GOMAXPROCS() will change the number of threads |
| // allowed to execute user code simultaneously. $GOMAXPROCS is thus an |
| // approximation of the maximum number of cores to use. |
| // |
| // Even a program that can run without deadlock in a single process |
| // might use more m's if given the chance. For example, the prime |
| // sieve will use as many m's as there are primes (up to runtime·sched.mmax), |
| // allowing different stages of the pipeline to execute in parallel. |
| // We could revisit this choice, only kicking off new m's for blocking |
| // system calls, but that would limit the amount of parallel computation |
| // that go would try to do. |
| // |
| // In general, one could imagine all sorts of refinements to the |
| // scheduler, but the goal now is just to get something working on |
| // Linux and OS X. |
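| // |
| // A rough sketch of the flow implemented below (illustration only): |
| // a go statement calls runtime·newproc -> runtime·newproc1, which queues |
| // the new g via newprocreadylocked -> readylocked -> gput and then calls |
| // matchmg to pair ready g's with m's (starting a new m with runtime·newm |
| // if none is waiting); each m loops in schedule/nextgandunlock, picking |
| // the next g and jumping to it with runtime·gogo. |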
| |
| struct Sched { |
| Lock; |
| |
| G *gfree; // available g's (status == Gdead) |
| int64 goidgen; |
| |
| G *ghead; // g's waiting to run |
| G *gtail; |
| int32 gwait; // number of g's waiting to run |
| int32 gcount; // number of g's that are alive |
| int32 grunning; // number of g's running on cpu or in syscall |
| |
| M *mhead; // m's waiting for work |
| int32 mwait; // number of m's waiting for work |
| int32 mcount; // number of m's that have been created |
| |
| volatile uint32 atomic; // atomic scheduling word (see below) |
| |
| int32 profilehz; // cpu profiling rate |
| |
| bool init; // running initialization |
| |
| Note stopped; // one g can set waitstop and wait here for m's to stop |
| }; |
| |
| // The atomic word in sched is an atomic uint32 that |
| // holds these fields. |
| // |
| // [15 bits] mcpu number of m's executing on cpu |
| // [15 bits] mcpumax max number of m's allowed on cpu |
| // [1 bit] waitstop some g is waiting on stopped |
| // [1 bit] gwaiting gwait != 0 |
| // |
| // These fields are the information needed by entersyscall |
| // and exitsyscall to decide whether to coordinate with the |
| // scheduler. Packing them into a single machine word lets |
| // them use a fast path with a single atomic read/write and |
| // no lock/unlock. This greatly reduces contention in |
| // syscall- or cgo-heavy multithreaded programs. |
| // |
| // Except for entersyscall and exitsyscall, the manipulations |
| // to these fields only happen while holding the schedlock, |
| // so the routines holding schedlock only need to worry about |
| // what entersyscall and exitsyscall do, not the other routines |
| // (which also use the schedlock). |
| // |
| // In particular, entersyscall and exitsyscall only read mcpumax, |
| // waitstop, and gwaiting. They never write them. Thus, writes to those |
| // fields can be done (holding schedlock) without fear of write conflicts. |
| // There may still be logic conflicts: for example, the setting of waitstop |
| // must be conditioned on mcpu >= mcpumax or else the wait may be a |
| // spurious sleep. The Promela model in proc.p verifies these accesses. |
| enum { |
| mcpuWidth = 15, |
| mcpuMask = (1<<mcpuWidth) - 1, |
| mcpuShift = 0, |
| mcpumaxShift = mcpuShift + mcpuWidth, |
| waitstopShift = mcpumaxShift + mcpuWidth, |
| gwaitingShift = waitstopShift+1, |
| |
| // The max value of GOMAXPROCS is constrained |
| // by the max value we can store in the bit fields |
| // of the atomic word. Reserve a few high values |
| // so that we can detect accidental decrement |
| // beyond zero. |
| maxgomaxprocs = mcpuMask - 10, |
| }; |
| |
| #define atomic_mcpu(v) (((v)>>mcpuShift)&mcpuMask) |
| #define atomic_mcpumax(v) (((v)>>mcpumaxShift)&mcpuMask) |
| #define atomic_waitstop(v) (((v)>>waitstopShift)&1) |
| #define atomic_gwaiting(v) (((v)>>gwaitingShift)&1) |
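| |
| // For illustration only (not used by the runtime): a word encoding |
| // mcpu=2, mcpumax=4, waitstop=0, gwaiting=1 is |
| //	v = (2<<mcpuShift) + (4<<mcpumaxShift) + (1<<gwaitingShift) |
| // and decodes as atomic_mcpu(v) == 2, atomic_mcpumax(v) == 4, |
| // atomic_waitstop(v) == 0, atomic_gwaiting(v) == 1. |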
| |
| Sched runtime·sched; |
| int32 runtime·gomaxprocs; |
| bool runtime·singleproc; |
| |
| static bool canaddmcpu(void); |
| |
| // An m that is waiting for notewakeup(&m->havenextg). This may |
| // only be accessed while the scheduler lock is held. This is used to |
| // minimize the number of times we call notewakeup while the scheduler |
| // lock is held, since the m will normally move quickly to lock the |
| // scheduler itself, producing lock contention. |
| static M* mwakeup; |
| |
| // Scheduling helpers. Sched must be locked. |
| static void gput(G*); // put/get on ghead/gtail |
| static G* gget(void); |
| static void mput(M*); // put/get on mhead |
| static M* mget(G*); |
| static void gfput(G*); // put/get on gfree |
| static G* gfget(void); |
| static void matchmg(void); // match m's to g's |
| static void readylocked(G*); // ready, but sched is locked |
| static void mnextg(M*, G*); |
| static void mcommoninit(M*); |
| static void runqput(P*, G*); |
| static G* runqget(P*); |
| static void runqgrow(P*); |
| static G* runqsteal(P*, P*); |
| |
| static void |
| setmcpumax(uint32 n) |
| { |
| uint32 v, w; |
| |
| for(;;) { |
| v = runtime·sched.atomic; |
| w = v; |
| w &= ~(mcpuMask<<mcpumaxShift); |
| w |= n<<mcpumaxShift; |
| if(runtime·cas(&runtime·sched.atomic, v, w)) |
| break; |
| } |
| } |
| |
| // Keep track of the scavenger's goroutine for deadlock detection. |
| static G *scvg; |
| |
| // The bootstrap sequence is: |
| // |
| // call osinit |
| // call schedinit |
| // make & queue new G |
| // call runtime·mstart |
| // |
| // The new G calls runtime·main. |
| void |
| runtime·schedinit(void) |
| { |
| int32 n; |
| byte *p; |
| |
| m->nomemprof++; |
| runtime·mprofinit(); |
| runtime·mallocinit(); |
| mcommoninit(m); |
| |
| runtime·goargs(); |
| runtime·goenvs(); |
| |
| // For debugging: |
| // Allocate internal symbol table representation now, |
| // so that we don't need to call malloc when we crash. |
| // runtime·findfunc(0); |
| |
| runtime·gomaxprocs = 1; |
| p = runtime·getenv("GOMAXPROCS"); |
| if(p != nil && (n = runtime·atoi(p)) != 0) { |
| if(n > maxgomaxprocs) |
| n = maxgomaxprocs; |
| runtime·gomaxprocs = n; |
| } |
| // wait for the main goroutine to start before taking |
| // GOMAXPROCS into account. |
| setmcpumax(1); |
| runtime·singleproc = runtime·gomaxprocs == 1; |
| |
| canaddmcpu(); // mcpu++ to account for bootstrap m |
| m->helpgc = 1; // flag to tell schedule() to mcpu-- |
| runtime·sched.grunning++; |
| |
| mstats.enablegc = 1; |
| m->nomemprof--; |
| |
| if(raceenabled) |
| g->racectx = runtime·raceinit(); |
| } |
| |
| extern void main·init(void); |
| extern void main·main(void); |
| |
| static FuncVal scavenger = {runtime·MHeap_Scavenger}; |
| |
| // The main goroutine. |
| void |
| runtime·main(void) |
| { |
| // Lock the main goroutine onto this, the main OS thread, |
| // during initialization. Most programs won't care, but a few |
| // do require certain calls to be made by the main thread. |
| // Those can arrange for main.main to run in the main thread |
| // by calling runtime.LockOSThread during initialization |
| // to preserve the lock. |
| runtime·lockOSThread(); |
| if(m != &runtime·m0) |
| runtime·throw("runtime·main not on m0"); |
| // From now on, new goroutines may use non-main threads. |
| setmcpumax(runtime·gomaxprocs); |
| runtime·sched.init = true; |
| scvg = runtime·newproc1(&scavenger, nil, 0, 0, runtime·main); |
| scvg->issystem = true; |
| // The deadlock detection has false negatives. |
| // Let scvg start up, to eliminate the false negative |
| // for the trivial program func main() { select{} }. |
| runtime·gosched(); |
| main·init(); |
| runtime·sched.init = false; |
| runtime·unlockOSThread(); |
| |
| main·main(); |
| if(raceenabled) |
| runtime·racefini(); |
| |
| // Make racy client program work: if panicking on |
| // another goroutine at the same time as main returns, |
| // let the other goroutine finish printing the panic trace. |
| // Once it does, it will exit. See issue 3934. |
| if(runtime·panicking) |
| runtime·park(nil, nil, "panicwait"); |
| |
| runtime·exit(0); |
| for(;;) |
| *(int32*)runtime·main = 0; |
| } |
| |
| // Lock the scheduler. |
| static void |
| schedlock(void) |
| { |
| runtime·lock(&runtime·sched); |
| } |
| |
| // Unlock the scheduler. |
| static void |
| schedunlock(void) |
| { |
| M *mp; |
| |
| mp = mwakeup; |
| mwakeup = nil; |
| runtime·unlock(&runtime·sched); |
| if(mp != nil) |
| runtime·notewakeup(&mp->havenextg); |
| } |
| |
| void |
| runtime·goexit(void) |
| { |
| if(raceenabled) |
| runtime·racegoend(); |
| g->status = Gmoribund; |
| runtime·gosched(); |
| } |
| |
| void |
| runtime·goroutineheader(G *gp) |
| { |
| int8 *status; |
| |
| switch(gp->status) { |
| case Gidle: |
| status = "idle"; |
| break; |
| case Grunnable: |
| status = "runnable"; |
| break; |
| case Grunning: |
| status = "running"; |
| break; |
| case Gsyscall: |
| status = "syscall"; |
| break; |
| case Gwaiting: |
| if(gp->waitreason) |
| status = gp->waitreason; |
| else |
| status = "waiting"; |
| break; |
| case Gmoribund: |
| status = "moribund"; |
| break; |
| default: |
| status = "???"; |
| break; |
| } |
| runtime·printf("goroutine %D [%s]:\n", gp->goid, status); |
| } |
| |
| void |
| runtime·tracebackothers(G *me) |
| { |
| G *gp; |
| int32 traceback; |
| |
| traceback = runtime·gotraceback(); |
| for(gp = runtime·allg; gp != nil; gp = gp->alllink) { |
| if(gp == me || gp->status == Gdead) |
| continue; |
| if(gp->issystem && traceback < 2) |
| continue; |
| runtime·printf("\n"); |
| runtime·goroutineheader(gp); |
| runtime·traceback(gp->sched.pc, (byte*)gp->sched.sp, 0, gp); |
| } |
| } |
| |
| // Mark this g as m's idle goroutine. |
| // This functionality might be used in environments where programs |
| // are limited to a single thread, to simulate a select-driven |
| // network server. It is not exposed via the standard runtime API. |
| void |
| runtime·idlegoroutine(void) |
| { |
| if(g->idlem != nil) |
| runtime·throw("g is already an idle goroutine"); |
| g->idlem = m; |
| } |
| |
| static void |
| mcommoninit(M *mp) |
| { |
| mp->id = runtime·sched.mcount++; |
| mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks(); |
| |
| if(mp->mcache == nil) |
| mp->mcache = runtime·allocmcache(); |
| |
| runtime·callers(1, mp->createstack, nelem(mp->createstack)); |
| |
| runtime·mpreinit(mp); |
| |
| // Add to runtime·allm so garbage collector doesn't free m |
| // when it is just in a register or thread-local storage. |
| mp->alllink = runtime·allm; |
| // runtime·NumCgoCall() iterates over allm w/o schedlock, |
| // so we need to publish it safely. |
| runtime·atomicstorep(&runtime·allm, mp); |
| } |
| |
| // Try to increment mcpu. Report whether it succeeded. |
| static bool |
| canaddmcpu(void) |
| { |
| uint32 v; |
| |
| for(;;) { |
| v = runtime·sched.atomic; |
| if(atomic_mcpu(v) >= atomic_mcpumax(v)) |
| return 0; |
| if(runtime·cas(&runtime·sched.atomic, v, v+(1<<mcpuShift))) |
| return 1; |
| } |
| } |
| |
| // Put on `g' queue. Sched must be locked. |
| static void |
| gput(G *gp) |
| { |
| // If g is the idle goroutine for an m, hand it off. |
| if(gp->idlem != nil) { |
| if(gp->idlem->idleg != nil) { |
| runtime·printf("m%d idle out of sync: g%D g%D\n", |
| gp->idlem->id, |
| gp->idlem->idleg->goid, gp->goid); |
| runtime·throw("runtime: double idle"); |
| } |
| gp->idlem->idleg = gp; |
| return; |
| } |
| |
| gp->schedlink = nil; |
| if(runtime·sched.ghead == nil) |
| runtime·sched.ghead = gp; |
| else |
| runtime·sched.gtail->schedlink = gp; |
| runtime·sched.gtail = gp; |
| |
| // increment gwait. |
| // if it transitions to nonzero, set atomic gwaiting bit. |
| if(runtime·sched.gwait++ == 0) |
| runtime·xadd(&runtime·sched.atomic, 1<<gwaitingShift); |
| } |
| |
| // Report whether gget would return something. |
| static bool |
| haveg(void) |
| { |
| return runtime·sched.ghead != nil || m->idleg != nil; |
| } |
| |
| // Get from `g' queue. Sched must be locked. |
| static G* |
| gget(void) |
| { |
| G *gp; |
| |
| gp = runtime·sched.ghead; |
| if(gp) { |
| runtime·sched.ghead = gp->schedlink; |
| if(runtime·sched.ghead == nil) |
| runtime·sched.gtail = nil; |
| // decrement gwait. |
| // if it transitions to zero, clear atomic gwaiting bit. |
| if(--runtime·sched.gwait == 0) |
| runtime·xadd(&runtime·sched.atomic, -1<<gwaitingShift); |
| } else if(m->idleg != nil) { |
| gp = m->idleg; |
| m->idleg = nil; |
| } |
| return gp; |
| } |
| |
| // Put on `m' list. Sched must be locked. |
| static void |
| mput(M *mp) |
| { |
| mp->schedlink = runtime·sched.mhead; |
| runtime·sched.mhead = mp; |
| runtime·sched.mwait++; |
| } |
| |
| // Get an `m' to run `g'. Sched must be locked. |
| static M* |
| mget(G *gp) |
| { |
| M *mp; |
| |
| // if g has its own m, use it. |
| if(gp && (mp = gp->lockedm) != nil) |
| return mp; |
| |
| // otherwise use general m pool. |
| if((mp = runtime·sched.mhead) != nil) { |
| runtime·sched.mhead = mp->schedlink; |
| runtime·sched.mwait--; |
| } |
| return mp; |
| } |
| |
| // Mark g ready to run. |
| void |
| runtime·ready(G *gp) |
| { |
| schedlock(); |
| readylocked(gp); |
| schedunlock(); |
| } |
| |
| // Mark g ready to run. Sched is already locked. |
| // G might be running already and about to stop. |
| // The sched lock protects g->status from changing underfoot. |
| static void |
| readylocked(G *gp) |
| { |
| if(gp->m) { |
| // Running on another machine. |
| // Ready it when it stops. |
| gp->readyonstop = 1; |
| return; |
| } |
| |
| // Mark runnable. |
| if(gp->status == Grunnable || gp->status == Grunning) { |
| runtime·printf("goroutine %D has status %d\n", gp->goid, gp->status); |
| runtime·throw("bad g->status in ready"); |
| } |
| gp->status = Grunnable; |
| |
| gput(gp); |
| matchmg(); |
| } |
| |
| static void |
| nop(void) |
| { |
| } |
| |
| // Same as readylocked but a different symbol so that |
| // debuggers can set a breakpoint here and catch all |
| // new goroutines. |
| static void |
| newprocreadylocked(G *gp) |
| { |
| nop(); // avoid inlining in 6l |
| readylocked(gp); |
| } |
| |
| // Pass g to m for running. |
| // Caller has already incremented mcpu. |
| static void |
| mnextg(M *mp, G *gp) |
| { |
| runtime·sched.grunning++; |
| mp->nextg = gp; |
| if(mp->waitnextg) { |
| mp->waitnextg = 0; |
| if(mwakeup != nil) |
| runtime·notewakeup(&mwakeup->havenextg); |
| mwakeup = mp; |
| } |
| } |
| |
| // Get the next goroutine that m should run. |
| // Sched must be locked on entry, is unlocked on exit. |
| // Makes sure that at most $GOMAXPROCS g's are |
| // running on cpus (not in system calls) at any given time. |
| static G* |
| nextgandunlock(void) |
| { |
| G *gp; |
| uint32 v; |
| |
| top: |
| if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs) |
| runtime·throw("negative mcpu"); |
| |
| // If there is a g waiting as m->nextg, the mcpu++ |
| // happened before it was passed to mnextg. |
| if(m->nextg != nil) { |
| gp = m->nextg; |
| m->nextg = nil; |
| schedunlock(); |
| return gp; |
| } |
| |
| if(m->lockedg != nil) { |
| // We can only run one g, and it's not available. |
| // Make sure some other cpu is running to handle |
| // the ordinary run queue. |
| if(runtime·sched.gwait != 0) { |
| matchmg(); |
| // m->lockedg might have been on the queue. |
| if(m->nextg != nil) { |
| gp = m->nextg; |
| m->nextg = nil; |
| schedunlock(); |
| return gp; |
| } |
| } |
| } else { |
| // Look for work on global queue. |
| while(haveg() && canaddmcpu()) { |
| gp = gget(); |
| if(gp == nil) |
| runtime·throw("gget inconsistency"); |
| |
| if(gp->lockedm) { |
| mnextg(gp->lockedm, gp); |
| continue; |
| } |
| runtime·sched.grunning++; |
| schedunlock(); |
| return gp; |
| } |
| |
| // The while loop ended either because the g queue is empty |
| // or because we have maxed out our m procs running go |
| // code (mcpu >= mcpumax). We need to check that |
| // concurrent actions by entersyscall/exitsyscall cannot |
| // invalidate the decision to end the loop. |
| // |
| // We hold the sched lock, so no one else is manipulating the |
| // g queue or changing mcpumax. Entersyscall can decrement |
| // mcpu, but if it does so when there is something on the g queue, |
| // the gwait bit will be set, so entersyscall will take the slow path |
| // and use the sched lock. So it cannot invalidate our decision. |
| // |
| // Wait on global m queue. |
| mput(m); |
| } |
| |
| // Look for deadlock situation. |
| // There is a race with the scavenger that causes false negatives: |
| // if the scavenger is just starting, then we have |
| // scvg != nil && grunning == 0 && gwait == 0 |
| // and we do not detect a deadlock. It is possible that we should |
| // add that case to the if statement here, but it is too close to Go 1 |
| // to make such a subtle change. Instead, we work around the |
| // false negative in trivial programs by calling runtime.gosched |
| // from the main goroutine just before main.main. |
| // See runtime·main above. |
| // |
| // On a related note, it is also possible that the scvg == nil case is |
| // wrong and should include gwait, but that does not happen in |
| // standard Go programs, which all start the scavenger. |
| // |
| if((scvg == nil && runtime·sched.grunning == 0) || |
| (scvg != nil && runtime·sched.grunning == 1 && runtime·sched.gwait == 0 && |
| (scvg->status == Grunning || scvg->status == Gsyscall))) { |
| m->throwing = -1; // do not dump full stacks |
| runtime·throw("all goroutines are asleep - deadlock!"); |
| } |
| |
| m->nextg = nil; |
| m->waitnextg = 1; |
| runtime·noteclear(&m->havenextg); |
| |
| // Stoptheworld is waiting for all cpus but its own to stop running. |
| // Entersyscall might have decremented mcpu too, but if so |
| // it will see the waitstop and take the slow path. |
| // Exitsyscall never increments mcpu beyond mcpumax. |
| v = runtime·atomicload(&runtime·sched.atomic); |
| if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { |
| // set waitstop = 0 (known to be 1) |
| runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); |
| runtime·notewakeup(&runtime·sched.stopped); |
| } |
| schedunlock(); |
| |
| runtime·notesleep(&m->havenextg); |
| if(m->helpgc) { |
| runtime·gchelper(); |
| m->helpgc = 0; |
| runtime·lock(&runtime·sched); |
| goto top; |
| } |
| if((gp = m->nextg) == nil) |
| runtime·throw("bad m->nextg in nextgoroutine"); |
| m->nextg = nil; |
| return gp; |
| } |
| |
| int32 |
| runtime·gcprocs(void) |
| { |
| int32 n; |
| |
| // Figure out how many CPUs to use during GC. |
| // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. |
| runtime·lock(&runtime·sched); |
| n = runtime·gomaxprocs; |
| if(n > runtime·ncpu) |
| n = runtime·ncpu; |
| if(n > MaxGcproc) |
| n = MaxGcproc; |
| if(n > runtime·sched.mwait+1) // one M is currently running |
| n = runtime·sched.mwait+1; |
| runtime·unlock(&runtime·sched); |
| return n; |
| } |
| |
| static bool |
| needaddgcproc(void) |
| { |
| int32 n; |
| |
| runtime·lock(&runtime·sched); |
| n = runtime·gomaxprocs; |
| if(n > runtime·ncpu) |
| n = runtime·ncpu; |
| if(n > MaxGcproc) |
| n = MaxGcproc; |
| n -= runtime·sched.mwait+1; // one M is currently running |
| runtime·unlock(&runtime·sched); |
| return n > 0; |
| } |
| |
| void |
| runtime·helpgc(int32 nproc) |
| { |
| M *mp; |
| int32 n; |
| |
| runtime·lock(&runtime·sched); |
| for(n = 1; n < nproc; n++) { // one M is currently running |
| mp = mget(nil); |
| if(mp == nil) |
| runtime·throw("runtime·gcprocs inconsistency"); |
| mp->helpgc = 1; |
| mp->waitnextg = 0; |
| runtime·notewakeup(&mp->havenextg); |
| } |
| runtime·unlock(&runtime·sched); |
| } |
| |
| void |
| runtime·stoptheworld(void) |
| { |
| uint32 v; |
| |
| schedlock(); |
| runtime·gcwaiting = 1; |
| |
| setmcpumax(1); |
| |
| // while mcpu > 1 |
| for(;;) { |
| v = runtime·sched.atomic; |
| if(atomic_mcpu(v) <= 1) |
| break; |
| |
| // It would be unsafe for multiple threads to be using |
| // the stopped note at once, but there is only |
| // ever one thread doing garbage collection. |
| runtime·noteclear(&runtime·sched.stopped); |
| if(atomic_waitstop(v)) |
| runtime·throw("invalid waitstop"); |
| |
| // atomic { waitstop = 1 }, predicated on mcpu <= 1 check above |
| // still being true. |
| if(!runtime·cas(&runtime·sched.atomic, v, v+(1<<waitstopShift))) |
| continue; |
| |
| schedunlock(); |
| runtime·notesleep(&runtime·sched.stopped); |
| schedlock(); |
| } |
| runtime·singleproc = runtime·gomaxprocs == 1; |
| schedunlock(); |
| } |
| |
| void |
| runtime·starttheworld(void) |
| { |
| M *mp; |
| bool add; |
| |
| add = needaddgcproc(); |
| schedlock(); |
| runtime·gcwaiting = 0; |
| setmcpumax(runtime·gomaxprocs); |
| matchmg(); |
| if(add && canaddmcpu()) { |
| // If GC could have used another helper proc, start one now, |
| // in the hope that it will be available next time. |
| // It would have been even better to start it before the collection, |
| // but doing so requires allocating memory, so it's tricky to |
| // coordinate. This lazy approach works out in practice: |
| // we don't mind if the first couple gc rounds don't have quite |
| // the maximum number of procs. |
| // canaddmcpu above did mcpu++ |
| // (necessary, because m will be doing various |
| // initialization work so is definitely running), |
| // but m is not running a specific goroutine, |
| // so set the helpgc flag as a signal to m's |
| // first schedule(nil) to mcpu-- and grunning--. |
| mp = runtime·newm(); |
| mp->helpgc = 1; |
| runtime·sched.grunning++; |
| } |
| schedunlock(); |
| } |
| |
| // Called to start an M. |
| void |
| runtime·mstart(void) |
| { |
| // The SEH frame is used by windows-386 only. Unfortunately, seh needs |
| // to be located on the os stack, and mstart runs on the os stack |
| // for both m0 and m. |
| SEH seh; |
| |
| if(g != m->g0) |
| runtime·throw("bad runtime·mstart"); |
| |
| // Record top of stack for use by mcall. |
| // Once we call schedule we're never coming back, |
| // so other calls can reuse this stack space. |
| runtime·gosave(&m->g0->sched); |
| m->g0->sched.pc = (void*)-1; // make sure it is never used |
| m->seh = &seh; |
| runtime·asminit(); |
| runtime·minit(); |
| |
| // Install signal handlers; after minit so that minit can |
| // prepare the thread to be able to handle the signals. |
| if(m == &runtime·m0) { |
| runtime·initsig(); |
| if(runtime·iscgo) |
| runtime·newextram(); |
| } |
| |
| schedule(nil); |
| |
| // TODO(brainman): This point is never reached, because scheduler |
| // does not release os threads at the moment. But once this path |
| // is enabled, we must remove our seh here. |
| } |
| |
| // When running with cgo, we call libcgo_thread_start |
| // to start threads for us so that we can play nicely with |
| // foreign code. |
| void (*libcgo_thread_start)(void*); |
| |
| typedef struct CgoThreadStart CgoThreadStart; |
| struct CgoThreadStart |
| { |
| M *m; |
| G *g; |
| void (*fn)(void); |
| }; |
| |
| // Kick off new m's as needed (up to mcpumax). |
| // Sched is locked. |
| static void |
| matchmg(void) |
| { |
| G *gp; |
| M *mp; |
| |
| if(m->mallocing || m->gcing) |
| return; |
| |
| while(haveg() && canaddmcpu()) { |
| gp = gget(); |
| if(gp == nil) |
| runtime·throw("gget inconsistency"); |
| |
| // Find the m that will run gp. |
| if((mp = mget(gp)) == nil) |
| mp = runtime·newm(); |
| mnextg(mp, gp); |
| } |
| } |
| |
| // Allocate a new m unassociated with any thread. |
| M* |
| runtime·allocm(void) |
| { |
| M *mp; |
| static Type *mtype; // The Go type M |
| |
| if(mtype == nil) { |
| Eface e; |
| runtime·gc_m_ptr(&e); |
| mtype = ((PtrType*)e.type)->elem; |
| } |
| |
| mp = runtime·cnew(mtype); |
| mcommoninit(mp); |
| |
| if(runtime·iscgo || Windows) |
| mp->g0 = runtime·malg(-1); |
| else |
| mp->g0 = runtime·malg(8192); |
| |
| return mp; |
| } |
| |
| static M* lockextra(bool nilokay); |
| static void unlockextra(M*); |
| |
| // needm is called when a cgo callback happens on a |
| // thread without an m (a thread not created by Go). |
| // In this case, needm is expected to find an m to use |
| // and return with m, g initialized correctly. |
| // Since m and g are not set now (likely nil, but see below) |
| // needm is limited in what routines it can call. In particular |
| // it can only call nosplit functions (textflag 7) and cannot |
| // do any scheduling that requires an m. |
| // |
| // In order to avoid needing heavy lifting here, we adopt |
| // the following strategy: there is a stack of available m's |
| // that can be stolen. Using compare-and-swap |
| // to pop from the stack has ABA races, so we simulate |
| // a lock by doing an exchange (via casp) to steal the stack |
| // head and replace the top pointer with MLOCKED (1). |
| // This serves as a simple spin lock that we can use even |
| // without an m. The thread that locks the stack in this way |
| // unlocks the stack by storing a valid stack head pointer. |
| // |
| // In order to make sure that there is always an m structure |
| // available to be stolen, we maintain the invariant that there |
| // is always one more than needed. At the beginning of the |
| // program (if cgo is in use) the list is seeded with a single m. |
| // If needm finds that it has taken the last m off the list, its job |
| // is - once it has installed its own m so that it can do things like |
| // allocate memory - to create a spare m and put it on the list. |
| // |
| // Each of these extra m's also has a g0 and a curg that are |
| // pressed into service as the scheduling stack and current |
| // goroutine for the duration of the cgo callback. |
| // |
| // When the callback is done with the m, it calls dropm to |
| // put the m back on the list. |
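| // |
| // A minimal sketch of that lock protocol (illustration only; the real |
| // implementation is lockextra/unlockextra below): |
| //	do |
| //		mp = runtime·atomicloadp(&runtime·extram); |
| //	while(mp == MLOCKED || !runtime·casp(&runtime·extram, mp, MLOCKED)); |
| //	... use the list headed by mp ... |
| //	runtime·atomicstorep(&runtime·extram, newhead);	// unlock |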
| #pragma textflag 7 |
| void |
| runtime·needm(byte x) |
| { |
| M *mp; |
| |
| // Lock extra list, take head, unlock popped list. |
| // nilokay=false is safe here because of the invariant above, |
| // that the extra list always contains or will soon contain |
| // at least one m. |
| mp = lockextra(false); |
| |
| // Set needextram when we've just emptied the list, |
| // so that the eventual call into cgocallbackg will |
| // allocate a new m for the extra list. We delay the |
| // allocation until then so that it can be done |
| // after exitsyscall makes sure it is okay to be |
| // running at all (that is, there's no garbage collection |
| // running right now). |
| mp->needextram = mp->schedlink == nil; |
| unlockextra(mp->schedlink); |
| |
| // Install m and g (= m->g0) and set the stack bounds |
| // to match the current stack. We don't actually know |
| // how big the stack is, like we don't know how big any |
| // scheduling stack is, but we assume there's at least 32 kB, |
| // which is more than enough for us. |
| runtime·setmg(mp, mp->g0); |
| g->stackbase = (uintptr)(&x + 1024); |
| g->stackguard = (uintptr)(&x - 32*1024); |
| |
| // On windows/386, we need to put an SEH frame (two words) |
| // somewhere on the current stack. We are in needm, and we |
| // know there is some available space one word into the |
| // argument frame. Use that. |
| m->seh = (SEH*)((uintptr*)&x + 1); |
| |
| // Initialize this thread to use the m. |
| runtime·asminit(); |
| runtime·minit(); |
| } |
| |
| // newextram allocates an m and puts it on the extra list. |
| // It is called with a working local m, so that it can do things |
| // like call schedlock and allocate. |
| void |
| runtime·newextram(void) |
| { |
| M *mp, *mnext; |
| G *gp; |
| |
| // Scheduler protects allocation of new m's and g's. |
| // Create extra goroutine locked to extra m. |
| // The goroutine is the context in which the cgo callback will run. |
| // The sched.pc will never be returned to, but setting it to |
| // runtime.goexit makes clear to the traceback routines where |
| // the goroutine stack ends. |
| schedlock(); |
| mp = runtime·allocm(); |
| gp = runtime·malg(4096); |
| gp->sched.pc = (void*)runtime·goexit; |
| gp->sched.sp = gp->stackbase; |
| gp->sched.g = gp; |
| gp->status = Gsyscall; |
| mp->curg = gp; |
| mp->locked = LockInternal; |
| mp->lockedg = gp; |
| gp->lockedm = mp; |
| // put on allg for garbage collector |
| if(runtime·lastg == nil) |
| runtime·allg = gp; |
| else |
| runtime·lastg->alllink = gp; |
| runtime·lastg = gp; |
| schedunlock(); |
| |
| // Add m to the extra list. |
| mnext = lockextra(true); |
| mp->schedlink = mnext; |
| unlockextra(mp); |
| } |
| |
| // dropm is called when a cgo callback has called needm but is now |
| // done with the callback and returning back into the non-Go thread. |
| // It puts the current m back onto the extra list. |
| // |
| // The main expense here is the call to signalstack to release the |
| // m's signal stack, and then the call to needm on the next callback |
| // from this thread. It is tempting to try to save the m for next time, |
| // which would eliminate both these costs, but there might not be |
| // a next time: the current thread (which Go does not control) might exit. |
| // If we saved the m for that thread, there would be an m leak each time |
| // such a thread exited. Instead, we acquire and release an m on each |
| // call. These should typically not be scheduling operations, just a few |
| // atomics, so the cost should be small. |
| // |
| // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread |
| // variable using pthread_key_create. Unlike the pthread keys we already use |
| // on OS X, this dummy key would never be read by Go code. It would exist |
| // only so that we could register a thread-exit-time destructor. |
| // That destructor would put the m back onto the extra list. |
| // This is purely a performance optimization. The current version, |
| // in which dropm happens on each cgo call, is still correct too. |
| // We may have to keep the current version on systems with cgo |
| // but without pthreads, like Windows. |
| void |
| runtime·dropm(void) |
| { |
| M *mp, *mnext; |
| |
| // Undo whatever initialization minit did during needm. |
| runtime·unminit(); |
| |
| // Clear m and g, and return m to the extra list. |
| // After the call to setmg we can only call nosplit functions. |
| mp = m; |
| runtime·setmg(nil, nil); |
| |
| mnext = lockextra(true); |
| mp->schedlink = mnext; |
| unlockextra(mp); |
| } |
| |
| #define MLOCKED ((M*)1) |
| |
| // lockextra locks the extra list and returns the list head. |
| // The caller must unlock the list by storing a new list head |
| // to runtime.extram. If nilokay is true, then lockextra will |
| // return a nil list head if that's what it finds. If nilokay is false, |
| // lockextra will keep waiting until the list head is no longer nil. |
| #pragma textflag 7 |
| static M* |
| lockextra(bool nilokay) |
| { |
| M *mp; |
| void (*yield)(void); |
| |
| for(;;) { |
| mp = runtime·atomicloadp(&runtime·extram); |
| if(mp == MLOCKED) { |
| yield = runtime·osyield; |
| yield(); |
| continue; |
| } |
| if(mp == nil && !nilokay) { |
| runtime·usleep(1); |
| continue; |
| } |
| if(!runtime·casp(&runtime·extram, mp, MLOCKED)) { |
| yield = runtime·osyield; |
| yield(); |
| continue; |
| } |
| break; |
| } |
| return mp; |
| } |
| |
| #pragma textflag 7 |
| static void |
| unlockextra(M *mp) |
| { |
| runtime·atomicstorep(&runtime·extram, mp); |
| } |
| |
| |
| // Create a new m. It will start off with a call to runtime·mstart. |
| M* |
| runtime·newm(void) |
| { |
| M *mp; |
| |
| mp = runtime·allocm(); |
| |
| if(runtime·iscgo) { |
| CgoThreadStart ts; |
| |
| if(libcgo_thread_start == nil) |
| runtime·throw("libcgo_thread_start missing"); |
| ts.m = mp; |
| ts.g = mp->g0; |
| ts.fn = runtime·mstart; |
| runtime·asmcgocall(libcgo_thread_start, &ts); |
| } else { |
| runtime·newosproc(mp, mp->g0, (byte*)mp->g0->stackbase, runtime·mstart); |
| } |
| |
| return mp; |
| } |
| |
| // One round of scheduler: find a goroutine and run it. |
| // The argument is the goroutine that was running before |
| // schedule was called, or nil if this is the first call. |
| // Never returns. |
| static void |
| schedule(G *gp) |
| { |
| int32 hz; |
| uint32 v; |
| |
| schedlock(); |
| if(gp != nil) { |
| // Just finished running gp. |
| gp->m = nil; |
| runtime·sched.grunning--; |
| |
| // atomic { mcpu-- } |
| v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); |
| if(atomic_mcpu(v) > maxgomaxprocs) |
| runtime·throw("negative mcpu in scheduler"); |
| |
| switch(gp->status) { |
| case Grunnable: |
| case Gdead: |
| // Shouldn't have been running! |
| runtime·throw("bad gp->status in sched"); |
| case Grunning: |
| gp->status = Grunnable; |
| gput(gp); |
| break; |
| case Gmoribund: |
| gp->status = Gdead; |
| if(gp->lockedm) { |
| gp->lockedm = nil; |
| m->lockedg = nil; |
| m->locked = 0; |
| } |
| gp->idlem = nil; |
| runtime·unwindstack(gp, nil); |
| gfput(gp); |
| if(--runtime·sched.gcount == 0) |
| runtime·exit(0); |
| break; |
| } |
| if(gp->readyonstop) { |
| gp->readyonstop = 0; |
| readylocked(gp); |
| } |
| } else if(m->helpgc) { |
| // Bootstrap m or new m started by starttheworld. |
| // atomic { mcpu-- } |
| v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); |
| if(atomic_mcpu(v) > maxgomaxprocs) |
| runtime·throw("negative mcpu in scheduler"); |
| // Compensate for increment in starttheworld(). |
| runtime·sched.grunning--; |
| m->helpgc = 0; |
| } else if(m->nextg != nil) { |
| // New m started by matchmg. |
| } else { |
| runtime·throw("invalid m state in scheduler"); |
| } |
| |
| // Find (or wait for) g to run. Unlocks runtime·sched. |
| gp = nextgandunlock(); |
| gp->readyonstop = 0; |
| gp->status = Grunning; |
| m->curg = gp; |
| gp->m = m; |
| |
| // Check whether the profiler needs to be turned on or off. |
| hz = runtime·sched.profilehz; |
| if(m->profilehz != hz) |
| runtime·resetcpuprofiler(hz); |
| |
| if(gp->sched.pc == (byte*)runtime·goexit) // kickoff |
| runtime·gogocallfn(&gp->sched, gp->fnstart); |
| runtime·gogo(&gp->sched, 0); |
| } |
| |
| // Enter scheduler. If g->status is Grunning, |
| // re-queues g and runs everyone else who is waiting |
| // before running g again. If g->status is Gmoribund, |
| // kills off g. |
| // Cannot split stack because it is called from exitsyscall. |
| // See comment below. |
| #pragma textflag 7 |
| void |
| runtime·gosched(void) |
| { |
| if(m->locks != 0) |
| runtime·throw("gosched holding locks"); |
| if(g == m->g0) |
| runtime·throw("gosched of g0"); |
| runtime·mcall(schedule); |
| } |
| |
| // Puts the current goroutine into a waiting state and unlocks the lock. |
| // The goroutine can be made runnable again by calling runtime·ready(gp). |
| void |
| runtime·park(void (*unlockf)(Lock*), Lock *lock, int8 *reason) |
| { |
| g->status = Gwaiting; |
| g->waitreason = reason; |
| if(unlockf) |
| unlockf(lock); |
| runtime·gosched(); |
| } |
| |
| // The goroutine g is about to enter a system call. |
| // Record that it's not using the cpu anymore. |
| // This is called only from the go syscall library and cgocall, |
| // not from the low-level system calls used by the runtime. |
| // |
| // Entersyscall cannot split the stack: the runtime·gosave must |
| // make g->sched refer to the caller's stack segment, because |
| // entersyscall is going to return immediately after. |
| // It's okay to call matchmg and notewakeup even after |
| // decrementing mcpu, because we haven't released the |
| // sched lock yet, so the garbage collector cannot be running. |
| #pragma textflag 7 |
| void |
| runtime·entersyscall(void) |
| { |
| uint32 v; |
| |
| if(m->profilehz > 0) |
| runtime·setprof(false); |
| |
| // Leave SP around for gc and traceback. |
| runtime·gosave(&g->sched); |
| g->gcsp = g->sched.sp; |
| g->gcstack = g->stackbase; |
| g->gcguard = g->stackguard; |
| g->status = Gsyscall; |
| if(g->gcsp < g->gcguard-StackGuard || g->gcstack < g->gcsp) { |
| // runtime·printf("entersyscall inconsistent %p [%p,%p]\n", |
| // g->gcsp, g->gcguard-StackGuard, g->gcstack); |
| runtime·throw("entersyscall"); |
| } |
| |
| // Fast path. |
| // The slow path inside the schedlock/schedunlock will get |
| // through without stopping if it does: |
| // mcpu-- |
| // gwait not true |
| // waitstop && mcpu <= mcpumax not true |
| // If we can do the same with a single atomic add, |
| // then we can skip the locks. |
| v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); |
| if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v))) |
| return; |
| |
| schedlock(); |
| v = runtime·atomicload(&runtime·sched.atomic); |
| if(atomic_gwaiting(v)) { |
| matchmg(); |
| v = runtime·atomicload(&runtime·sched.atomic); |
| } |
| if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { |
| runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); |
| runtime·notewakeup(&runtime·sched.stopped); |
| } |
| |
| // Re-save sched in case one of the calls |
| // (notewakeup, matchmg) triggered something using it. |
| runtime·gosave(&g->sched); |
| |
| schedunlock(); |
| } |
| |
| // The same as runtime·entersyscall(), but with a hint that the syscall is blocking. |
| // The hint is ignored at the moment, and it's just a copy of runtime·entersyscall(). |
| #pragma textflag 7 |
| void |
| runtime·entersyscallblock(void) |
| { |
| uint32 v; |
| |
| if(m->profilehz > 0) |
| runtime·setprof(false); |
| |
| // Leave SP around for gc and traceback. |
| runtime·gosave(&g->sched); |
| g->gcsp = g->sched.sp; |
| g->gcstack = g->stackbase; |
| g->gcguard = g->stackguard; |
| g->status = Gsyscall; |
| if(g->gcsp < g->gcguard-StackGuard || g->gcstack < g->gcsp) { |
| // runtime·printf("entersyscall inconsistent %p [%p,%p]\n", |
| // g->gcsp, g->gcguard-StackGuard, g->gcstack); |
| runtime·throw("entersyscall"); |
| } |
| |
| // Fast path. |
| // The slow path inside the schedlock/schedunlock will get |
| // through without stopping if it does: |
| // mcpu-- |
| // gwait not true |
| // waitstop && mcpu <= mcpumax not true |
| // If we can do the same with a single atomic add, |
| // then we can skip the locks. |
| v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); |
| if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v))) |
| return; |
| |
| schedlock(); |
| v = runtime·atomicload(&runtime·sched.atomic); |
| if(atomic_gwaiting(v)) { |
| matchmg(); |
| v = runtime·atomicload(&runtime·sched.atomic); |
| } |
| if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { |
| runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); |
| runtime·notewakeup(&runtime·sched.stopped); |
| } |
| |
| // Re-save sched in case one of the calls |
| // (notewakeup, matchmg) triggered something using it. |
| runtime·gosave(&g->sched); |
| |
| schedunlock(); |
| } |
| |
| // The goroutine g exited its system call. |
| // Arrange for it to run on a cpu again. |
| // This is called only from the go syscall library, not |
| // from the low-level system calls used by the runtime. |
| void |
| runtime·exitsyscall(void) |
| { |
| uint32 v; |
| |
| // Fast path. |
| // If we can do the mcpu++ bookkeeping and |
| // find that we still have mcpu <= mcpumax, then we can |
| // start executing Go code immediately, without having to |
| // schedlock/schedunlock. |
| v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift)); |
| if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) { |
| // There's a cpu for us, so we can run. |
| g->status = Grunning; |
| // Garbage collector isn't running (since we are), |
| // so okay to clear gcstack. |
| g->gcstack = (uintptr)nil; |
| |
| if(m->profilehz > 0) |
| runtime·setprof(true); |
| return; |
| } |
| |
| // Tell scheduler to put g back on the run queue: |
| // mostly equivalent to g->status = Grunning, |
| // but keeps the garbage collector from thinking |
| // that g is running right now, which it's not. |
| g->readyonstop = 1; |
| |
| // All the cpus are taken. |
| // The scheduler will ready g and put this m to sleep. |
| // When the scheduler takes g away from m, |
| // it will undo the runtime·sched.mcpu++ above. |
| runtime·gosched(); |
| |
| // Gosched returned, so we're allowed to run now. |
| // Delete the gcstack information that we left for |
| // the garbage collector during the system call. |
| // Must wait until now because until gosched returns |
| // we don't know for sure that the garbage collector |
| // is not running. |
| g->gcstack = (uintptr)nil; |
| } |
| |
| // Hook used by runtime·malg to call runtime·stackalloc on the |
| // scheduler stack. This exists because runtime·stackalloc insists |
| // on being called on the scheduler stack, to avoid trying to grow |
| // the stack while allocating a new stack segment. |
| static void |
| mstackalloc(G *gp) |
| { |
| gp->param = runtime·stackalloc((uintptr)gp->param); |
| runtime·gogo(&gp->sched, 0); |
| } |
| |
| // Allocate a new g, with a stack big enough for stacksize bytes. |
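| // The layout set up below is roughly (low addresses first; illustration only): |
| //	stack0     = stk |
| //	stackguard = stk + StackGuard |
| //	stackbase  = stk + StackSystem + stacksize - sizeof(Stktop), |
| // with the Stktop record cleared at stackbase. |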
| G* |
| runtime·malg(int32 stacksize) |
| { |
| G *newg; |
| byte *stk; |
| |
| if(StackTop < sizeof(Stktop)) { |
| runtime·printf("runtime: SizeofStktop=%d, should be >=%d\n", (int32)StackTop, (int32)sizeof(Stktop)); |
| runtime·throw("runtime: bad stack.h"); |
| } |
| |
| newg = runtime·malloc(sizeof(G)); |
| if(stacksize >= 0) { |
| if(g == m->g0) { |
| // running on scheduler stack already. |
| stk = runtime·stackalloc(StackSystem + stacksize); |
| } else { |
| // have to call stackalloc on scheduler stack. |
| g->param = (void*)(StackSystem + stacksize); |
| runtime·mcall(mstackalloc); |
| stk = g->param; |
| g->param = nil; |
| } |
| newg->stack0 = (uintptr)stk; |
| newg->stackguard = (uintptr)stk + StackGuard; |
| newg->stackbase = (uintptr)stk + StackSystem + stacksize - sizeof(Stktop); |
| runtime·memclr((byte*)newg->stackbase, sizeof(Stktop)); |
| } |
| return newg; |
| } |
| |
| // Create a new g running fn with siz bytes of arguments. |
| // Put it on the queue of g's waiting to run. |
| // The compiler turns a go statement into a call to this. |
| // Cannot split the stack because it assumes that the arguments |
| // are available sequentially after &fn; they would not be |
| // copied if a stack split occurred. It's OK for this to call |
| // functions that split the stack. |
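| // For example (illustration only), for a function f taking two int32 |
| // arguments, the compiler turns |
| //	go f(x, y) |
| // into roughly |
| //	runtime·newproc(8, f, x, y); |
| // where 8 is the byte size of the argument frame. |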
| #pragma textflag 7 |
| void |
| runtime·newproc(int32 siz, FuncVal* fn, ...) |
| { |
| byte *argp; |
| |
| if(thechar == '5') |
| argp = (byte*)(&fn+2); // skip caller's saved LR |
| else |
| argp = (byte*)(&fn+1); |
| runtime·newproc1(fn, argp, siz, 0, runtime·getcallerpc(&siz)); |
| } |
| |
| // Create a new g running fn with narg bytes of arguments starting |
| // at argp and returning nret bytes of results. callerpc is the |
| // address of the go statement that created this. The new g is put |
| // on the queue of g's waiting to run. |
| G* |
| runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerpc) |
| { |
| byte *sp; |
| G *newg; |
| int32 siz; |
| uintptr racectx; |
| |
| //printf("newproc1 %p %p narg=%d nret=%d\n", fn, argp, narg, nret); |
| siz = narg + nret; |
| siz = (siz+7) & ~7; |
| |
| // We could instead create a secondary stack frame |
| // and make it look like goexit was on the original but |
| // the call to the actual goroutine function was split. |
| // Not worth it: this is almost always an error. |
| if(siz > StackMin - 1024) |
| runtime·throw("runtime.newproc: function arguments too large for new goroutine"); |
| |
| if(raceenabled) |
| racectx = runtime·racegostart(callerpc); |
| |
| schedlock(); |
| |
| if((newg = gfget()) != nil) { |
| if(newg->stackguard - StackGuard != newg->stack0) |
| runtime·throw("invalid stack in newg"); |
| } else { |
| newg = runtime·malg(StackMin); |
| if(runtime·lastg == nil) |
| runtime·allg = newg; |
| else |
| runtime·lastg->alllink = newg; |
| runtime·lastg = newg; |
| } |
| newg->status = Gwaiting; |
| newg->waitreason = "new goroutine"; |
| |
| sp = (byte*)newg->stackbase; |
| sp -= siz; |
| runtime·memmove(sp, argp, narg); |
| if(thechar == '5') { |
| // caller's LR |
| sp -= sizeof(void*); |
| *(void**)sp = nil; |
| } |
| |
| newg->sched.sp = (uintptr)sp; |
| newg->sched.pc = (byte*)runtime·goexit; |
| newg->sched.g = newg; |
| newg->fnstart = fn; |
| newg->gopc = (uintptr)callerpc; |
| if(raceenabled) |
| newg->racectx = racectx; |
| |
| runtime·sched.gcount++; |
| newg->goid = ++runtime·sched.goidgen; |
| |
| newprocreadylocked(newg); |
| schedunlock(); |
| |
| return newg; |
| //printf(" goid=%d\n", newg->goid); |
| } |
| |
| // Put on gfree list. Sched must be locked. |
| static void |
| gfput(G *gp) |
| { |
| if(gp->stackguard - StackGuard != gp->stack0) |
| runtime·throw("invalid stack in gfput"); |
| gp->schedlink = runtime·sched.gfree; |
| runtime·sched.gfree = gp; |
| } |
| |
| // Get from gfree list. Sched must be locked. |
| static G* |
| gfget(void) |
| { |
| G *gp; |
| |
| gp = runtime·sched.gfree; |
| if(gp) |
| runtime·sched.gfree = gp->schedlink; |
| return gp; |
| } |
| |
| void |
| runtime·Breakpoint(void) |
| { |
| runtime·breakpoint(); |
| } |
| |
| void |
| runtime·Gosched(void) |
| { |
| runtime·gosched(); |
| } |
| |
| // Implementation of runtime.GOMAXPROCS. |
| // delete when scheduler is stronger |
| int32 |
| runtime·gomaxprocsfunc(int32 n) |
| { |
| int32 ret; |
| uint32 v; |
| |
| schedlock(); |
| ret = runtime·gomaxprocs; |
| if(n <= 0) |
| n = ret; |
| if(n > maxgomaxprocs) |
| n = maxgomaxprocs; |
| runtime·gomaxprocs = n; |
| if(runtime·gomaxprocs > 1) |
| runtime·singleproc = false; |
| if(runtime·gcwaiting != 0) { |
| if(atomic_mcpumax(runtime·sched.atomic) != 1) |
| runtime·throw("invalid mcpumax during gc"); |
| schedunlock(); |
| return ret; |
| } |
| |
| setmcpumax(n); |
| |
| // If there are now fewer allowed procs |
| // than procs running, stop. |
| v = runtime·atomicload(&runtime·sched.atomic); |
| if(atomic_mcpu(v) > n) { |
| schedunlock(); |
| runtime·gosched(); |
| return ret; |
| } |
| // handle more procs |
| matchmg(); |
| schedunlock(); |
| return ret; |
| } |
| |
| static void |
| LockOSThread(void) |
| { |
| m->lockedg = g; |
| g->lockedm = m; |
| } |
| |
| void |
| runtime·LockOSThread(void) |
| { |
| m->locked |= LockExternal; |
| LockOSThread(); |
| } |
| |
| void |
| runtime·lockOSThread(void) |
| { |
| m->locked += LockInternal; |
| LockOSThread(); |
| } |
| |
| static void |
| UnlockOSThread(void) |
| { |
| if(m->locked != 0) |
| return; |
| m->lockedg = nil; |
| g->lockedm = nil; |
| } |
| |
| void |
| runtime·UnlockOSThread(void) |
| { |
| m->locked &= ~LockExternal; |
| UnlockOSThread(); |
| } |
| |
| void |
| runtime·unlockOSThread(void) |
| { |
| if(m->locked < LockInternal) |
| runtime·throw("runtime: internal error: misuse of lockOSThread/unlockOSThread"); |
| m->locked -= LockInternal; |
| UnlockOSThread(); |
| } |
| |
| bool |
| runtime·lockedOSThread(void) |
| { |
| return g->lockedm != nil && m->lockedg != nil; |
| } |
| |
| // for testing of callbacks |
| void |
| runtime·golockedOSThread(bool ret) |
| { |
| ret = runtime·lockedOSThread(); |
| FLUSH(&ret); |
| } |
| |
| // for testing of wire, unwire |
| void |
| runtime·mid(uint32 ret) |
| { |
| ret = m->id; |
| FLUSH(&ret); |
| } |
| |
| void |
| runtime·NumGoroutine(intgo ret) |
| { |
| ret = runtime·gcount(); |
| FLUSH(&ret); |
| } |
| |
| int32 |
| runtime·gcount(void) |
| { |
| G *gp; |
| int32 n, s; |
| |
| n = 0; |
| runtime·lock(&runtime·sched); |
| for(gp = runtime·allg; gp; gp = gp->alllink) { |
| s = gp->status; |
| if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting) |
| n++; |
| } |
| runtime·unlock(&runtime·sched); |
| return n; |
| } |
| |
| int32 |
| runtime·mcount(void) |
| { |
| return runtime·sched.mcount; |
| } |
| |
| void |
| runtime·badmcall(void) // called from assembly |
| { |
| runtime·throw("runtime: mcall called on m->g0 stack"); |
| } |
| |
| void |
| runtime·badmcall2(void) // called from assembly |
| { |
| runtime·throw("runtime: mcall function returned"); |
| } |
| |
| static struct { |
| Lock; |
| void (*fn)(uintptr*, int32); |
| int32 hz; |
| uintptr pcbuf[100]; |
| } prof; |
| |
| // Called if we receive a SIGPROF signal. |
| void |
| runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp) |
| { |
| int32 n; |
| |
| if(m == nil || m->mcache == nil) |
| return; |
| if(prof.fn == nil || prof.hz == 0) |
| return; |
| |
| runtime·lock(&prof); |
| if(prof.fn == nil) { |
| runtime·unlock(&prof); |
| return; |
| } |
| n = runtime·gentraceback(pc, sp, lr, gp, 0, prof.pcbuf, nelem(prof.pcbuf)); |
| if(n > 0) |
| prof.fn(prof.pcbuf, n); |
| runtime·unlock(&prof); |
| } |
| |
| // Arrange to call fn with a traceback hz times a second. |
| void |
| runtime·setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) |
| { |
| // Force sane arguments. |
| if(hz < 0) |
| hz = 0; |
| if(hz == 0) |
| fn = nil; |
| if(fn == nil) |
| hz = 0; |
| |
| // Stop profiler on this cpu so that it is safe to lock prof. |
| // If a profiling signal came in while we had prof locked, |
| // it would deadlock. |
| runtime·resetcpuprofiler(0); |
| |
| runtime·lock(&prof); |
| prof.fn = fn; |
| prof.hz = hz; |
| runtime·unlock(&prof); |
| runtime·lock(&runtime·sched); |
| runtime·sched.profilehz = hz; |
| runtime·unlock(&runtime·sched); |
| |
| if(hz != 0) |
| runtime·resetcpuprofiler(hz); |
| } |
| |
| // Put g on local runnable queue. |
| // TODO(dvyukov): consider using lock-free queue. |
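| // The queue is a ring buffer of runqsize slots with one slot always kept |
| // empty, so it holds at most runqsize-1 g's: runqhead == runqtail means |
| // empty, and runqtail+1 == runqhead (mod runqsize) means full. For example |
| // (illustration only), with runqsize == 4, head == 1 and tail == 0 is full. |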
| static void |
| runqput(P *p, G *gp) |
| { |
| int32 h, t, s; |
| |
| runtime·lock(p); |
| retry: |
| h = p->runqhead; |
| t = p->runqtail; |
| s = p->runqsize; |
| if(t == h-1 || (h == 0 && t == s-1)) { |
| runqgrow(p); |
| goto retry; |
| } |
| p->runq[t++] = gp; |
| if(t == s) |
| t = 0; |
| p->runqtail = t; |
| runtime·unlock(p); |
| } |
| |
| // Get g from local runnable queue. |
| static G* |
| runqget(P *p) |
| { |
| G *gp; |
| int32 t, h, s; |
| |
| if(p->runqhead == p->runqtail) |
| return nil; |
| runtime·lock(p); |
| h = p->runqhead; |
| t = p->runqtail; |
| s = p->runqsize; |
| if(t == h) { |
| runtime·unlock(p); |
| return nil; |
| } |
| gp = p->runq[h++]; |
| if(h == s) |
| h = 0; |
| p->runqhead = h; |
| runtime·unlock(p); |
| return gp; |
| } |
| |
| // Grow local runnable queue. |
| // TODO(dvyukov): consider using fixed-size array |
| // and transfer excess to the global list (local queue can grow way too big). |
| static void |
| runqgrow(P *p) |
| { |
| G **q; |
| int32 s, t, h, t2; |
| |
| h = p->runqhead; |
| t = p->runqtail; |
| s = p->runqsize; |
| t2 = 0; |
| q = runtime·malloc(2*s*sizeof(*q)); |
| while(t != h) { |
| q[t2++] = p->runq[h++]; |
| if(h == s) |
| h = 0; |
| } |
| runtime·free(p->runq); |
| p->runq = q; |
| p->runqhead = 0; |
| p->runqtail = t2; |
| p->runqsize = 2*s; |
| } |
| |
| // Steal half of elements from local runnable queue of p2 |
| // and put onto local runnable queue of p. |
| // Returns one of the stolen elements (or nil if failed). |
| static G* |
| runqsteal(P *p, P *p2) |
| { |
| G *gp, *gp1; |
| int32 t, h, s, t2, h2, s2, c, i; |
| |
| if(p2->runqhead == p2->runqtail) |
| return nil; |
| // sort locks to prevent deadlocks |
| if(p < p2) |
| runtime·lock(p); |
| runtime·lock(p2); |
| if(p2->runqhead == p2->runqtail) { |
| runtime·unlock(p2); |
| if(p < p2) |
| runtime·unlock(p); |
| return nil; |
| } |
| if(p >= p2) |
| runtime·lock(p); |
| // now we've locked both queues and know the victim is not empty |
| h = p->runqhead; |
| t = p->runqtail; |
| s = p->runqsize; |
| h2 = p2->runqhead; |
| t2 = p2->runqtail; |
| s2 = p2->runqsize; |
| gp = p2->runq[h2++]; // return value |
| if(h2 == s2) |
| h2 = 0; |
| // steal roughly half |
| if(t2 > h2) |
| c = (t2 - h2) / 2; |
| else |
| c = (s2 - h2 + t2) / 2; |
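| // For example (illustration only): with s2 == 8, h2 == 6 after the take |
| // above, and t2 == 2, four g's remain (slots 6,7,0,1) and |
| // c == (8 - 6 + 2)/2 == 2, so about half of the original five are stolen. |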
| // copy |
| for(i = 0; i != c; i++) { |
| // the target queue is full? |
| if(t == h-1 || (h == 0 && t == s-1)) |
| break; |
| // the victim queue is empty? |
| if(t2 == h2) |
| break; |
| gp1 = p2->runq[h2++]; |
| if(h2 == s2) |
| h2 = 0; |
| p->runq[t++] = gp1; |
| if(t == s) |
| t = 0; |
| } |
| p->runqtail = t; |
| p2->runqhead = h2; |
| runtime·unlock(p2); |
| runtime·unlock(p); |
| return gp; |
| } |
| |
| void |
| runtime·testSchedLocalQueue(void) |
| { |
| P p; |
| G gs[1000]; |
| int32 i, j; |
| |
| runtime·memclr((byte*)&p, sizeof(p)); |
| p.runqsize = 1; |
| p.runqhead = 0; |
| p.runqtail = 0; |
| p.runq = runtime·malloc(p.runqsize*sizeof(*p.runq)); |
| |
| for(i = 0; i < nelem(gs); i++) { |
| if(runqget(&p) != nil) |
| runtime·throw("runq is not empty initially"); |
| for(j = 0; j < i; j++) |
| runqput(&p, &gs[i]); |
| for(j = 0; j < i; j++) { |
| if(runqget(&p) != &gs[i]) { |
| runtime·printf("bad element at iter %d/%d\n", i, j); |
| runtime·throw("bad element"); |
| } |
| } |
| if(runqget(&p) != nil) |
| runtime·throw("runq is not empty afterwards"); |
| } |
| } |
| |
| void |
| runtime·testSchedLocalQueueSteal(void) |
| { |
| P p1, p2; |
| G gs[1000], *gp; |
| int32 i, j, s; |
| |
| runtime·memclr((byte*)&p1, sizeof(p1)); |
| p1.runqsize = 1; |
| p1.runqhead = 0; |
| p1.runqtail = 0; |
| p1.runq = runtime·malloc(p1.runqsize*sizeof(*p1.runq)); |
| |
| runtime·memclr((byte*)&p2, sizeof(p2)); |
| p2.runqsize = nelem(gs); |
| p2.runqhead = 0; |
| p2.runqtail = 0; |
| p2.runq = runtime·malloc(p2.runqsize*sizeof(*p2.runq)); |
| |
| for(i = 0; i < nelem(gs); i++) { |
| for(j = 0; j < i; j++) { |
| gs[j].sig = 0; |
| runqput(&p1, &gs[j]); |
| } |
| gp = runqsteal(&p2, &p1); |
| s = 0; |
| if(gp) { |
| s++; |
| gp->sig++; |
| } |
| while(gp = runqget(&p2)) { |
| s++; |
| gp->sig++; |
| } |
| while(gp = runqget(&p1)) |
| gp->sig++; |
| for(j = 0; j < i; j++) { |
| if(gs[j].sig != 1) { |
| runtime·printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i); |
| runtime·throw("bad element"); |
| } |
| } |
| if(s != i/2 && s != i/2+1) { |
| runtime·printf("bad steal %d, want %d or %d, iter %d\n", |
| s, i/2, i/2+1, i); |
| runtime·throw("bad steal"); |
| } |
| } |
| } |