[dev.cc] runtime: convert memory allocator and garbage collector to Go

The conversion was done with an automated tool and then
modified only as necessary to make it compile and run.

[This CL is part of the removal of C code from package runtime.
See golang.org/s/dev.cc for an overview.]

LGTM=r
R=r
CC=austin, dvyukov, golang-codereviews, iant, khr
https://golang.org/cl/167540043
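
For orientation: the code being converted implements runtime/debug.WriteHeapDump.
A minimal usage sketch of that entry point (the output path is just an example):

	package main

	import (
		"os"
		"runtime/debug"
	)

	func main() {
		// WriteHeapDump writes the heap dump to the given file descriptor.
		f, err := os.Create("/tmp/heapdump") // example path
		if err != nil {
			panic(err)
		}
		debug.WriteHeapDump(f.Fd())
		f.Close()
	}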
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 0049701..bb0110f 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -26,7 +26,7 @@
 	if hchanSize%maxAlign != 0 || elem.align > maxAlign {
 		gothrow("makechan: bad alignment")
 	}
-	if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (maxmem-hchanSize)/uintptr(elem.size)) {
+	if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/uintptr(elem.size)) {
 		panic("makechan: size out of range")
 	}
 
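The only change in chan.go is that the size check now reads the Go constant
_MaxMem instead of the maxmem variable that the deleted malloc.c exported for
Go's benefit; the overflow-safe divide-before-multiply guard is unchanged.
A standalone sketch of that guard, with illustrative names rather than runtime
identifiers:

	// fitsInMem reports whether n elements of elemSize bytes, plus a fixed
	// header, fit under maxMem without the product n*elemSize overflowing.
	func fitsInMem(n int64, elemSize, header, maxMem uintptr) bool {
		if n < 0 || int64(uintptr(n)) != n {
			return false
		}
		// Divide instead of multiply so the comparison cannot overflow.
		return elemSize == 0 || uintptr(n) <= (maxMem-header)/elemSize
	}
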
diff --git a/src/runtime/heapdump.c b/src/runtime/heapdump.c
deleted file mode 100644
index eddbc1d..0000000
--- a/src/runtime/heapdump.c
+++ /dev/null
@@ -1,864 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Implementation of runtime/debug.WriteHeapDump.  Writes all
-// objects in the heap plus additional info (roots, threads,
-// finalizers, etc.) to a file.
-
-// The format of the dumped file is described at
-// http://code.google.com/p/go-wiki/wiki/heapdump14
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-#include "mgc0.h"
-#include "type.h"
-#include "typekind.h"
-#include "funcdata.h"
-#include "zaexperiment.h"
-#include "textflag.h"
-
-extern byte runtime·data[];
-extern byte runtime·edata[];
-extern byte runtime·bss[];
-extern byte runtime·ebss[];
-
-enum {
-	FieldKindEol = 0,
-	FieldKindPtr = 1,
-	FieldKindIface = 2,
-	FieldKindEface = 3,
-
-	TagEOF = 0,
-	TagObject = 1,
-	TagOtherRoot = 2,
-	TagType = 3,
-	TagGoRoutine = 4,
-	TagStackFrame = 5,
-	TagParams = 6,
-	TagFinalizer = 7,
-	TagItab = 8,
-	TagOSThread = 9,
-	TagMemStats = 10,
-	TagQueuedFinalizer = 11,
-	TagData = 12,
-	TagBss = 13,
-	TagDefer = 14,
-	TagPanic = 15,
-	TagMemProf = 16,
-	TagAllocSample = 17,
-};
-
-static uintptr* playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg);
-static void dumpfields(BitVector bv);
-static void dumpbvtypes(BitVector *bv, byte *base);
-static BitVector makeheapobjbv(byte *p, uintptr size);
-
-// fd to write the dump to.
-static uintptr	dumpfd;
-
-#pragma dataflag NOPTR /* tmpbuf not a heap pointer at least */
-static byte	*tmpbuf;
-static uintptr	tmpbufsize;
-
-// buffer of pending write data
-enum {
-	BufSize = 4096,
-};
-#pragma dataflag NOPTR
-static byte buf[BufSize];
-static uintptr nbuf;
-
-static void
-write(byte *data, uintptr len)
-{
-	if(len + nbuf <= BufSize) {
-		runtime·memmove(buf + nbuf, data, len);
-		nbuf += len;
-		return;
-	}
-	runtime·write(dumpfd, buf, nbuf);
-	if(len >= BufSize) {
-		runtime·write(dumpfd, data, len);
-		nbuf = 0;
-	} else {
-		runtime·memmove(buf, data, len);
-		nbuf = len;
-	}
-}
-
-static void
-flush(void)
-{
-	runtime·write(dumpfd, buf, nbuf);
-	nbuf = 0;
-}
-
-// Cache of types that have been serialized already.
-// We use a type's hash field to pick a bucket.
-// Inside a bucket, we keep a list of types that
-// have been serialized so far, most recently used first.
-// Note: when a bucket overflows we may end up
-// serializing a type more than once.  That's ok.
-enum {
-	TypeCacheBuckets = 256, // must be a power of 2
-	TypeCacheAssoc = 4,
-};
-typedef struct TypeCacheBucket TypeCacheBucket;
-struct TypeCacheBucket {
-	Type *t[TypeCacheAssoc];
-};
-#pragma dataflag NOPTR /* only initialized and used while world is stopped */
-static TypeCacheBucket typecache[TypeCacheBuckets];
-
-// dump a uint64 in a varint format parseable by encoding/binary
-static void
-dumpint(uint64 v)
-{
-	byte buf[10];
-	int32 n;
-	n = 0;
-	while(v >= 0x80) {
-		buf[n++] = v | 0x80;
-		v >>= 7;
-	}
-	buf[n++] = v;
-	write(buf, n);
-}
-
-static void
-dumpbool(bool b)
-{
-	dumpint(b ? 1 : 0);
-}
-
-// dump varint uint64 length followed by memory contents
-static void
-dumpmemrange(byte *data, uintptr len)
-{
-	dumpint(len);
-	write(data, len);
-}
-
-static void
-dumpstr(String s)
-{
-	dumpmemrange(s.str, s.len);
-}
-
-static void
-dumpcstr(int8 *c)
-{
-	dumpmemrange((byte*)c, runtime·findnull((byte*)c));
-}
-
-// dump information for a type
-static void
-dumptype(Type *t)
-{
-	TypeCacheBucket *b;
-	int32 i, j;
-
-	if(t == nil) {
-		return;
-	}
-
-	// If we've definitely serialized the type before,
-	// no need to do it again.
-	b = &typecache[t->hash & (TypeCacheBuckets-1)];
-	if(t == b->t[0]) return;
-	for(i = 1; i < TypeCacheAssoc; i++) {
-		if(t == b->t[i]) {
-			// Move-to-front
-			for(j = i; j > 0; j--) {
-				b->t[j] = b->t[j-1];
-			}
-			b->t[0] = t;
-			return;
-		}
-	}
-	// Might not have been dumped yet.  Dump it and
-	// remember we did so.
-	for(j = TypeCacheAssoc-1; j > 0; j--) {
-		b->t[j] = b->t[j-1];
-	}
-	b->t[0] = t;
-	
-	// dump the type
-	dumpint(TagType);
-	dumpint((uintptr)t);
-	dumpint(t->size);
-	if(t->x == nil || t->x->pkgPath == nil || t->x->name == nil) {
-		dumpstr(*t->string);
-	} else {
-		dumpint(t->x->pkgPath->len + 1 + t->x->name->len);
-		write(t->x->pkgPath->str, t->x->pkgPath->len);
-		write((byte*)".", 1);
-		write(t->x->name->str, t->x->name->len);
-	}
-	dumpbool((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0);
-}
-
-// dump an object
-static void
-dumpobj(byte *obj, uintptr size, BitVector bv)
-{
-	dumpbvtypes(&bv, obj);
-	dumpint(TagObject);
-	dumpint((uintptr)obj);
-	dumpmemrange(obj, size);
-	dumpfields(bv);
-}
-
-static void
-dumpotherroot(int8 *description, byte *to)
-{
-	dumpint(TagOtherRoot);
-	dumpcstr(description);
-	dumpint((uintptr)to);
-}
-
-static void
-dumpfinalizer(byte *obj, FuncVal *fn, Type* fint, PtrType *ot)
-{
-	dumpint(TagFinalizer);
-	dumpint((uintptr)obj);
-	dumpint((uintptr)fn);
-	dumpint((uintptr)fn->fn);
-	dumpint((uintptr)fint);
-	dumpint((uintptr)ot);
-}
-
-typedef struct ChildInfo ChildInfo;
-struct ChildInfo {
-	// Information passed up from the callee frame about
-	// the layout of the outargs region.
-	uintptr argoff;     // where the arguments start in the frame
-	uintptr arglen;     // size of args region
-	BitVector args;    // if args.n >= 0, pointer map of args region
-
-	byte *sp;           // callee sp
-	uintptr depth;      // depth in call stack (0 == most recent)
-};
-
-// dump kinds & offsets of interesting fields in bv
-static void
-dumpbv(BitVector *bv, uintptr offset)
-{
-	uintptr i;
-
-	for(i = 0; i < bv->n; i += BitsPerPointer) {
-		switch(bv->bytedata[i/8] >> i%8 & 3) {
-		case BitsDead:
-			// BitsDead has already been processed in makeheapobjbv.
-			// We should only see it in stack maps, in which case we should continue processing.
-			break;
-		case BitsScalar:
-			break;
-		case BitsPointer:
-			dumpint(FieldKindPtr);
-			dumpint(offset + i / BitsPerPointer * PtrSize);
-			break;
-		case BitsMultiWord:
-			switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) {
-			default:
-				runtime·throw("unexpected garbage collection bits");
-			case BitsIface:
-				dumpint(FieldKindIface);
-				dumpint(offset + i / BitsPerPointer * PtrSize);
-				i += BitsPerPointer;
-				break;
-			case BitsEface:
-				dumpint(FieldKindEface);
-				dumpint(offset + i / BitsPerPointer * PtrSize);
-				i += BitsPerPointer;
-				break;
-			}
-		}
-	}
-}
-
-static bool
-dumpframe(Stkframe *s, void *arg)
-{
-	Func *f;
-	ChildInfo *child;
-	uintptr pc, off, size;
-	int32 pcdata;
-	StackMap *stackmap;
-	int8 *name;
-	BitVector bv;
-
-	child = (ChildInfo*)arg;
-	f = s->fn;
-
-	// Figure out what we can about our stack map
-	pc = s->pc;
-	if(pc != f->entry)
-		pc--;
-	pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, pc);
-	if(pcdata == -1) {
-		// We do not have a valid pcdata value but there might be a
-		// stackmap for this function.  It is likely that we are looking
-		// at the function prologue, assume so and hope for the best.
-		pcdata = 0;
-	}
-	stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
-
-	// Dump any types we will need to resolve Efaces.
-	if(child->args.n >= 0)
-		dumpbvtypes(&child->args, (byte*)s->sp + child->argoff);
-	if(stackmap != nil && stackmap->n > 0) {
-		bv = runtime·stackmapdata(stackmap, pcdata);
-		dumpbvtypes(&bv, (byte*)(s->varp - bv.n / BitsPerPointer * PtrSize));
-	} else {
-		bv.n = -1;
-	}
-
-	// Dump main body of stack frame.
-	dumpint(TagStackFrame);
-	dumpint(s->sp); // lowest address in frame
-	dumpint(child->depth); // # of frames deep on the stack
-	dumpint((uintptr)child->sp); // sp of child, or 0 if bottom of stack
-	dumpmemrange((byte*)s->sp, s->fp - s->sp);  // frame contents
-	dumpint(f->entry);
-	dumpint(s->pc);
-	dumpint(s->continpc);
-	name = runtime·funcname(f);
-	if(name == nil)
-		name = "unknown function";
-	dumpcstr(name);
-
-	// Dump fields in the outargs section
-	if(child->args.n >= 0) {
-		dumpbv(&child->args, child->argoff);
-	} else {
-		// conservative - everything might be a pointer
-		for(off = child->argoff; off < child->argoff + child->arglen; off += PtrSize) {
-			dumpint(FieldKindPtr);
-			dumpint(off);
-		}
-	}
-
-	// Dump fields in the local vars section
-	if(stackmap == nil) {
-		// No locals information, dump everything.
-		for(off = child->arglen; off < s->varp - s->sp; off += PtrSize) {
-			dumpint(FieldKindPtr);
-			dumpint(off);
-		}
-	} else if(stackmap->n < 0) {
-		// Locals size information, dump just the locals.
-		size = -stackmap->n;
-		for(off = s->varp - size - s->sp; off <  s->varp - s->sp; off += PtrSize) {
-			dumpint(FieldKindPtr);
-			dumpint(off);
-		}
-	} else if(stackmap->n > 0) {
-		// Locals bitmap information, scan just the pointers in
-		// locals.
-		dumpbv(&bv, s->varp - bv.n / BitsPerPointer * PtrSize - s->sp);
-	}
-	dumpint(FieldKindEol);
-
-	// Record arg info for parent.
-	child->argoff = s->argp - s->fp;
-	child->arglen = s->arglen;
-	child->sp = (byte*)s->sp;
-	child->depth++;
-	stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps);
-	if(stackmap != nil)
-		child->args = runtime·stackmapdata(stackmap, pcdata);
-	else
-		child->args.n = -1;
-	return true;
-}
-
-static void
-dumpgoroutine(G *gp)
-{
-	uintptr sp, pc, lr;
-	ChildInfo child;
-	Defer *d;
-	Panic *p;
-	bool (*fn)(Stkframe*, void*);
-
-	if(gp->syscallsp != (uintptr)nil) {
-		sp = gp->syscallsp;
-		pc = gp->syscallpc;
-		lr = 0;
-	} else {
-		sp = gp->sched.sp;
-		pc = gp->sched.pc;
-		lr = gp->sched.lr;
-	}
-
-	dumpint(TagGoRoutine);
-	dumpint((uintptr)gp);
-	dumpint((uintptr)sp);
-	dumpint(gp->goid);
-	dumpint(gp->gopc);
-	dumpint(runtime·readgstatus(gp));
-	dumpbool(gp->issystem);
-	dumpbool(false);  // isbackground
-	dumpint(gp->waitsince);
-	dumpstr(gp->waitreason);
-	dumpint((uintptr)gp->sched.ctxt);
-	dumpint((uintptr)gp->m);
-	dumpint((uintptr)gp->defer);
-	dumpint((uintptr)gp->panic);
-
-	// dump stack
-	child.args.n = -1;
-	child.arglen = 0;
-	child.sp = nil;
-	child.depth = 0;
-	fn = dumpframe;
-	runtime·gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, &fn, &child, 0);
-
-	// dump defer & panic records
-	for(d = gp->defer; d != nil; d = d->link) {
-		dumpint(TagDefer);
-		dumpint((uintptr)d);
-		dumpint((uintptr)gp);
-		dumpint((uintptr)d->argp);
-		dumpint((uintptr)d->pc);
-		dumpint((uintptr)d->fn);
-		dumpint((uintptr)d->fn->fn);
-		dumpint((uintptr)d->link);
-	}
-	for (p = gp->panic; p != nil; p = p->link) {
-		dumpint(TagPanic);
-		dumpint((uintptr)p);
-		dumpint((uintptr)gp);
-		dumpint((uintptr)p->arg.type);
-		dumpint((uintptr)p->arg.data);
-		dumpint(0); // was p->defer, no longer recorded
-		dumpint((uintptr)p->link);
-	}
-}
-
-static void
-dumpgs(void)
-{
-	G *gp;
-	uint32 i;
-	uint32 status;
-
-	// goroutines & stacks
-	for(i = 0; i < runtime·allglen; i++) {
-		gp = runtime·allg[i];
-		status = runtime·readgstatus(gp); // The world is stopped so gp will not be in a scan state.
-		switch(status){
-		default:
-			runtime·printf("runtime: unexpected G.status %d\n", status);
-			runtime·throw("dumpgs in STW - bad status");
-		case Gdead:
-			break;
-		case Grunnable:
-		case Gsyscall:
-		case Gwaiting:
-			dumpgoroutine(gp);
-			break;
-		}
-	}
-}
-
-static void
-finq_callback(FuncVal *fn, byte *obj, uintptr nret, Type *fint, PtrType *ot)
-{
-	dumpint(TagQueuedFinalizer);
-	dumpint((uintptr)obj);
-	dumpint((uintptr)fn);
-	dumpint((uintptr)fn->fn);
-	dumpint((uintptr)fint);
-	dumpint((uintptr)ot);
-	USED(&nret);
-}
-
-
-static void
-dumproots(void)
-{
-	MSpan *s, **allspans;
-	uint32 spanidx;
-	Special *sp;
-	SpecialFinalizer *spf;
-	byte *p;
-
-	// data segment
-	dumpbvtypes(&runtime·gcdatamask, runtime·data);
-	dumpint(TagData);
-	dumpint((uintptr)runtime·data);
-	dumpmemrange(runtime·data, runtime·edata - runtime·data);
-	dumpfields(runtime·gcdatamask);
-
-	// bss segment
-	dumpbvtypes(&runtime·gcbssmask, runtime·bss);
-	dumpint(TagBss);
-	dumpint((uintptr)runtime·bss);
-	dumpmemrange(runtime·bss, runtime·ebss - runtime·bss);
-	dumpfields(runtime·gcbssmask);
-
-	// MSpan.types
-	allspans = runtime·mheap.allspans;
-	for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
-		s = allspans[spanidx];
-		if(s->state == MSpanInUse) {
-			// Finalizers
-			for(sp = s->specials; sp != nil; sp = sp->next) {
-				if(sp->kind != KindSpecialFinalizer)
-					continue;
-				spf = (SpecialFinalizer*)sp;
-				p = (byte*)((s->start << PageShift) + spf->special.offset);
-				dumpfinalizer(p, spf->fn, spf->fint, spf->ot);
-			}
-		}
-	}
-
-	// Finalizer queue
-	runtime·iterate_finq(finq_callback);
-}
-
-// Bit vector of free marks.	
-// Needs to be as big as the largest number of objects per span.	
-#pragma dataflag NOPTR
-static byte free[PageSize/8];	
-
-static void
-dumpobjs(void)
-{
-	uintptr i, j, size, n;
-	MSpan *s;
-	MLink *l;
-	byte *p;
-
-	for(i = 0; i < runtime·mheap.nspan; i++) {
-		s = runtime·mheap.allspans[i];
-		if(s->state != MSpanInUse)
-			continue;
-		p = (byte*)(s->start << PageShift);
-		size = s->elemsize;
-		n = (s->npages << PageShift) / size;
-		if(n > nelem(free))	
-			runtime·throw("free array doesn't have enough entries");	
-		for(l = s->freelist; l != nil; l = l->next)
-			free[((byte*)l - p) / size] = true;	
-		for(j = 0; j < n; j++, p += size) {
-			if(free[j]) {	
-				free[j] = false;	
-				continue;	
-			}
-			dumpobj(p, size, makeheapobjbv(p, size));
-		}
-	}
-}
-
-static void
-dumpparams(void)
-{
-	byte *x;
-
-	dumpint(TagParams);
-	x = (byte*)1;
-	if(*(byte*)&x == 1)
-		dumpbool(false); // little-endian ptrs
-	else
-		dumpbool(true); // big-endian ptrs
-	dumpint(PtrSize);
-	dumpint((uintptr)runtime·mheap.arena_start);
-	dumpint((uintptr)runtime·mheap.arena_used);
-	dumpint(thechar);
-	dumpcstr(GOEXPERIMENT);
-	dumpint(runtime·ncpu);
-}
-
-static void
-itab_callback(Itab *tab)
-{
-	Type *t;
-
-	t = tab->type;
-	// Dump a map from itab* to the type of its data field.
-	// We want this map so we can deduce types of interface referents.
-	if((t->kind & KindDirectIface) == 0) {
-		// indirect - data slot is a pointer to t.
-		dumptype(t->ptrto);
-		dumpint(TagItab);
-		dumpint((uintptr)tab);
-		dumpint((uintptr)t->ptrto);
-	} else if((t->kind & KindNoPointers) == 0) {
-		// t is pointer-like - data slot is a t.
-		dumptype(t);
-		dumpint(TagItab);
-		dumpint((uintptr)tab);
-		dumpint((uintptr)t);
-	} else {
-		// Data slot is a scalar.  Dump type just for fun.
-		// With pointer-only interfaces, this shouldn't happen.
-		dumptype(t);
-		dumpint(TagItab);
-		dumpint((uintptr)tab);
-		dumpint((uintptr)t);
-	}
-}
-
-static void
-dumpitabs(void)
-{
-	void (*fn)(Itab*);
-	
-	fn = itab_callback;
-	runtime·iterate_itabs(&fn);
-}
-
-static void
-dumpms(void)
-{
-	M *mp;
-
-	for(mp = runtime·allm; mp != nil; mp = mp->alllink) {
-		dumpint(TagOSThread);
-		dumpint((uintptr)mp);
-		dumpint(mp->id);
-		dumpint(mp->procid);
-	}
-}
-
-static void
-dumpmemstats(void)
-{
-	int32 i;
-
-	dumpint(TagMemStats);
-	dumpint(mstats.alloc);
-	dumpint(mstats.total_alloc);
-	dumpint(mstats.sys);
-	dumpint(mstats.nlookup);
-	dumpint(mstats.nmalloc);
-	dumpint(mstats.nfree);
-	dumpint(mstats.heap_alloc);
-	dumpint(mstats.heap_sys);
-	dumpint(mstats.heap_idle);
-	dumpint(mstats.heap_inuse);
-	dumpint(mstats.heap_released);
-	dumpint(mstats.heap_objects);
-	dumpint(mstats.stacks_inuse);
-	dumpint(mstats.stacks_sys);
-	dumpint(mstats.mspan_inuse);
-	dumpint(mstats.mspan_sys);
-	dumpint(mstats.mcache_inuse);
-	dumpint(mstats.mcache_sys);
-	dumpint(mstats.buckhash_sys);
-	dumpint(mstats.gc_sys);
-	dumpint(mstats.other_sys);
-	dumpint(mstats.next_gc);
-	dumpint(mstats.last_gc);
-	dumpint(mstats.pause_total_ns);
-	for(i = 0; i < 256; i++)
-		dumpint(mstats.pause_ns[i]);
-	dumpint(mstats.numgc);
-}
-
-static void
-dumpmemprof_callback(Bucket *b, uintptr nstk, uintptr *stk, uintptr size, uintptr allocs, uintptr frees)
-{
-	uintptr i, pc;
-	Func *f;
-	byte buf[20];
-	String file;
-	int32 line;
-
-	dumpint(TagMemProf);
-	dumpint((uintptr)b);
-	dumpint(size);
-	dumpint(nstk);
-	for(i = 0; i < nstk; i++) {
-		pc = stk[i];
-		f = runtime·findfunc(pc);
-		if(f == nil) {
-			runtime·snprintf(buf, sizeof(buf), "%X", (uint64)pc);
-			dumpcstr((int8*)buf);
-			dumpcstr("?");
-			dumpint(0);
-		} else {
-			dumpcstr(runtime·funcname(f));
-			// TODO: Why do we need to back up to a call instruction here?
-			// Maybe profiler should do this.
-			if(i > 0 && pc > f->entry) {
-				if(thechar == '6' || thechar == '8')
-					pc--;
-				else
-					pc -= 4; // arm, etc
-			}
-			line = runtime·funcline(f, pc, &file);
-			dumpstr(file);
-			dumpint(line);
-		}
-	}
-	dumpint(allocs);
-	dumpint(frees);
-}
-
-static void
-dumpmemprof(void)
-{
-	MSpan *s, **allspans;
-	uint32 spanidx;
-	Special *sp;
-	SpecialProfile *spp;
-	byte *p;
-	void (*fn)(Bucket*, uintptr, uintptr*, uintptr, uintptr, uintptr);
-	
-	fn = dumpmemprof_callback;
-	runtime·iterate_memprof(&fn);
-
-	allspans = runtime·mheap.allspans;
-	for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
-		s = allspans[spanidx];
-		if(s->state != MSpanInUse)
-			continue;
-		for(sp = s->specials; sp != nil; sp = sp->next) {
-			if(sp->kind != KindSpecialProfile)
-				continue;
-			spp = (SpecialProfile*)sp;
-			p = (byte*)((s->start << PageShift) + spp->special.offset);
-			dumpint(TagAllocSample);
-			dumpint((uintptr)p);
-			dumpint((uintptr)spp->b);
-		}
-	}
-}
-
-static void
-mdump(void)
-{
-	byte *hdr;
-	uintptr i;
-	MSpan *s;
-
-	// make sure we're done sweeping
-	for(i = 0; i < runtime·mheap.nspan; i++) {
-		s = runtime·mheap.allspans[i];
-		if(s->state == MSpanInUse)
-			runtime·MSpan_EnsureSwept(s);
-	}
-
-	runtime·memclr((byte*)&typecache[0], sizeof(typecache));
-	hdr = (byte*)"go1.4 heap dump\n";
-	write(hdr, runtime·findnull(hdr));
-	dumpparams();
-	dumpitabs();
-	dumpobjs();
-	dumpgs();
-	dumpms();
-	dumproots();
-	dumpmemstats();
-	dumpmemprof();
-	dumpint(TagEOF);
-	flush();
-}
-
-void
-runtime·writeheapdump_m(void)
-{
-	uintptr fd;
-	
-	fd = g->m->scalararg[0];
-	g->m->scalararg[0] = 0;
-
-	runtime·casgstatus(g->m->curg, Grunning, Gwaiting);
-	g->waitreason = runtime·gostringnocopy((byte*)"dumping heap");
-
-	// Update stats so we can dump them.
-	// As a side effect, flushes all the MCaches so the MSpan.freelist
-	// lists contain all the free objects.
-	runtime·updatememstats(nil);
-
-	// Set dump file.
-	dumpfd = fd;
-
-	// Call dump routine.
-	mdump();
-
-	// Reset dump file.
-	dumpfd = 0;
-	if(tmpbuf != nil) {
-		runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys);
-		tmpbuf = nil;
-		tmpbufsize = 0;
-	}
-
-	runtime·casgstatus(g->m->curg, Gwaiting, Grunning);
-}
-
-// dumpint() the kind & offset of each field in an object.
-static void
-dumpfields(BitVector bv)
-{
-	dumpbv(&bv, 0);
-	dumpint(FieldKindEol);
-}
-
-// The heap dump reader needs to be able to disambiguate
-// Eface entries.  So it needs to know every type that might
-// appear in such an entry.  The following routine accomplishes that.
-
-// Dump all the types that appear in the type field of
-// any Eface described by this bit vector.
-static void
-dumpbvtypes(BitVector *bv, byte *base)
-{
-	uintptr i;
-
-	for(i = 0; i < bv->n; i += BitsPerPointer) {
-		if((bv->bytedata[i/8] >> i%8 & 3) != BitsMultiWord)
-			continue;
-		switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) {
-		default:
-			runtime·throw("unexpected garbage collection bits");
-		case BitsIface:
-			i += BitsPerPointer;
-			break;
-		case BitsEface:
-			dumptype(*(Type**)(base + i / BitsPerPointer * PtrSize));
-			i += BitsPerPointer;
-			break;
-		}
-	}
-}
-
-static BitVector
-makeheapobjbv(byte *p, uintptr size)
-{
-	uintptr off, nptr, i;
-	byte shift, *bitp, bits;
-	bool mw;
-
-	// Extend the temp buffer if necessary.
-	nptr = size/PtrSize;
-	if(tmpbufsize < nptr*BitsPerPointer/8+1) {
-		if(tmpbuf != nil)
-			runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys);
-		tmpbufsize = nptr*BitsPerPointer/8+1;
-		tmpbuf = runtime·sysAlloc(tmpbufsize, &mstats.other_sys);
-		if(tmpbuf == nil)
-			runtime·throw("heapdump: out of memory");
-	}
-
-	// Copy and compact the bitmap.
-	mw = false;
-	for(i = 0; i < nptr; i++) {
-		off = (uintptr*)(p + i*PtrSize) - (uintptr*)runtime·mheap.arena_start;
-		bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
-		shift = (off % wordsPerBitmapByte) * gcBits;
-		bits = (*bitp >> (shift + 2)) & BitsMask;
-		if(!mw && bits == BitsDead)
-			break;  // end of heap object
-		mw = !mw && bits == BitsMultiWord;
-		tmpbuf[i*BitsPerPointer/8] &= ~(BitsMask<<((i*BitsPerPointer)%8));
-		tmpbuf[i*BitsPerPointer/8] |= bits<<((i*BitsPerPointer)%8);
-	}
-	return (BitVector){i*BitsPerPointer, tmpbuf};
-}
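
dumpparams (in the file above, and again in the Go version below) records
pointer byte order by storing 1 into a word and inspecting its first byte.
The same probe as a standalone sketch:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// littleEndian reports whether the low-order byte of a word is stored
	// first, which is what dumpparams checks before recording pointer
	// endianness in the dump.
	func littleEndian() bool {
		x := uintptr(1)
		return *(*byte)(unsafe.Pointer(&x)) == 1
	}

	func main() {
		fmt.Println("little-endian:", littleEndian())
	}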
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
new file mode 100644
index 0000000..01e70a3
--- /dev/null
+++ b/src/runtime/heapdump.go
@@ -0,0 +1,733 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implementation of runtime/debug.WriteHeapDump.  Writes all
+// objects in the heap plus additional info (roots, threads,
+// finalizers, etc.) to a file.
+
+// The format of the dumped file is described at
+// http://code.google.com/p/go-wiki/wiki/heapdump14
+
+package runtime
+
+import "unsafe"
+
+const (
+	fieldKindEol       = 0
+	fieldKindPtr       = 1
+	fieldKindIface     = 2
+	fieldKindEface     = 3
+	tagEOF             = 0
+	tagObject          = 1
+	tagOtherRoot       = 2
+	tagType            = 3
+	tagGoroutine       = 4
+	tagStackFrame      = 5
+	tagParams          = 6
+	tagFinalizer       = 7
+	tagItab            = 8
+	tagOSThread        = 9
+	tagMemStats        = 10
+	tagQueuedFinalizer = 11
+	tagData            = 12
+	tagBSS             = 13
+	tagDefer           = 14
+	tagPanic           = 15
+	tagMemProf         = 16
+	tagAllocSample     = 17
+)
+
+var dumpfd uintptr // fd to write the dump to.
+var tmpbuf []byte
+
+// buffer of pending write data
+const (
+	bufSize = 4096
+)
+
+var buf [bufSize]byte
+var nbuf uintptr
+
+func dwrite(data unsafe.Pointer, len uintptr) {
+	if len == 0 {
+		return
+	}
+	if nbuf+len <= bufSize {
+		copy(buf[nbuf:], (*[bufSize]byte)(data)[:len])
+		nbuf += len
+		return
+	}
+
+	write(dumpfd, (unsafe.Pointer)(&buf), int32(nbuf))
+	if len >= bufSize {
+		write(dumpfd, data, int32(len))
+		nbuf = 0
+	} else {
+		copy(buf[:], (*[bufSize]byte)(data)[:len])
+		nbuf = len
+	}
+}
+
+func dwritebyte(b byte) {
+	dwrite(unsafe.Pointer(&b), 1)
+}
+
+func flush() {
+	write(dumpfd, (unsafe.Pointer)(&buf), int32(nbuf))
+	nbuf = 0
+}
+
+// Cache of types that have been serialized already.
+// We use a type's hash field to pick a bucket.
+// Inside a bucket, we keep a list of types that
+// have been serialized so far, most recently used first.
+// Note: when a bucket overflows we may end up
+// serializing a type more than once.  That's ok.
+const (
+	typeCacheBuckets = 256
+	typeCacheAssoc   = 4
+)
+
+type typeCacheBucket struct {
+	t [typeCacheAssoc]*_type
+}
+
+var typecache [typeCacheBuckets]typeCacheBucket
+
+// dump a uint64 in a varint format parseable by encoding/binary
+func dumpint(v uint64) {
+	var buf [10]byte
+	var n int
+	for v >= 0x80 {
+		buf[n] = byte(v | 0x80)
+		n++
+		v >>= 7
+	}
+	buf[n] = byte(v)
+	n++
+	dwrite(unsafe.Pointer(&buf), uintptr(n))
+}
+
+func dumpbool(b bool) {
+	if b {
+		dumpint(1)
+	} else {
+		dumpint(0)
+	}
+}
+
+// dump varint uint64 length followed by memory contents
+func dumpmemrange(data unsafe.Pointer, len uintptr) {
+	dumpint(uint64(len))
+	dwrite(data, len)
+}
+
+func dumpslice(b []byte) {
+	dumpint(uint64(len(b)))
+	if len(b) > 0 {
+		dwrite(unsafe.Pointer(&b[0]), uintptr(len(b)))
+	}
+}
+
+func dumpstr(s string) {
+	sp := (*stringStruct)(unsafe.Pointer(&s))
+	dumpmemrange(sp.str, uintptr(sp.len))
+}
+
+// dump information for a type
+func dumptype(t *_type) {
+	if t == nil {
+		return
+	}
+
+	// If we've definitely serialized the type before,
+	// no need to do it again.
+	b := &typecache[t.hash&(typeCacheBuckets-1)]
+	if t == b.t[0] {
+		return
+	}
+	for i := 1; i < typeCacheAssoc; i++ {
+		if t == b.t[i] {
+			// Move-to-front
+			for j := i; j > 0; j-- {
+				b.t[j] = b.t[j-1]
+			}
+			b.t[0] = t
+			return
+		}
+	}
+
+	// Might not have been dumped yet.  Dump it and
+	// remember we did so.
+	for j := typeCacheAssoc - 1; j > 0; j-- {
+		b.t[j] = b.t[j-1]
+	}
+	b.t[0] = t
+
+	// dump the type
+	dumpint(tagType)
+	dumpint(uint64(uintptr(unsafe.Pointer(t))))
+	dumpint(uint64(t.size))
+	if t.x == nil || t.x.pkgpath == nil || t.x.name == nil {
+		dumpstr(*t._string)
+	} else {
+		pkgpath := (*stringStruct)(unsafe.Pointer(&t.x.pkgpath))
+		name := (*stringStruct)(unsafe.Pointer(&t.x.name))
+		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
+		dwrite(pkgpath.str, uintptr(pkgpath.len))
+		dwritebyte('.')
+		dwrite(name.str, uintptr(name.len))
+	}
+	dumpbool(t.kind&kindDirectIface == 0 || t.kind&kindNoPointers == 0)
+}
+
+// dump an object
+func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) {
+	dumpbvtypes(&bv, obj)
+	dumpint(tagObject)
+	dumpint(uint64(uintptr(obj)))
+	dumpmemrange(obj, size)
+	dumpfields(bv)
+}
+
+func dumpotherroot(description string, to unsafe.Pointer) {
+	dumpint(tagOtherRoot)
+	dumpstr(description)
+	dumpint(uint64(uintptr(to)))
+}
+
+func dumpfinalizer(obj unsafe.Pointer, fn *funcval, fint *_type, ot *ptrtype) {
+	dumpint(tagFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fint))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+type childInfo struct {
+	// Information passed up from the callee frame about
+	// the layout of the outargs region.
+	argoff uintptr   // where the arguments start in the frame
+	arglen uintptr   // size of args region
+	args   bitvector // if args.n >= 0, pointer map of args region
+	sp     *uint8    // callee sp
+	depth  uintptr   // depth in call stack (0 == most recent)
+}
+
+// dump kinds & offsets of interesting fields in bv
+func dumpbv(cbv *bitvector, offset uintptr) {
+	bv := gobv(*cbv)
+	for i := uintptr(0); i < uintptr(bv.n); i += bitsPerPointer {
+		switch bv.bytedata[i/8] >> (i % 8) & 3 {
+		default:
+			gothrow("unexpected pointer bits")
+		case _BitsDead:
+			// BitsDead has already been processed in makeheapobjbv.
+			// We should only see it in stack maps, in which case we should continue processing.
+		case _BitsScalar:
+			// ok
+		case _BitsPointer:
+			dumpint(fieldKindPtr)
+			dumpint(uint64(offset + i/_BitsPerPointer*ptrSize))
+		}
+	}
+}
+
+func dumpframe(s *stkframe, arg unsafe.Pointer) bool {
+	child := (*childInfo)(arg)
+	f := s.fn
+
+	// Figure out what we can about our stack map
+	pc := s.pc
+	if pc != f.entry {
+		pc--
+	}
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
+	if pcdata == -1 {
+		// We do not have a valid pcdata value but there might be a
+		// stackmap for this function.  It is likely that we are looking
+		// at the function prologue, assume so and hope for the best.
+		pcdata = 0
+	}
+	stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+
+	// Dump any types we will need to resolve Efaces.
+	if child.args.n >= 0 {
+		dumpbvtypes(&child.args, unsafe.Pointer(s.sp+child.argoff))
+	}
+	var bv bitvector
+	if stkmap != nil && stkmap.n > 0 {
+		bv = stackmapdata(stkmap, pcdata)
+		dumpbvtypes(&bv, unsafe.Pointer(s.varp-uintptr(bv.n/_BitsPerPointer*ptrSize)))
+	} else {
+		bv.n = -1
+	}
+
+	// Dump main body of stack frame.
+	dumpint(tagStackFrame)
+	dumpint(uint64(s.sp))                              // lowest address in frame
+	dumpint(uint64(child.depth))                       // # of frames deep on the stack
+	dumpint(uint64(uintptr(unsafe.Pointer(child.sp)))) // sp of child, or 0 if bottom of stack
+	dumpmemrange(unsafe.Pointer(s.sp), s.fp-s.sp)      // frame contents
+	dumpint(uint64(f.entry))
+	dumpint(uint64(s.pc))
+	dumpint(uint64(s.continpc))
+	name := gofuncname(f)
+	if name == "" {
+		name = "unknown function"
+	}
+	dumpstr(name)
+
+	// Dump fields in the outargs section
+	if child.args.n >= 0 {
+		dumpbv(&child.args, child.argoff)
+	} else {
+		// conservative - everything might be a pointer
+		for off := child.argoff; off < child.argoff+child.arglen; off += ptrSize {
+			dumpint(fieldKindPtr)
+			dumpint(uint64(off))
+		}
+	}
+
+	// Dump fields in the local vars section
+	if stkmap == nil {
+		// No locals information, dump everything.
+		for off := child.arglen; off < s.varp-s.sp; off += ptrSize {
+			dumpint(fieldKindPtr)
+			dumpint(uint64(off))
+		}
+	} else if stkmap.n < 0 {
+		// Locals size information, dump just the locals.
+		size := uintptr(-stkmap.n)
+		for off := s.varp - size - s.sp; off < s.varp-s.sp; off += ptrSize {
+			dumpint(fieldKindPtr)
+			dumpint(uint64(off))
+		}
+	} else if stkmap.n > 0 {
+		// Locals bitmap information, scan just the pointers in
+		// locals.
+		dumpbv(&bv, s.varp-uintptr(bv.n)/_BitsPerPointer*ptrSize-s.sp)
+	}
+	dumpint(fieldKindEol)
+
+	// Record arg info for parent.
+	child.argoff = s.argp - s.fp
+	child.arglen = s.arglen
+	child.sp = (*uint8)(unsafe.Pointer(s.sp))
+	child.depth++
+	stkmap = (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
+	if stkmap != nil {
+		child.args = stackmapdata(stkmap, pcdata)
+	} else {
+		child.args.n = -1
+	}
+	return true
+}
+
+func dumpgoroutine(gp *g) {
+	var sp, pc, lr uintptr
+	if gp.syscallsp != 0 {
+		sp = gp.syscallsp
+		pc = gp.syscallpc
+		lr = 0
+	} else {
+		sp = gp.sched.sp
+		pc = gp.sched.pc
+		lr = gp.sched.lr
+	}
+
+	dumpint(tagGoroutine)
+	dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+	dumpint(uint64(sp))
+	dumpint(uint64(gp.goid))
+	dumpint(uint64(gp.gopc))
+	dumpint(uint64(readgstatus(gp)))
+	dumpbool(gp.issystem)
+	dumpbool(false) // isbackground
+	dumpint(uint64(gp.waitsince))
+	dumpstr(gp.waitreason)
+	dumpint(uint64(uintptr(gp.sched.ctxt)))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp.m))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._defer))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._panic))))
+
+	// dump stack
+	var child childInfo
+	child.args.n = -1
+	child.arglen = 0
+	child.sp = nil
+	child.depth = 0
+	gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, dumpframe, noescape(unsafe.Pointer(&child)), 0)
+
+	// dump defer & panic records
+	for d := gp._defer; d != nil; d = d.link {
+		dumpint(tagDefer)
+		dumpint(uint64(uintptr(unsafe.Pointer(d))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		dumpint(uint64(d.argp))
+		dumpint(uint64(d.pc))
+		dumpint(uint64(uintptr(unsafe.Pointer(d.fn))))
+		dumpint(uint64(uintptr(unsafe.Pointer(d.fn.fn))))
+		dumpint(uint64(uintptr(unsafe.Pointer(d.link))))
+	}
+	for p := gp._panic; p != nil; p = p.link {
+		dumpint(tagPanic)
+		dumpint(uint64(uintptr(unsafe.Pointer(p))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		eface := (*eface)(unsafe.Pointer(&p.arg))
+		dumpint(uint64(uintptr(unsafe.Pointer(eface._type))))
+		dumpint(uint64(uintptr(unsafe.Pointer(eface.data))))
+		dumpint(0) // was p->defer, no longer recorded
+		dumpint(uint64(uintptr(unsafe.Pointer(p.link))))
+	}
+}
+
+func dumpgs() {
+	// goroutines & stacks
+	for i := 0; uintptr(i) < allglen; i++ {
+		gp := allgs[i]
+		status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
+		switch status {
+		default:
+			print("runtime: unexpected G.status ", hex(status), "\n")
+			gothrow("dumpgs in STW - bad status")
+		case _Gdead:
+			// ok
+		case _Grunnable,
+			_Gsyscall,
+			_Gwaiting:
+			dumpgoroutine(gp)
+		}
+	}
+}
+
+func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, ot *ptrtype) {
+	dumpint(tagQueuedFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fint))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+func dumproots() {
+	// data segment
+	dumpbvtypes(&gcdatamask, unsafe.Pointer(&data))
+	dumpint(tagData)
+	dumpint(uint64(uintptr(unsafe.Pointer(&data))))
+	dumpmemrange(unsafe.Pointer(&data), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)))
+	dumpfields(gcdatamask)
+
+	// bss segment
+	dumpbvtypes(&gcbssmask, unsafe.Pointer(&bss))
+	dumpint(tagBSS)
+	dumpint(uint64(uintptr(unsafe.Pointer(&bss))))
+	dumpmemrange(unsafe.Pointer(&bss), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
+	dumpfields(gcbssmask)
+
+	// MSpan.types
+	allspans := h_allspans
+	for spanidx := uint32(0); spanidx < mheap_.nspan; spanidx++ {
+		s := allspans[spanidx]
+		if s.state == _MSpanInUse {
+			// Finalizers
+			for sp := s.specials; sp != nil; sp = sp.next {
+				if sp.kind != _KindSpecialFinalizer {
+					continue
+				}
+				spf := (*specialfinalizer)(unsafe.Pointer(sp))
+				p := unsafe.Pointer((uintptr(s.start) << _PageShift) + uintptr(spf.special.offset))
+				dumpfinalizer(p, spf.fn, spf.fint, spf.ot)
+			}
+		}
+	}
+
+	// Finalizer queue
+	iterate_finq(finq_callback)
+}
+
+// Bit vector of free marks.
+// Needs to be as big as the largest number of objects per span.
+var freemark [_PageSize / 8]bool
+
+func dumpobjs() {
+	for i := uintptr(0); i < uintptr(mheap_.nspan); i++ {
+		s := h_allspans[i]
+		if s.state != _MSpanInUse {
+			continue
+		}
+		p := uintptr(s.start << _PageShift)
+		size := s.elemsize
+		n := (s.npages << _PageShift) / size
+		if n > uintptr(len(freemark)) {
+			gothrow("freemark array doesn't have enough entries")
+		}
+		for l := s.freelist; l != nil; l = l.next {
+			freemark[(uintptr(unsafe.Pointer(l))-p)/size] = true
+		}
+		for j := uintptr(0); j < n; j, p = j+1, p+size {
+			if freemark[j] {
+				freemark[j] = false
+				continue
+			}
+			dumpobj(unsafe.Pointer(p), size, makeheapobjbv(p, size))
+		}
+	}
+}
+
+func dumpparams() {
+	dumpint(tagParams)
+	x := uintptr(1)
+	if *(*byte)(unsafe.Pointer(&x)) == 1 {
+		dumpbool(false) // little-endian ptrs
+	} else {
+		dumpbool(true) // big-endian ptrs
+	}
+	dumpint(ptrSize)
+	dumpint(uint64(mheap_.arena_start))
+	dumpint(uint64(mheap_.arena_used))
+	dumpint(thechar)
+	dumpstr(goexperiment)
+	dumpint(uint64(ncpu))
+}
+
+func itab_callback(tab *itab) {
+	t := tab._type
+	// Dump a map from itab* to the type of its data field.
+	// We want this map so we can deduce types of interface referents.
+	if t.kind&kindDirectIface == 0 {
+		// indirect - data slot is a pointer to t.
+		dumptype(t.ptrto)
+		dumpint(tagItab)
+		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
+		dumpint(uint64(uintptr(unsafe.Pointer(t.ptrto))))
+	} else if t.kind&kindNoPointers == 0 {
+		// t is pointer-like - data slot is a t.
+		dumptype(t)
+		dumpint(tagItab)
+		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
+		dumpint(uint64(uintptr(unsafe.Pointer(t))))
+	} else {
+		// Data slot is a scalar.  Dump type just for fun.
+		// With pointer-only interfaces, this shouldn't happen.
+		dumptype(t)
+		dumpint(tagItab)
+		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
+		dumpint(uint64(uintptr(unsafe.Pointer(t))))
+	}
+}
+
+func dumpitabs() {
+	iterate_itabs(itab_callback)
+}
+
+func dumpms() {
+	for mp := allm; mp != nil; mp = mp.alllink {
+		dumpint(tagOSThread)
+		dumpint(uint64(uintptr(unsafe.Pointer(mp))))
+		dumpint(uint64(mp.id))
+		dumpint(mp.procid)
+	}
+}
+
+func dumpmemstats() {
+	dumpint(tagMemStats)
+	dumpint(memstats.alloc)
+	dumpint(memstats.total_alloc)
+	dumpint(memstats.sys)
+	dumpint(memstats.nlookup)
+	dumpint(memstats.nmalloc)
+	dumpint(memstats.nfree)
+	dumpint(memstats.heap_alloc)
+	dumpint(memstats.heap_sys)
+	dumpint(memstats.heap_idle)
+	dumpint(memstats.heap_inuse)
+	dumpint(memstats.heap_released)
+	dumpint(memstats.heap_objects)
+	dumpint(memstats.stacks_inuse)
+	dumpint(memstats.stacks_sys)
+	dumpint(memstats.mspan_inuse)
+	dumpint(memstats.mspan_sys)
+	dumpint(memstats.mcache_inuse)
+	dumpint(memstats.mcache_sys)
+	dumpint(memstats.buckhash_sys)
+	dumpint(memstats.gc_sys)
+	dumpint(memstats.other_sys)
+	dumpint(memstats.next_gc)
+	dumpint(memstats.last_gc)
+	dumpint(memstats.pause_total_ns)
+	for i := 0; i < 256; i++ {
+		dumpint(memstats.pause_ns[i])
+	}
+	dumpint(uint64(memstats.numgc))
+}
+
+func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) {
+	stk := (*[100000]uintptr)(unsafe.Pointer(pstk))
+	dumpint(tagMemProf)
+	dumpint(uint64(uintptr(unsafe.Pointer(b))))
+	dumpint(uint64(size))
+	dumpint(uint64(nstk))
+	for i := uintptr(0); i < nstk; i++ {
+		pc := stk[i]
+		f := findfunc(pc)
+		if f == nil {
+			var buf [64]byte
+			n := len(buf)
+			n--
+			buf[n] = ')'
+			if pc == 0 {
+				n--
+				buf[n] = '0'
+			} else {
+				for pc > 0 {
+					n--
+					buf[n] = "0123456789abcdef"[pc&15]
+					pc >>= 4
+				}
+			}
+			n--
+			buf[n] = 'x'
+			n--
+			buf[n] = '0'
+			n--
+			buf[n] = '('
+			dumpslice(buf[n:])
+			dumpstr("?")
+			dumpint(0)
+		} else {
+			dumpstr(gofuncname(f))
+			if i > 0 && pc > f.entry {
+				pc--
+			}
+			var file string
+			line := funcline(f, pc, &file)
+			dumpstr(file)
+			dumpint(uint64(line))
+		}
+	}
+	dumpint(uint64(allocs))
+	dumpint(uint64(frees))
+}
+
+func dumpmemprof() {
+	iterate_memprof(dumpmemprof_callback)
+	allspans := h_allspans
+	for spanidx := uint32(0); spanidx < mheap_.nspan; spanidx++ {
+		s := allspans[spanidx]
+		if s.state != _MSpanInUse {
+			continue
+		}
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialProfile {
+				continue
+			}
+			spp := (*specialprofile)(unsafe.Pointer(sp))
+			p := uintptr(s.start<<_PageShift) + uintptr(spp.special.offset)
+			dumpint(tagAllocSample)
+			dumpint(uint64(p))
+			dumpint(uint64(uintptr(unsafe.Pointer(spp.b))))
+		}
+	}
+}
+
+var dumphdr = []byte("go1.4 heap dump\n")
+
+func mdump() {
+	// make sure we're done sweeping
+	for i := uintptr(0); i < uintptr(mheap_.nspan); i++ {
+		s := h_allspans[i]
+		if s.state == _MSpanInUse {
+			mSpan_EnsureSwept(s)
+		}
+	}
+	memclr(unsafe.Pointer(&typecache), unsafe.Sizeof(typecache))
+	dwrite(unsafe.Pointer(&dumphdr[0]), uintptr(len(dumphdr)))
+	dumpparams()
+	dumpitabs()
+	dumpobjs()
+	dumpgs()
+	dumpms()
+	dumproots()
+	dumpmemstats()
+	dumpmemprof()
+	dumpint(tagEOF)
+	flush()
+}
+
+func writeheapdump_m() {
+	_g_ := getg()
+	fd := _g_.m.scalararg[0]
+	_g_.m.scalararg[0] = 0
+
+	casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
+	_g_.waitreason = "dumping heap"
+
+	// Update stats so we can dump them.
+	// As a side effect, flushes all the MCaches so the MSpan.freelist
+	// lists contain all the free objects.
+	updatememstats(nil)
+
+	// Set dump file.
+	dumpfd = fd
+
+	// Call dump routine.
+	mdump()
+
+	// Reset dump file.
+	dumpfd = 0
+	if tmpbuf != nil {
+		sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		tmpbuf = nil
+	}
+
+	casgstatus(_g_.m.curg, _Gwaiting, _Grunning)
+}
+
+// dumpint() the kind & offset of each field in an object.
+func dumpfields(bv bitvector) {
+	dumpbv(&bv, 0)
+	dumpint(fieldKindEol)
+}
+
+// The heap dump reader needs to be able to disambiguate
+// Eface entries.  So it needs to know every type that might
+// appear in such an entry.  The following routine accomplishes that.
+// TODO(rsc, khr): Delete - no longer possible.
+
+// Dump all the types that appear in the type field of
+// any Eface described by this bit vector.
+func dumpbvtypes(bv *bitvector, base unsafe.Pointer) {
+}
+
+func makeheapobjbv(p uintptr, size uintptr) bitvector {
+	// Extend the temp buffer if necessary.
+	nptr := size / ptrSize
+	if uintptr(len(tmpbuf)) < nptr*_BitsPerPointer/8+1 {
+		if tmpbuf != nil {
+			sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		}
+		n := nptr*_BitsPerPointer/8 + 1
+		p := sysAlloc(n, &memstats.other_sys)
+		if p == nil {
+			gothrow("heapdump: out of memory")
+		}
+		tmpbuf = (*[1 << 30]byte)(p)[:n]
+	}
+	// Copy and compact the bitmap.
+	var i uintptr
+	for i = 0; i < nptr; i++ {
+		off := (p + i*ptrSize - mheap_.arena_start) / ptrSize
+		bitp := (*uint8)(unsafe.Pointer(mheap_.arena_start - off/wordsPerBitmapByte - 1))
+		shift := uint8((off % wordsPerBitmapByte) * gcBits)
+		bits := (*bitp >> (shift + 2)) & _BitsMask
+		if bits == _BitsDead {
+			break // end of heap object
+		}
+		tmpbuf[i*_BitsPerPointer/8] &^= (_BitsMask << ((i * _BitsPerPointer) % 8))
+		tmpbuf[i*_BitsPerPointer/8] |= bits << ((i * _BitsPerPointer) % 8)
+	}
+	return bitvector{int32(i * _BitsPerPointer), &tmpbuf[0]}
+}
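
The header comment above promises that every field dumpint writes is a varint
parseable by encoding/binary, and mdump writes the params record first. A
minimal sketch of reading the start of a dump back (the file path is
hypothetical):

	package main

	import (
		"bufio"
		"encoding/binary"
		"fmt"
		"io"
		"os"
	)

	func main() {
		f, err := os.Open("heapdump") // hypothetical path
		if err != nil {
			panic(err)
		}
		defer f.Close()
		r := bufio.NewReader(f)

		// Skip the fixed "go1.4 heap dump\n" header.
		if _, err := io.ReadFull(r, make([]byte, len("go1.4 heap dump\n"))); err != nil {
			panic(err)
		}

		// The first record mdump writes is dumpparams, so the first
		// varint should be tagParams (6).
		tag, err := binary.ReadUvarint(r)
		if err != nil {
			panic(err)
		}
		fmt.Println("first tag:", tag)
	}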
diff --git a/src/runtime/malloc.c b/src/runtime/malloc.c
deleted file mode 100644
index b79c30b..0000000
--- a/src/runtime/malloc.c
+++ /dev/null
@@ -1,396 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// See malloc.h for overview.
-//
-// TODO(rsc): double-check stats.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-#include "type.h"
-#include "typekind.h"
-#include "race.h"
-#include "stack.h"
-#include "textflag.h"
-
-// Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K.
-#pragma dataflag NOPTR
-MHeap runtime·mheap;
-#pragma dataflag NOPTR
-MStats runtime·memstats;
-
-int32
-runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
-{
-	uintptr n, i;
-	byte *p;
-	MSpan *s;
-
-	g->m->mcache->local_nlookup++;
-	if (sizeof(void*) == 4 && g->m->mcache->local_nlookup >= (1<<30)) {
-		// purge cache stats to prevent overflow
-		runtime·lock(&runtime·mheap.lock);
-		runtime·purgecachedstats(g->m->mcache);
-		runtime·unlock(&runtime·mheap.lock);
-	}
-
-	s = runtime·MHeap_LookupMaybe(&runtime·mheap, v);
-	if(sp)
-		*sp = s;
-	if(s == nil) {
-		if(base)
-			*base = nil;
-		if(size)
-			*size = 0;
-		return 0;
-	}
-
-	p = (byte*)((uintptr)s->start<<PageShift);
-	if(s->sizeclass == 0) {
-		// Large object.
-		if(base)
-			*base = p;
-		if(size)
-			*size = s->npages<<PageShift;
-		return 1;
-	}
-
-	n = s->elemsize;
-	if(base) {
-		i = ((byte*)v - p)/n;
-		*base = p + i*n;
-	}
-	if(size)
-		*size = n;
-
-	return 1;
-}
-
-#pragma textflag NOSPLIT
-void
-runtime·purgecachedstats(MCache *c)
-{
-	MHeap *h;
-	int32 i;
-
-	// Protected by either heap or GC lock.
-	h = &runtime·mheap;
-	mstats.heap_alloc += c->local_cachealloc;
-	c->local_cachealloc = 0;
-	mstats.tinyallocs += c->local_tinyallocs;
-	c->local_tinyallocs = 0;
-	mstats.nlookup += c->local_nlookup;
-	c->local_nlookup = 0;
-	h->largefree += c->local_largefree;
-	c->local_largefree = 0;
-	h->nlargefree += c->local_nlargefree;
-	c->local_nlargefree = 0;
-	for(i=0; i<nelem(c->local_nsmallfree); i++) {
-		h->nsmallfree[i] += c->local_nsmallfree[i];
-		c->local_nsmallfree[i] = 0;
-	}
-}
-
-// Size of the trailing by_size array differs between Go and C,
-// and all data after by_size is local to C, not exported to Go.
-// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
-// sizeof_C_MStats is what C thinks about size of Go struct.
-uintptr runtime·sizeof_C_MStats = offsetof(MStats, by_size[61]);
-
-#define MaxArena32 (2U<<30)
-
-// For use by Go. If it were a C enum it would be made available automatically,
-// but the value of MaxMem is too large for enum.
-uintptr runtime·maxmem = MaxMem;
-
-void
-runtime·mallocinit(void)
-{
-	byte *p, *p1;
-	uintptr arena_size, bitmap_size, spans_size, p_size;
-	extern byte runtime·end[];
-	uintptr limit;
-	uint64 i;
-	bool reserved;
-
-	p = nil;
-	p_size = 0;
-	arena_size = 0;
-	bitmap_size = 0;
-	spans_size = 0;
-	reserved = false;
-
-	// for 64-bit build
-	USED(p);
-	USED(p_size);
-	USED(arena_size);
-	USED(bitmap_size);
-	USED(spans_size);
-
-	runtime·InitSizes();
-
-	if(runtime·class_to_size[TinySizeClass] != TinySize)
-		runtime·throw("bad TinySizeClass");
-
-	// limit = runtime·memlimit();
-	// See https://code.google.com/p/go/issues/detail?id=5049
-	// TODO(rsc): Fix after 1.1.
-	limit = 0;
-
-	// Set up the allocation arena, a contiguous area of memory where
-	// allocated data will be found.  The arena begins with a bitmap large
-	// enough to hold 4 bits per allocated word.
-	if(sizeof(void*) == 8 && (limit == 0 || limit > (1<<30))) {
-		// On a 64-bit machine, allocate from a single contiguous reservation.
-		// 128 GB (MaxMem) should be big enough for now.
-		//
-		// The code will work with the reservation at any address, but ask
-		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
-		// Allocating a 128 GB region takes away 37 bits, and the amd64
-		// doesn't let us choose the top 17 bits, so that leaves the 11 bits
-		// in the middle of 0x00c0 for us to choose.  Choosing 0x00c0 means
-		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
-		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
-		// UTF-8 sequences, and they are otherwise as far away from 
-		// ff (likely a common byte) as possible.  If that fails, we try other 0xXXc0
-		// addresses.  An earlier attempt to use 0x11f8 caused out of memory errors
-		// on OS X during thread allocations.  0x00c0 causes conflicts with
-		// AddressSanitizer which reserves all memory up to 0x0100.
-		// These choices are both for debuggability and to reduce the
-		// odds of the conservative garbage collector not collecting memory
-		// because some non-pointer block of memory had a bit pattern
-		// that matched a memory address.
-		//
-		// Actually we reserve 136 GB (because the bitmap ends up being 8 GB)
-		// but it hardly matters: e0 00 is not valid UTF-8 either.
-		//
-		// If this fails we fall back to the 32 bit memory mechanism
-		arena_size = MaxMem;
-		bitmap_size = arena_size / (sizeof(void*)*8/4);
-		spans_size = arena_size / PageSize * sizeof(runtime·mheap.spans[0]);
-		spans_size = ROUND(spans_size, PageSize);
-		for(i = 0; i <= 0x7f; i++) {
-			p = (void*)(i<<40 | 0x00c0ULL<<32);
-			p_size = bitmap_size + spans_size + arena_size + PageSize;
-			p = runtime·SysReserve(p, p_size, &reserved);
-			if(p != nil)
-				break;
-		}
-	}
-	if (p == nil) {
-		// On a 32-bit machine, we can't typically get away
-		// with a giant virtual address space reservation.
-		// Instead we map the memory information bitmap
-		// immediately after the data segment, large enough
-		// to handle another 2GB of mappings (256 MB),
-		// along with a reservation for another 512 MB of memory.
-		// When that gets used up, we'll start asking the kernel
-		// for any memory anywhere and hope it's in the 2GB
-		// following the bitmap (presumably the executable begins
-		// near the bottom of memory, so we'll have to use up
-		// most of memory before the kernel resorts to giving out
-		// memory before the beginning of the text segment).
-		//
-		// Alternatively we could reserve 512 MB bitmap, enough
-		// for 4GB of mappings, and then accept any memory the
-		// kernel threw at us, but normally that's a waste of 512 MB
-		// of address space, which is probably too much in a 32-bit world.
-		bitmap_size = MaxArena32 / (sizeof(void*)*8/4);
-		arena_size = 512<<20;
-		spans_size = MaxArena32 / PageSize * sizeof(runtime·mheap.spans[0]);
-		if(limit > 0 && arena_size+bitmap_size+spans_size > limit) {
-			bitmap_size = (limit / 9) & ~((1<<PageShift) - 1);
-			arena_size = bitmap_size * 8;
-			spans_size = arena_size / PageSize * sizeof(runtime·mheap.spans[0]);
-		}
-		spans_size = ROUND(spans_size, PageSize);
-
-		// SysReserve treats the address we ask for, end, as a hint,
-		// not as an absolute requirement.  If we ask for the end
-		// of the data segment but the operating system requires
-		// a little more space before we can start allocating, it will
-		// give out a slightly higher pointer.  Except QEMU, which
-		// is buggy, as usual: it won't adjust the pointer upward.
-		// So adjust it upward a little bit ourselves: 1/4 MB to get
-		// away from the running binary image and then round up
-		// to a MB boundary.
-		p = (byte*)ROUND((uintptr)runtime·end + (1<<18), 1<<20);
-		p_size = bitmap_size + spans_size + arena_size + PageSize;
-		p = runtime·SysReserve(p, p_size, &reserved);
-		if(p == nil)
-			runtime·throw("runtime: cannot reserve arena virtual address space");
-	}
-
-	// PageSize can be larger than OS definition of page size,
-	// so SysReserve can give us a PageSize-unaligned pointer.
-	// To overcome this we ask for PageSize more and round up the pointer.
-	p1 = (byte*)ROUND((uintptr)p, PageSize);
-
-	runtime·mheap.spans = (MSpan**)p1;
-	runtime·mheap.bitmap = p1 + spans_size;
-	runtime·mheap.arena_start = p1 + spans_size + bitmap_size;
-	runtime·mheap.arena_used = runtime·mheap.arena_start;
-	runtime·mheap.arena_end = p + p_size;
-	runtime·mheap.arena_reserved = reserved;
-
-	if(((uintptr)runtime·mheap.arena_start & (PageSize-1)) != 0)
-		runtime·throw("misrounded allocation in mallocinit");
-
-	// Initialize the rest of the allocator.	
-	runtime·MHeap_Init(&runtime·mheap);
-	g->m->mcache = runtime·allocmcache();
-}
-
-void*
-runtime·MHeap_SysAlloc(MHeap *h, uintptr n)
-{
-	byte *p, *p_end;
-	uintptr p_size;
-	bool reserved;
-
-	if(n > h->arena_end - h->arena_used) {
-		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
-		// Reserve some more space.
-		byte *new_end;
-
-		p_size = ROUND(n + PageSize, 256<<20);
-		new_end = h->arena_end + p_size;
-		if(new_end <= h->arena_start + MaxArena32) {
-			// TODO: It would be bad if part of the arena
-			// is reserved and part is not.
-			p = runtime·SysReserve(h->arena_end, p_size, &reserved);
-			if(p == h->arena_end) {
-				h->arena_end = new_end;
-				h->arena_reserved = reserved;
-			}
-			else if(p+p_size <= h->arena_start + MaxArena32) {
-				// Keep everything page-aligned.
-				// Our pages are bigger than hardware pages.
-				h->arena_end = p+p_size;
-				h->arena_used = p + (-(uintptr)p&(PageSize-1));
-				h->arena_reserved = reserved;
-			} else {
-				uint64 stat;
-				stat = 0;
-				runtime·SysFree(p, p_size, &stat);
-			}
-		}
-	}
-	if(n <= h->arena_end - h->arena_used) {
-		// Keep taking from our reservation.
-		p = h->arena_used;
-		runtime·SysMap(p, n, h->arena_reserved, &mstats.heap_sys);
-		h->arena_used += n;
-		runtime·MHeap_MapBits(h);
-		runtime·MHeap_MapSpans(h);
-		if(raceenabled)
-			runtime·racemapshadow(p, n);
-		
-		if(((uintptr)p & (PageSize-1)) != 0)
-			runtime·throw("misrounded allocation in MHeap_SysAlloc");
-		return p;
-	}
-	
-	// If using 64-bit, our reservation is all we have.
-	if(h->arena_end - h->arena_start >= MaxArena32)
-		return nil;
-
-	// On 32-bit, once the reservation is gone we can
-	// try to get memory at a location chosen by the OS
-	// and hope that it is in the range we allocated bitmap for.
-	p_size = ROUND(n, PageSize) + PageSize;
-	p = runtime·sysAlloc(p_size, &mstats.heap_sys);
-	if(p == nil)
-		return nil;
-
-	if(p < h->arena_start || p+p_size - h->arena_start >= MaxArena32) {
-		runtime·printf("runtime: memory allocated by OS (%p) not in usable range [%p,%p)\n",
-			p, h->arena_start, h->arena_start+MaxArena32);
-		runtime·SysFree(p, p_size, &mstats.heap_sys);
-		return nil;
-	}
-	
-	p_end = p + p_size;
-	p += -(uintptr)p & (PageSize-1);
-	if(p+n > h->arena_used) {
-		h->arena_used = p+n;
-		if(p_end > h->arena_end)
-			h->arena_end = p_end;
-		runtime·MHeap_MapBits(h);
-		runtime·MHeap_MapSpans(h);
-		if(raceenabled)
-			runtime·racemapshadow(p, n);
-	}
-	
-	if(((uintptr)p & (PageSize-1)) != 0)
-		runtime·throw("misrounded allocation in MHeap_SysAlloc");
-	return p;
-}
-
-void
-runtime·setFinalizer_m(void)
-{
-	FuncVal *fn;
-	void *arg;
-	uintptr nret;
-	Type *fint;
-	PtrType *ot;
-
-	fn = g->m->ptrarg[0];
-	arg = g->m->ptrarg[1];
-	nret = g->m->scalararg[0];
-	fint = g->m->ptrarg[2];
-	ot = g->m->ptrarg[3];
-	g->m->ptrarg[0] = nil;
-	g->m->ptrarg[1] = nil;
-	g->m->ptrarg[2] = nil;
-	g->m->ptrarg[3] = nil;
-
-	g->m->scalararg[0] = runtime·addfinalizer(arg, fn, nret, fint, ot);
-}
-
-void
-runtime·removeFinalizer_m(void)
-{
-	void *p;
-
-	p = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-	runtime·removefinalizer(p);
-}
-
-// mcallable cache refill
-void 
-runtime·mcacheRefill_m(void)
-{
-	runtime·MCache_Refill(g->m->mcache, (int32)g->m->scalararg[0]);
-}
-
-void
-runtime·largeAlloc_m(void)
-{
-	uintptr npages, size;
-	MSpan *s;
-	void *v;
-	int32 flag;
-
-	//runtime·printf("largeAlloc size=%D\n", g->m->scalararg[0]);
-	// Allocate directly from heap.
-	size = g->m->scalararg[0];
-	flag = (int32)g->m->scalararg[1];
-	if(size + PageSize < size)
-		runtime·throw("out of memory");
-	npages = size >> PageShift;
-	if((size & PageMask) != 0)
-		npages++;
-	s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
-	if(s == nil)
-		runtime·throw("out of memory");
-	s->limit = (byte*)(s->start<<PageShift) + size;
-	v = (void*)(s->start << PageShift);
-	// setup for mark sweep
-	runtime·markspan(v, 0, 0, true);
-	g->m->ptrarg[0] = s;
-}
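
The reservation loop in mallocinit above tries the addresses
0x0000XXc000000000 for XX = 00..7f, exactly as the long comment describes.
The candidates can be enumerated with the same expression:

	package main

	import "fmt"

	func main() {
		// Same computation as the reservation loop: p = i<<40 | 0x00c0<<32
		// for i = 0..0x7f (only the first few shown here).
		for i := uint64(0); i <= 3; i++ {
			fmt.Printf("%#x\n", i<<40|0xc0<<32)
		}
		// prints 0xc000000000, 0x1c000000000, 0x2c000000000, 0x3c000000000
	}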
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 8cf1c3d..a117245 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -28,10 +28,11 @@
 	maxGCMask       = _MaxGCMask
 	bitsDead        = _BitsDead
 	bitsPointer     = _BitsPointer
+	bitsScalar      = _BitsScalar
 
 	mSpanInUse = _MSpanInUse
 
-	concurrentSweep = _ConcurrentSweep != 0
+	concurrentSweep = _ConcurrentSweep
 )
 
 // Page number (address>>pageShift)
@@ -142,10 +143,9 @@
 			s = c.alloc[tinySizeClass]
 			v := s.freelist
 			if v == nil {
-				mp := acquirem()
-				mp.scalararg[0] = tinySizeClass
-				onM(mcacheRefill_m)
-				releasem(mp)
+				onM(func() {
+					mCache_Refill(c, tinySizeClass)
+				})
 				s = c.alloc[tinySizeClass]
 				v = s.freelist
 			}
@@ -173,10 +173,9 @@
 			s = c.alloc[sizeclass]
 			v := s.freelist
 			if v == nil {
-				mp := acquirem()
-				mp.scalararg[0] = uintptr(sizeclass)
-				onM(mcacheRefill_m)
-				releasem(mp)
+				onM(func() {
+					mCache_Refill(c, int32(sizeclass))
+				})
 				s = c.alloc[sizeclass]
 				v = s.freelist
 			}
@@ -193,13 +192,10 @@
 		}
 		c.local_cachealloc += intptr(size)
 	} else {
-		mp := acquirem()
-		mp.scalararg[0] = uintptr(size)
-		mp.scalararg[1] = uintptr(flags)
-		onM(largeAlloc_m)
-		s = (*mspan)(mp.ptrarg[0])
-		mp.ptrarg[0] = nil
-		releasem(mp)
+		var s *mspan
+		onM(func() {
+			s = largeAlloc(size, uint32(flags))
+		})
 		x = unsafe.Pointer(uintptr(s.start << pageShift))
 		size = uintptr(s.elemsize)
 	}
@@ -359,7 +355,7 @@
 	if typ.kind&kindNoPointers != 0 {
 		flags |= flagNoScan
 	}
-	if int(n) < 0 || (typ.size > 0 && n > maxmem/uintptr(typ.size)) {
+	if int(n) < 0 || (typ.size > 0 && n > _MaxMem/uintptr(typ.size)) {
 		panic("runtime: allocation size out of range")
 	}
 	return mallocgc(uintptr(typ.size)*n, typ, flags)
@@ -585,10 +581,9 @@
 	ftyp := f._type
 	if ftyp == nil {
 		// switch to M stack and remove finalizer
-		mp := acquirem()
-		mp.ptrarg[0] = e.data
-		onM(removeFinalizer_m)
-		releasem(mp)
+		onM(func() {
+			removefinalizer(e.data)
+		})
 		return
 	}
 
@@ -633,18 +628,11 @@
 	// make sure we have a finalizer goroutine
 	createfing()
 
-	// switch to M stack to add finalizer record
-	mp := acquirem()
-	mp.ptrarg[0] = f.data
-	mp.ptrarg[1] = e.data
-	mp.scalararg[0] = nret
-	mp.ptrarg[2] = unsafe.Pointer(fint)
-	mp.ptrarg[3] = unsafe.Pointer(ot)
-	onM(setFinalizer_m)
-	if mp.scalararg[0] != 1 {
-		gothrow("runtime.SetFinalizer: finalizer already set")
-	}
-	releasem(mp)
+	onM(func() {
+		if !addfinalizer(e.data, (*funcval)(f.data), nret, fint, ot) {
+			gothrow("runtime.SetFinalizer: finalizer already set")
+		}
+	})
 }
 
 // round n up to a multiple of a.  a must be a power of 2.
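The malloc.go hunks above all apply the same conversion idiom: arguments that
were previously marshalled through m.ptrarg/m.scalararg around a *_m
trampoline are now captured by a closure run on the M stack with onM. In
outline, using the largeAlloc hunk as the example:

	// Before: a C-callable trampoline with arguments in scratch slots.
	//	mp := acquirem()
	//	mp.scalararg[0] = uintptr(size)
	//	mp.scalararg[1] = uintptr(flags)
	//	onM(largeAlloc_m)
	//	s = (*mspan)(mp.ptrarg[0])
	//	mp.ptrarg[0] = nil
	//	releasem(mp)
	//
	// After: the closure captures size, flags, and the result directly.
	var s *mspan
	onM(func() {
		s = largeAlloc(size, uint32(flags))
	})
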
diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h
deleted file mode 100644
index adb8d3d..0000000
--- a/src/runtime/malloc.h
+++ /dev/null
@@ -1,621 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Memory allocator, based on tcmalloc.
-// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
-
-// The main allocator works in runs of pages.
-// Small allocation sizes (up to and including 32 kB) are
-// rounded to one of about 100 size classes, each of which
-// has its own free list of objects of exactly that size.
-// Any free page of memory can be split into a set of objects
-// of one size class, which are then managed using free list
-// allocators.
-//
-// The allocator's data structures are:
-//
-//	FixAlloc: a free-list allocator for fixed-size objects,
-//		used to manage storage used by the allocator.
-//	MHeap: the malloc heap, managed at page (4096-byte) granularity.
-//	MSpan: a run of pages managed by the MHeap.
-//	MCentral: a shared free list for a given size class.
-//	MCache: a per-thread (in Go, per-P) cache for small objects.
-//	MStats: allocation statistics.
-//
-// Allocating a small object proceeds up a hierarchy of caches:
-//
-//	1. Round the size up to one of the small size classes
-//	   and look in the corresponding MCache free list.
-//	   If the list is not empty, allocate an object from it.
-//	   This can all be done without acquiring a lock.
-//
-//	2. If the MCache free list is empty, replenish it by
-//	   taking a bunch of objects from the MCentral free list.
-//	   Moving a bunch amortizes the cost of acquiring the MCentral lock.
-//
-//	3. If the MCentral free list is empty, replenish it by
-//	   allocating a run of pages from the MHeap and then
-//	   chopping that memory into a objects of the given size.
-//	   Allocating many objects amortizes the cost of locking
-//	   the heap.
-//
-//	4. If the MHeap is empty or has no page runs large enough,
-//	   allocate a new group of pages (at least 1MB) from the
-//	   operating system.  Allocating a large run of pages
-//	   amortizes the cost of talking to the operating system.
-//
-// Freeing a small object proceeds up the same hierarchy:
-//
-//	1. Look up the size class for the object and add it to
-//	   the MCache free list.
-//
-//	2. If the MCache free list is too long or the MCache has
-//	   too much memory, return some to the MCentral free lists.
-//
-//	3. If all the objects in a given span have returned to
-//	   the MCentral list, return that span to the page heap.
-//
-//	4. If the heap has too much memory, return some to the
-//	   operating system.
-//
-//	TODO(rsc): Step 4 is not implemented.
-//
-// Allocating and freeing a large object uses the page heap
-// directly, bypassing the MCache and MCentral free lists.
-//
-// The small objects on the MCache and MCentral free lists
-// may or may not be zeroed.  They are zeroed if and only if
-// the second word of the object is zero.  A span in the
-// page heap is zeroed unless s->needzero is set. When a span
-// is allocated to break into small objects, it is zeroed if needed
-// and s->needzero is set. There are two main benefits to delaying the
-// zeroing this way:
-//
-//	1. stack frames allocated from the small object lists
-//	   or the page heap can avoid zeroing altogether.
-//	2. the cost of zeroing when reusing a small object is
-//	   charged to the mutator, not the garbage collector.
-//
-// This C code was written with an eye toward translating to Go
-// in the future.  Methods have the form Type_Method(Type *t, ...).
-
-typedef struct MCentral	MCentral;
-typedef struct MHeap	MHeap;
-typedef struct MSpan	MSpan;
-typedef struct MStats	MStats;
-typedef struct MLink	MLink;
-typedef struct GCStats	GCStats;
-
-enum
-{
-	PageShift	= 13,
-	PageSize	= 1<<PageShift,
-	PageMask	= PageSize - 1,
-};
-typedef	uintptr	pageID;		// address >> PageShift
-
-enum
-{
-	// Computed constant.  The definition of MaxSmallSize and the
-	// algorithm in msize.c produce some number of different allocation
-	// size classes.  NumSizeClasses is that number.  It's needed here
-	// because there are static arrays of this length; when msize runs its
-	// size choosing algorithm it double-checks that NumSizeClasses agrees.
-	NumSizeClasses = 67,
-
-	// Tunable constants.
-	MaxSmallSize = 32<<10,
-
-	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
-	TinySize = 16,
-	TinySizeClass = 2,
-
-	FixAllocChunk = 16<<10,		// Chunk size for FixAlloc
-	MaxMHeapList = 1<<(20 - PageShift),	// Maximum page length for fixed-size list in MHeap.
-	HeapAllocChunk = 1<<20,		// Chunk size for heap growth
-
-	// Per-P, per order stack segment cache size.
-	StackCacheSize = 32*1024,
-	// Number of orders that get caching.  Order 0 is FixedStack
-	// and each successive order is twice as large.
-	NumStackOrders = 3,
-
-	// Number of bits in page to span calculations (4k pages).
-	// On Windows 64-bit we limit the arena to 32GB or 35 bits (see below for reason).
-	// On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
-	// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
-#ifdef _64BIT
-#ifdef GOOS_windows
-	// Windows counts memory used by page table into committed memory
-	// of the process, so we can't reserve too much memory.
-	// See http://golang.org/issue/5402 and http://golang.org/issue/5236.
-	MHeapMap_Bits = 35 - PageShift,
-#else
-	MHeapMap_Bits = 37 - PageShift,
-#endif
-#else
-	MHeapMap_Bits = 32 - PageShift,
-#endif
-
-	// Max number of threads to run garbage collection.
-	// 2, 3, and 4 are all plausible maximums depending
-	// on the hardware details of the machine.  The garbage
-	// collector scales well to 32 cpus.
-	MaxGcproc = 32,
-};
-
-// Maximum memory allocation size, a hint for callers.
-// This must be a #define instead of an enum because it
-// is so large.
-#ifdef _64BIT
-#define	MaxMem	(1ULL<<(MHeapMap_Bits+PageShift))	/* 128 GB or 32 GB */
-#else
-#define	MaxMem	((uintptr)-1)
-#endif
-
-// A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
-struct MLink
-{
-	MLink *next;
-};
-
-// sysAlloc obtains a large chunk of zeroed memory from the
-// operating system, typically on the order of a hundred kilobytes
-// or a megabyte.
-// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
-// may use larger alignment, so the caller must be careful to realign the
-// memory obtained by sysAlloc.
-//
-// SysUnused notifies the operating system that the contents
-// of the memory region are no longer needed and can be reused
-// for other purposes.
-// SysUsed notifies the operating system that the contents
-// of the memory region are needed again.
-//
-// SysFree returns it unconditionally; this is only used if
-// an out-of-memory error has been detected midway through
-// an allocation.  It is okay if SysFree is a no-op.
-//
-// SysReserve reserves address space without allocating memory.
-// If the pointer passed to it is non-nil, the caller wants the
-// reservation there, but SysReserve can still choose another
-// location if that one is unavailable.  On some systems and in some
-// cases SysReserve will simply check that the address space is
-// available and not actually reserve it.  If SysReserve returns
-// non-nil, it sets *reserved to true if the address space is
-// reserved, false if it has merely been checked.
-// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
-// may use larger alignment, so the caller must be careful to realign the
-// memory obtained by sysAlloc.
-//
-// SysMap maps previously reserved address space for use.
-// The reserved argument is true if the address space was really
-// reserved, not merely checked.
-//
-// SysFault marks a (already sysAlloc'd) region to fault
-// if accessed.  Used only for debugging the runtime.
-
-void*	runtime·sysAlloc(uintptr nbytes, uint64 *stat);
-void	runtime·SysFree(void *v, uintptr nbytes, uint64 *stat);
-void	runtime·SysUnused(void *v, uintptr nbytes);
-void	runtime·SysUsed(void *v, uintptr nbytes);
-void	runtime·SysMap(void *v, uintptr nbytes, bool reserved, uint64 *stat);
-void*	runtime·SysReserve(void *v, uintptr nbytes, bool *reserved);
-void	runtime·SysFault(void *v, uintptr nbytes);
-
-// FixAlloc is a simple free-list allocator for fixed size objects.
-// Malloc uses a FixAlloc wrapped around sysAlloc to manages its
-// MCache and MSpan objects.
-//
-// Memory returned by FixAlloc_Alloc is not zeroed.
-// The caller is responsible for locking around FixAlloc calls.
-// Callers can keep state in the object but the first word is
-// smashed by freeing and reallocating.
-struct FixAlloc
-{
-	uintptr	size;
-	void	(*first)(void *arg, byte *p);	// called first time p is returned
-	void*	arg;
-	MLink*	list;
-	byte*	chunk;
-	uint32	nchunk;
-	uintptr	inuse;	// in-use bytes now
-	uint64*	stat;
-};
-
-void	runtime·FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat);
-void*	runtime·FixAlloc_Alloc(FixAlloc *f);
-void	runtime·FixAlloc_Free(FixAlloc *f, void *p);
-
-
-// Statistics.
-// Shared with Go: if you edit this structure, also edit type MemStats in mem.go.
-struct MStats
-{
-	// General statistics.
-	uint64	alloc;		// bytes allocated and still in use
-	uint64	total_alloc;	// bytes allocated (even if freed)
-	uint64	sys;		// bytes obtained from system (should be sum of xxx_sys below, no locking, approximate)
-	uint64	nlookup;	// number of pointer lookups
-	uint64	nmalloc;	// number of mallocs
-	uint64	nfree;  // number of frees
-
-	// Statistics about malloc heap.
-	// protected by mheap.lock
-	uint64	heap_alloc;	// bytes allocated and still in use
-	uint64	heap_sys;	// bytes obtained from system
-	uint64	heap_idle;	// bytes in idle spans
-	uint64	heap_inuse;	// bytes in non-idle spans
-	uint64	heap_released;	// bytes released to the OS
-	uint64	heap_objects;	// total number of allocated objects
-
-	// Statistics about allocation of low-level fixed-size structures.
-	// Protected by FixAlloc locks.
-	uint64	stacks_inuse;	// this number is included in heap_inuse above
-	uint64	stacks_sys;	// always 0 in mstats
-	uint64	mspan_inuse;	// MSpan structures
-	uint64	mspan_sys;
-	uint64	mcache_inuse;	// MCache structures
-	uint64	mcache_sys;
-	uint64	buckhash_sys;	// profiling bucket hash table
-	uint64	gc_sys;
-	uint64	other_sys;
-
-	// Statistics about garbage collector.
-	// Protected by mheap or stopping the world during GC.
-	uint64	next_gc;	// next GC (in heap_alloc time)
-	uint64  last_gc;	// last GC (in absolute time)
-	uint64	pause_total_ns;
-	uint64	pause_ns[256];  // circular buffer of recent GC pause lengths
-	uint64	pause_end[256]; // circular buffer of recent GC end times (nanoseconds since 1970)
-	uint32	numgc;
-	bool	enablegc;
-	bool	debuggc;
-
-	// Statistics about allocation size classes.
-	
-	struct MStatsBySize {
-		uint32 size;
-		uint64 nmalloc;
-		uint64 nfree;
-	} by_size[NumSizeClasses];
-	
-	uint64	tinyallocs;	// number of tiny allocations that didn't cause actual allocation; not exported to Go directly
-};
-
-
-#define mstats runtime·memstats
-extern MStats mstats;
-void	runtime·updatememstats(GCStats *stats);
-void	runtime·ReadMemStats(MStats *stats);
-
-// Size classes.  Computed and initialized by InitSizes.
-//
-// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
-//	1 <= sizeclass < NumSizeClasses, for n.
-//	Size class 0 is reserved to mean "not small".
-//
-// class_to_size[i] = largest size in class i
-// class_to_allocnpages[i] = number of pages to allocate when
-//	making new objects in class i
-
-int32	runtime·SizeToClass(int32);
-uintptr	runtime·roundupsize(uintptr);
-extern	int32	runtime·class_to_size[NumSizeClasses];
-extern	int32	runtime·class_to_allocnpages[NumSizeClasses];
-extern	int8	runtime·size_to_class8[1024/8 + 1];
-extern	int8	runtime·size_to_class128[(MaxSmallSize-1024)/128 + 1];
-extern	void	runtime·InitSizes(void);
-
-typedef struct MCacheList MCacheList;
-struct MCacheList
-{
-	MLink *list;
-	uint32 nlist;
-};
-
-typedef struct StackFreeList StackFreeList;
-struct StackFreeList
-{
-	MLink *list;  // linked list of free stacks
-	uintptr size; // total size of stacks in list
-};
-
-typedef struct SudoG SudoG;
-
-// Per-thread (in Go, per-P) cache for small objects.
-// No locking needed because it is per-thread (per-P).
-struct MCache
-{
-	// The following members are accessed on every malloc,
-	// so they are grouped here for better caching.
-	int32 next_sample;		// trigger heap sample after allocating this many bytes
-	intptr local_cachealloc;	// bytes allocated (or freed) from cache since last lock of heap
-	// Allocator cache for tiny objects w/o pointers.
-	// See "Tiny allocator" comment in malloc.goc.
-	byte*	tiny;
-	uintptr	tinysize;
-	uintptr	local_tinyallocs;	// number of tiny allocs not counted in other stats
-	// The rest is not accessed on every malloc.
-	MSpan*	alloc[NumSizeClasses];	// spans to allocate from
-
-	StackFreeList stackcache[NumStackOrders];
-
-	SudoG*	sudogcache;
-
-	void*	gcworkbuf;
-
-	// Local allocator stats, flushed during GC.
-	uintptr local_nlookup;		// number of pointer lookups
-	uintptr local_largefree;	// bytes freed for large objects (>MaxSmallSize)
-	uintptr local_nlargefree;	// number of frees for large objects (>MaxSmallSize)
-	uintptr local_nsmallfree[NumSizeClasses];	// number of frees for small objects (<=MaxSmallSize)
-};
-
-MSpan*	runtime·MCache_Refill(MCache *c, int32 sizeclass);
-void	runtime·MCache_ReleaseAll(MCache *c);
-void	runtime·stackcache_clear(MCache *c);
-void	runtime·gcworkbuffree(void *b);
-
-enum
-{
-	KindSpecialFinalizer = 1,
-	KindSpecialProfile = 2,
-	// Note: The finalizer special must be first because if we're freeing
-	// an object, a finalizer special will cause the freeing operation
-	// to abort, and we want to keep the other special records around
-	// if that happens.
-};
-
-typedef struct Special Special;
-struct Special
-{
-	Special*	next;	// linked list in span
-	uint16		offset;	// span offset of object
-	byte		kind;	// kind of Special
-};
-
-// The described object has a finalizer set for it.
-typedef struct SpecialFinalizer SpecialFinalizer;
-struct SpecialFinalizer
-{
-	Special		special;
-	FuncVal*	fn;
-	uintptr		nret;
-	Type*		fint;
-	PtrType*	ot;
-};
-
-// The described object is being heap profiled.
-typedef struct Bucket Bucket; // from mprof.h
-typedef struct SpecialProfile SpecialProfile;
-struct SpecialProfile
-{
-	Special	special;
-	Bucket*	b;
-};
-
-// An MSpan is a run of pages.
-enum
-{
-	MSpanInUse = 0, // allocated for garbage collected heap
-	MSpanStack,     // allocated for use by stack allocator
-	MSpanFree,
-	MSpanListHead,
-	MSpanDead,
-};
-struct MSpan
-{
-	MSpan	*next;		// in a span linked list
-	MSpan	*prev;		// in a span linked list
-	pageID	start;		// starting page number
-	uintptr	npages;		// number of pages in span
-	MLink	*freelist;	// list of free objects
-	// sweep generation:
-	// if sweepgen == h->sweepgen - 2, the span needs sweeping
-	// if sweepgen == h->sweepgen - 1, the span is currently being swept
-	// if sweepgen == h->sweepgen, the span is swept and ready to use
-	// h->sweepgen is incremented by 2 after every GC
-	uint32	sweepgen;
-	uint16	ref;		// capacity - number of objects in freelist
-	uint8	sizeclass;	// size class
-	bool	incache;	// being used by an MCache
-	uint8	state;		// MSpanInUse etc
-	uint8	needzero;	// needs to be zeroed before allocation
-	uintptr	elemsize;	// computed from sizeclass or from npages
-	int64   unusedsince;	// First time spotted by GC in MSpanFree state
-	uintptr npreleased;	// number of pages released to the OS
-	byte	*limit;		// end of data in span
-	Mutex	specialLock;	// guards specials list
-	Special	*specials;	// linked list of special records sorted by offset.
-};
-
-void	runtime·MSpan_Init(MSpan *span, pageID start, uintptr npages);
-void	runtime·MSpan_EnsureSwept(MSpan *span);
-bool	runtime·MSpan_Sweep(MSpan *span, bool preserve);
-
-// Every MSpan is in one doubly-linked list,
-// either one of the MHeap's free lists or one of the
-// MCentral's span lists.  We use empty MSpan structures as list heads.
-void	runtime·MSpanList_Init(MSpan *list);
-bool	runtime·MSpanList_IsEmpty(MSpan *list);
-void	runtime·MSpanList_Insert(MSpan *list, MSpan *span);
-void	runtime·MSpanList_InsertBack(MSpan *list, MSpan *span);
-void	runtime·MSpanList_Remove(MSpan *span);	// from whatever list it is in
-
-
-// Central list of free objects of a given size.
-struct MCentral
-{
-	Mutex  lock;
-	int32 sizeclass;
-	MSpan nonempty;	// list of spans with a free object
-	MSpan empty;	// list of spans with no free objects (or cached in an MCache)
-};
-
-void	runtime·MCentral_Init(MCentral *c, int32 sizeclass);
-MSpan*	runtime·MCentral_CacheSpan(MCentral *c);
-void	runtime·MCentral_UncacheSpan(MCentral *c, MSpan *s);
-bool	runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end, bool preserve);
-
-// Main malloc heap.
-// The heap itself is the "free[]" and "large" arrays,
-// but all the other global data is here too.
-struct MHeap
-{
-	Mutex  lock;
-	MSpan free[MaxMHeapList];	// free lists of given length
-	MSpan freelarge;		// free lists length >= MaxMHeapList
-	MSpan busy[MaxMHeapList];	// busy lists of large objects of given length
-	MSpan busylarge;		// busy lists of large objects length >= MaxMHeapList
-	MSpan **allspans;		// all spans out there
-	MSpan **gcspans;		// copy of allspans referenced by GC marker or sweeper
-	uint32	nspan;
-	uint32	nspancap;
-	uint32	sweepgen;		// sweep generation, see comment in MSpan
-	uint32	sweepdone;		// all spans are swept
-
-	// span lookup
-	MSpan**	spans;
-	uintptr	spans_mapped;
-
-	// range of addresses we might see in the heap
-	byte *bitmap;
-	uintptr bitmap_mapped;
-	byte *arena_start;
-	byte *arena_used;
-	byte *arena_end;
-	bool arena_reserved;
-
-	// central free lists for small size classes.
-	// the padding makes sure that the MCentrals are
-	// spaced CacheLineSize bytes apart, so that each MCentral.lock
-	// gets its own cache line.
-	struct MHeapCentral {
-		MCentral mcentral;
-		byte pad[CacheLineSize];
-	} central[NumSizeClasses];
-
-	FixAlloc spanalloc;	// allocator for Span*
-	FixAlloc cachealloc;	// allocator for MCache*
-	FixAlloc specialfinalizeralloc;	// allocator for SpecialFinalizer*
-	FixAlloc specialprofilealloc;	// allocator for SpecialProfile*
-	Mutex speciallock; // lock for sepcial record allocators.
-
-	// Malloc stats.
-	uint64 largefree;	// bytes freed for large objects (>MaxSmallSize)
-	uint64 nlargefree;	// number of frees for large objects (>MaxSmallSize)
-	uint64 nsmallfree[NumSizeClasses];	// number of frees for small objects (<=MaxSmallSize)
-};
-#define runtime·mheap runtime·mheap_
-extern MHeap runtime·mheap;
-
-void	runtime·MHeap_Init(MHeap *h);
-MSpan*	runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero);
-MSpan*	runtime·MHeap_AllocStack(MHeap *h, uintptr npage);
-void	runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct);
-void	runtime·MHeap_FreeStack(MHeap *h, MSpan *s);
-MSpan*	runtime·MHeap_Lookup(MHeap *h, void *v);
-MSpan*	runtime·MHeap_LookupMaybe(MHeap *h, void *v);
-void*	runtime·MHeap_SysAlloc(MHeap *h, uintptr n);
-void	runtime·MHeap_MapBits(MHeap *h);
-void	runtime·MHeap_MapSpans(MHeap *h);
-void	runtime·MHeap_Scavenge(int32 k, uint64 now, uint64 limit);
-
-void*	runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
-int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
-uintptr	runtime·sweepone(void);
-void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
-void	runtime·unmarkspan(void *v, uintptr size);
-void	runtime·purgecachedstats(MCache*);
-void	runtime·tracealloc(void*, uintptr, Type*);
-void	runtime·tracefree(void*, uintptr);
-void	runtime·tracegc(void);
-
-int32	runtime·gcpercent;
-int32	runtime·readgogc(void);
-void	runtime·clearpools(void);
-
-enum
-{
-	// flags to malloc
-	FlagNoScan	= 1<<0,	// GC doesn't have to scan object
-	FlagNoZero	= 1<<1, // don't zero memory
-};
-
-void	runtime·mProf_Malloc(void*, uintptr);
-void	runtime·mProf_Free(Bucket*, uintptr, bool);
-void	runtime·mProf_GC(void);
-void	runtime·iterate_memprof(void (**callback)(Bucket*, uintptr, uintptr*, uintptr, uintptr, uintptr));
-int32	runtime·gcprocs(void);
-void	runtime·helpgc(int32 nproc);
-void	runtime·gchelper(void);
-void	runtime·createfing(void);
-G*	runtime·wakefing(void);
-void	runtime·getgcmask(byte*, Type*, byte**, uintptr*);
-
-// NOTE: Layout known to queuefinalizer.
-typedef struct Finalizer Finalizer;
-struct Finalizer
-{
-	FuncVal *fn;	// function to call
-	void *arg;	// ptr to object
-	uintptr nret;	// bytes of return values from fn
-	Type *fint;	// type of first argument of fn
-	PtrType *ot;	// type of ptr to object
-};
-
-typedef struct FinBlock FinBlock;
-struct FinBlock
-{
-	FinBlock *alllink;
-	FinBlock *next;
-	int32 cnt;
-	int32 cap;
-	Finalizer fin[1];
-};
-extern Mutex	runtime·finlock;	// protects the following variables
-extern G*	runtime·fing;
-extern bool	runtime·fingwait;
-extern bool	runtime·fingwake;
-extern FinBlock	*runtime·finq;		// list of finalizers that are to be executed
-extern FinBlock	*runtime·finc;		// cache of free blocks
-
-void	runtime·setprofilebucket_m(void);
-
-bool	runtime·addfinalizer(void*, FuncVal *fn, uintptr, Type*, PtrType*);
-void	runtime·removefinalizer(void*);
-void	runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot);
-bool	runtime·freespecial(Special *s, void *p, uintptr size, bool freed);
-
-// Information from the compiler about the layout of stack frames.
-struct BitVector
-{
-	int32 n; // # of bits
-	uint8 *bytedata;
-};
-typedef struct StackMap StackMap;
-struct StackMap
-{
-	int32 n; // number of bitmaps
-	int32 nbit; // number of bits in each bitmap
-	uint8 bytedata[]; // bitmaps, each starting on a 32-bit boundary
-};
-// Returns pointer map data for the given stackmap index
-// (the index is encoded in PCDATA_StackMapIndex).
-BitVector	runtime·stackmapdata(StackMap *stackmap, int32 n);
-
-extern	BitVector	runtime·gcdatamask;
-extern	BitVector	runtime·gcbssmask;
-
-// defined in mgc0.go
-void	runtime·gc_m_ptr(Eface*);
-void	runtime·gc_g_ptr(Eface*);
-void	runtime·gc_itab_ptr(Eface*);
-
-void  runtime·setgcpercent_m(void);
-
-// Value we use to mark dead pointers when GODEBUG=gcdead=1.
-#define PoisonGC ((uintptr)0xf969696969696969ULL)
-#define PoisonStack ((uintptr)0x6868686868686868ULL)
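
The sweepgen comment on MSpan above (carried over unchanged to the Go mspan in malloc2.go below) packs three sweep states into a comparison against the heap's generation counter, which advances by 2 after every GC. A small stand-alone decoder of that encoding; spanSweepState is illustrative only:

	package main

	import "fmt"

	// spanSweepState interprets a span's sweepgen relative to the heap's,
	// following the comment on MSpan/mspan.
	func spanSweepState(spanGen, heapGen uint32) string {
		switch spanGen {
		case heapGen - 2:
			return "needs sweeping"
		case heapGen - 1:
			return "being swept"
		case heapGen:
			return "swept and ready"
		}
		return "invalid"
	}

	func main() {
		const heapGen = 6
		for _, g := range []uint32{4, 5, 6} {
			fmt.Println("span sweepgen", g, "->", spanSweepState(g, heapGen))
		}
	}
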
diff --git a/src/runtime/malloc1.go b/src/runtime/malloc1.go
new file mode 100644
index 0000000..db02d9c
--- /dev/null
+++ b/src/runtime/malloc1.go
@@ -0,0 +1,318 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// See malloc.h for overview.
+//
+// TODO(rsc): double-check stats.
+
+package runtime
+
+import "unsafe"
+
+const _MaxArena32 = 2 << 30
+
+// For use by Go. If it were a C enum it would be made available automatically,
+// but the value of MaxMem is too large for enum.
+// XXX - uintptr runtime·maxmem = MaxMem;
+
+func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
+	_g_ := getg()
+
+	_g_.m.mcache.local_nlookup++
+	if ptrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
+		// purge cache stats to prevent overflow
+		lock(&mheap_.lock)
+		purgecachedstats(_g_.m.mcache)
+		unlock(&mheap_.lock)
+	}
+
+	s := mHeap_LookupMaybe(&mheap_, unsafe.Pointer(v))
+	if sp != nil {
+		*sp = s
+	}
+	if s == nil {
+		if base != nil {
+			*base = 0
+		}
+		if size != nil {
+			*size = 0
+		}
+		return 0
+	}
+
+	p := uintptr(s.start) << _PageShift
+	if s.sizeclass == 0 {
+		// Large object.
+		if base != nil {
+			*base = p
+		}
+		if size != nil {
+			*size = s.npages << _PageShift
+		}
+		return 1
+	}
+
+	n := s.elemsize
+	if base != nil {
+		i := (uintptr(v) - uintptr(p)) / n
+		*base = p + i*n
+	}
+	if size != nil {
+		*size = n
+	}
+
+	return 1
+}
+
+//go:nosplit
+func purgecachedstats(c *mcache) {
+	// Protected by either heap or GC lock.
+	h := &mheap_
+	memstats.heap_alloc += uint64(c.local_cachealloc)
+	c.local_cachealloc = 0
+	memstats.tinyallocs += uint64(c.local_tinyallocs)
+	c.local_tinyallocs = 0
+	memstats.nlookup += uint64(c.local_nlookup)
+	c.local_nlookup = 0
+	h.largefree += uint64(c.local_largefree)
+	c.local_largefree = 0
+	h.nlargefree += uint64(c.local_nlargefree)
+	c.local_nlargefree = 0
+	for i := 0; i < len(c.local_nsmallfree); i++ {
+		h.nsmallfree[i] += uint64(c.local_nsmallfree[i])
+		c.local_nsmallfree[i] = 0
+	}
+}
+
+func mallocinit() {
+	initSizes()
+
+	if class_to_size[_TinySizeClass] != _TinySize {
+		gothrow("bad TinySizeClass")
+	}
+
+	var p, arena_size, bitmap_size, spans_size, p_size, limit uintptr
+	var reserved bool
+
+	// limit = runtime.memlimit();
+	// See https://code.google.com/p/go/issues/detail?id=5049
+	// TODO(rsc): Fix after 1.1.
+	limit = 0
+
+	// Set up the allocation arena, a contiguous area of memory where
+	// allocated data will be found.  The arena begins with a bitmap large
+	// enough to hold 4 bits per allocated word.
+	if ptrSize == 8 && (limit == 0 || limit > 1<<30) {
+		// On a 64-bit machine, allocate from a single contiguous reservation.
+		// 128 GB (MaxMem) should be big enough for now.
+		//
+		// The code will work with the reservation at any address, but ask
+		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
+		// Allocating a 128 GB region takes away 37 bits, and the amd64
+		// doesn't let us choose the top 17 bits, so that leaves the 11 bits
+		// in the middle of 0x00c0 for us to choose.  Choosing 0x00c0 means
+		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
+		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
+		// UTF-8 sequences, and they are otherwise as far away from
+		// ff (likely a common byte) as possible.  If that fails, we try other 0xXXc0
+		// addresses.  An earlier attempt to use 0x11f8 caused out of memory errors
+		// on OS X during thread allocations.  0x00c0 causes conflicts with
+		// AddressSanitizer which reserves all memory up to 0x0100.
+		// These choices are both for debuggability and to reduce the
+		// odds of the conservative garbage collector not collecting memory
+		// because some non-pointer block of memory had a bit pattern
+		// that matched a memory address.
+		//
+		// Actually we reserve 136 GB (because the bitmap ends up being 8 GB)
+		// but it hardly matters: e0 00 is not valid UTF-8 either.
+		//
+		// If this fails we fall back to the 32 bit memory mechanism
+		arena_size = round(_MaxMem, _PageSize)
+		bitmap_size = arena_size / (ptrSize * 8 / 4)
+		spans_size = arena_size / _PageSize * ptrSize
+		spans_size = round(spans_size, _PageSize)
+		for i := 0; i <= 0x7f; i++ {
+			p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
+			p_size = bitmap_size + spans_size + arena_size + _PageSize
+			p = uintptr(sysReserve(unsafe.Pointer(p), p_size, &reserved))
+			if p != 0 {
+				break
+			}
+		}
+	}
+
+	if p == 0 {
+		// On a 32-bit machine, we can't typically get away
+		// with a giant virtual address space reservation.
+		// Instead we map the memory information bitmap
+		// immediately after the data segment, large enough
+		// to handle another 2GB of mappings (256 MB),
+		// along with a reservation for another 512 MB of memory.
+		// When that gets used up, we'll start asking the kernel
+		// for any memory anywhere and hope it's in the 2GB
+		// following the bitmap (presumably the executable begins
+		// near the bottom of memory, so we'll have to use up
+		// most of memory before the kernel resorts to giving out
+		// memory before the beginning of the text segment).
+		//
+		// Alternatively we could reserve 512 MB bitmap, enough
+		// for 4GB of mappings, and then accept any memory the
+		// kernel threw at us, but normally that's a waste of 512 MB
+		// of address space, which is probably too much in a 32-bit world.
+		bitmap_size = _MaxArena32 / (ptrSize * 8 / 4)
+		arena_size = 512 << 20
+		spans_size = _MaxArena32 / _PageSize * ptrSize
+		if limit > 0 && arena_size+bitmap_size+spans_size > limit {
+			bitmap_size = (limit / 9) &^ ((1 << _PageShift) - 1)
+			arena_size = bitmap_size * 8
+			spans_size = arena_size / _PageSize * ptrSize
+		}
+		spans_size = round(spans_size, _PageSize)
+
+		// SysReserve treats the address we ask for, end, as a hint,
+		// not as an absolute requirement.  If we ask for the end
+		// of the data segment but the operating system requires
+		// a little more space before we can start allocating, it will
+		// give out a slightly higher pointer.  Except QEMU, which
+		// is buggy, as usual: it won't adjust the pointer upward.
+		// So adjust it upward a little bit ourselves: 1/4 MB to get
+		// away from the running binary image and then round up
+		// to a MB boundary.
+		p = round(uintptr(unsafe.Pointer(&end))+(1<<18), 1<<20)
+		p_size = bitmap_size + spans_size + arena_size + _PageSize
+		p = uintptr(sysReserve(unsafe.Pointer(p), p_size, &reserved))
+		if p == 0 {
+			gothrow("runtime: cannot reserve arena virtual address space")
+		}
+	}
+
+	// PageSize can be larger than OS definition of page size,
+	// so SysReserve can give us a PageSize-unaligned pointer.
+	// To overcome this we ask for PageSize more and round up the pointer.
+	p1 := round(p, _PageSize)
+
+	mheap_.spans = (**mspan)(unsafe.Pointer(p1))
+	mheap_.bitmap = p1 + spans_size
+	mheap_.arena_start = p1 + (spans_size + bitmap_size)
+	mheap_.arena_used = mheap_.arena_start
+	mheap_.arena_end = p + p_size
+	mheap_.arena_reserved = reserved
+
+	if mheap_.arena_start&(_PageSize-1) != 0 {
+		println("bad pagesize", hex(p), hex(p1), hex(spans_size), hex(bitmap_size), hex(_PageSize), "start", hex(mheap_.arena_start))
+		gothrow("misrounded allocation in mallocinit")
+	}
+
+	// Initialize the rest of the allocator.
+	mHeap_Init(&mheap_, spans_size)
+	_g_ := getg()
+	_g_.m.mcache = allocmcache()
+}
+
+func mHeap_SysAlloc(h *mheap, n uintptr) unsafe.Pointer {
+	if n > uintptr(h.arena_end)-uintptr(h.arena_used) {
+		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
+		// Reserve some more space.
+		p_size := round(n+_PageSize, 256<<20)
+		new_end := h.arena_end + p_size
+		if new_end <= h.arena_start+_MaxArena32 {
+			// TODO: It would be bad if part of the arena
+			// is reserved and part is not.
+			var reserved bool
+			p := uintptr(sysReserve((unsafe.Pointer)(h.arena_end), p_size, &reserved))
+			if p == h.arena_end {
+				h.arena_end = new_end
+				h.arena_reserved = reserved
+			} else if p+p_size <= h.arena_start+_MaxArena32 {
+				// Keep everything page-aligned.
+				// Our pages are bigger than hardware pages.
+				h.arena_end = p + p_size
+				h.arena_used = p + (-uintptr(p) & (_PageSize - 1))
+				h.arena_reserved = reserved
+			} else {
+				var stat uint64
+				sysFree((unsafe.Pointer)(p), p_size, &stat)
+			}
+		}
+	}
+
+	if n <= uintptr(h.arena_end)-uintptr(h.arena_used) {
+		// Keep taking from our reservation.
+		p := h.arena_used
+		sysMap((unsafe.Pointer)(p), n, h.arena_reserved, &memstats.heap_sys)
+		h.arena_used += n
+		mHeap_MapBits(h)
+		mHeap_MapSpans(h)
+		if raceenabled {
+			racemapshadow((unsafe.Pointer)(p), n)
+		}
+
+		if uintptr(p)&(_PageSize-1) != 0 {
+			gothrow("misrounded allocation in MHeap_SysAlloc")
+		}
+		return (unsafe.Pointer)(p)
+	}
+
+	// If using 64-bit, our reservation is all we have.
+	if uintptr(h.arena_end)-uintptr(h.arena_start) >= _MaxArena32 {
+		return nil
+	}
+
+	// On 32-bit, once the reservation is gone we can
+	// try to get memory at a location chosen by the OS
+	// and hope that it is in the range we allocated bitmap for.
+	p_size := round(n, _PageSize) + _PageSize
+	p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
+	if p == 0 {
+		return nil
+	}
+
+	if p < h.arena_start || uintptr(p)+p_size-uintptr(h.arena_start) >= _MaxArena32 {
+		print("runtime: memory allocated by OS (", p, ") not in usable range [", hex(h.arena_start), ",", hex(h.arena_start+_MaxArena32), ")\n")
+		sysFree((unsafe.Pointer)(p), p_size, &memstats.heap_sys)
+		return nil
+	}
+
+	p_end := p + p_size
+	p += -p & (_PageSize - 1)
+	if uintptr(p)+n > uintptr(h.arena_used) {
+		h.arena_used = p + n
+		if p_end > h.arena_end {
+			h.arena_end = p_end
+		}
+		mHeap_MapBits(h)
+		mHeap_MapSpans(h)
+		if raceenabled {
+			racemapshadow((unsafe.Pointer)(p), n)
+		}
+	}
+
+	if uintptr(p)&(_PageSize-1) != 0 {
+		gothrow("misrounded allocation in MHeap_SysAlloc")
+	}
+	return (unsafe.Pointer)(p)
+}
+
+var end struct{}
+
+func largeAlloc(size uintptr, flag uint32) *mspan {
+	// print("largeAlloc size=", size, "\n")
+
+	if size+_PageSize < size {
+		gothrow("out of memory")
+	}
+	npages := size >> _PageShift
+	if size&_PageMask != 0 {
+		npages++
+	}
+	s := mHeap_Alloc(&mheap_, npages, 0, true, flag&_FlagNoZero == 0)
+	if s == nil {
+		gothrow("out of memory")
+	}
+	s.limit = uintptr(s.start)<<_PageShift + size
+	v := unsafe.Pointer(uintptr(s.start) << _PageShift)
+	// setup for mark sweep
+	markspan(v, 0, 0, true)
+	return s
+}
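
mallocinit above lays out three adjacent regions on 64-bit systems: the span map, the GC bitmap (4 bits per heap word), and the arena itself. A stand-alone recomputation of those sizes for the 64-bit, non-Windows constants in this CL; it reproduces the 128 GB arena, 8 GB bitmap, and the roughly 136 GB total reservation mentioned in the comments:

	package main

	import "fmt"

	const (
		pageShift = 13
		pageSize  = 1 << pageShift
		ptrSize   = 8                  // assumes a 64-bit build
		maxMem    = uintptr(1)<<37 - 1 // _MaxMem for 64-bit non-Windows
	)

	func round(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

	func main() {
		arenaSize := round(maxMem, pageSize)
		bitmapSize := arenaSize / (ptrSize * 8 / 4) // 4 bitmap bits per heap word
		spansSize := round(arenaSize/pageSize*ptrSize, pageSize)

		fmt.Println("arena :", arenaSize>>30, "GB")
		fmt.Println("bitmap:", bitmapSize>>30, "GB")
		fmt.Println("spans :", spansSize>>20, "MB")
		// The reservation is arena+bitmap+spans plus one page: the
		// "136 GB" figure from the mallocinit comment.
	}
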
diff --git a/src/runtime/malloc2.go b/src/runtime/malloc2.go
new file mode 100644
index 0000000..e4bd963
--- /dev/null
+++ b/src/runtime/malloc2.go
@@ -0,0 +1,475 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// Memory allocator, based on tcmalloc.
+// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+
+// The main allocator works in runs of pages.
+// Small allocation sizes (up to and including 32 kB) are
+// rounded to one of about 100 size classes, each of which
+// has its own free list of objects of exactly that size.
+// Any free page of memory can be split into a set of objects
+// of one size class, which are then managed using free list
+// allocators.
+//
+// The allocator's data structures are:
+//
+//	FixAlloc: a free-list allocator for fixed-size objects,
+//		used to manage storage used by the allocator.
+//	MHeap: the malloc heap, managed at page (4096-byte) granularity.
+//	MSpan: a run of pages managed by the MHeap.
+//	MCentral: a shared free list for a given size class.
+//	MCache: a per-thread (in Go, per-P) cache for small objects.
+//	MStats: allocation statistics.
+//
+// Allocating a small object proceeds up a hierarchy of caches:
+//
+//	1. Round the size up to one of the small size classes
+//	   and look in the corresponding MCache free list.
+//	   If the list is not empty, allocate an object from it.
+//	   This can all be done without acquiring a lock.
+//
+//	2. If the MCache free list is empty, replenish it by
+//	   taking a bunch of objects from the MCentral free list.
+//	   Moving a bunch amortizes the cost of acquiring the MCentral lock.
+//
+//	3. If the MCentral free list is empty, replenish it by
+//	   allocating a run of pages from the MHeap and then
+//	   chopping that memory into objects of the given size.
+//	   Allocating many objects amortizes the cost of locking
+//	   the heap.
+//
+//	4. If the MHeap is empty or has no page runs large enough,
+//	   allocate a new group of pages (at least 1MB) from the
+//	   operating system.  Allocating a large run of pages
+//	   amortizes the cost of talking to the operating system.
+//
+// Freeing a small object proceeds up the same hierarchy:
+//
+//	1. Look up the size class for the object and add it to
+//	   the MCache free list.
+//
+//	2. If the MCache free list is too long or the MCache has
+//	   too much memory, return some to the MCentral free lists.
+//
+//	3. If all the objects in a given span have returned to
+//	   the MCentral list, return that span to the page heap.
+//
+//	4. If the heap has too much memory, return some to the
+//	   operating system.
+//
+//	TODO(rsc): Step 4 is not implemented.
+//
+// Allocating and freeing a large object uses the page heap
+// directly, bypassing the MCache and MCentral free lists.
+//
+// The small objects on the MCache and MCentral free lists
+// may or may not be zeroed.  They are zeroed if and only if
+// the second word of the object is zero.  A span in the
+// page heap is zeroed unless s->needzero is set. When a span
+// is allocated to break into small objects, it is zeroed if needed
+// and s->needzero is set. There are two main benefits to delaying the
+// zeroing this way:
+//
+//	1. stack frames allocated from the small object lists
+//	   or the page heap can avoid zeroing altogether.
+//	2. the cost of zeroing when reusing a small object is
+//	   charged to the mutator, not the garbage collector.
+//
+// This C code was written with an eye toward translating to Go
+// in the future.  Methods have the form Type_Method(Type *t, ...).
+
+const (
+	_PageShift = 13
+	_PageSize  = 1 << _PageShift
+	_PageMask  = _PageSize - 1
+)
+
+const (
+	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
+	_64bit = 1 << (^uintptr(0) >> 63) / 2
+
+	// Computed constant.  The definition of MaxSmallSize and the
+	// algorithm in msize.c produce some number of different allocation
+	// size classes.  NumSizeClasses is that number.  It's needed here
+	// because there are static arrays of this length; when msize runs its
+	// size choosing algorithm it double-checks that NumSizeClasses agrees.
+	_NumSizeClasses = 67
+
+	// Tunable constants.
+	_MaxSmallSize = 32 << 10
+
+	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
+	_TinySize      = 16
+	_TinySizeClass = 2
+
+	_FixAllocChunk  = 16 << 10               // Chunk size for FixAlloc
+	_MaxMHeapList   = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
+	_HeapAllocChunk = 1 << 20                // Chunk size for heap growth
+
+	// Per-P, per order stack segment cache size.
+	_StackCacheSize = 32 * 1024
+
+	// Number of orders that get caching.  Order 0 is FixedStack
+	// and each successive order is twice as large.
+	_NumStackOrders = 3
+
+	// Number of bits in page to span calculations (4k pages).
+	// On Windows 64-bit we limit the arena to 32GB or 35 bits.
+	// Windows counts memory used by page table into committed memory
+	// of the process, so we can't reserve too much memory.
+	// See http://golang.org/issue/5402 and http://golang.org/issue/5236.
+	// On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
+	// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
+	_MHeapMap_TotalBits = (_64bit*_Windows)*35 + (_64bit*(1-_Windows))*37 + (1-_64bit)*32
+	_MHeapMap_Bits      = _MHeapMap_TotalBits - _PageShift
+
+	_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+
+	// Max number of threads to run garbage collection.
+	// 2, 3, and 4 are all plausible maximums depending
+	// on the hardware details of the machine.  The garbage
+	// collector scales well to 32 cpus.
+	_MaxGcproc = 32
+)
+
+// A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
+type mlink struct {
+	next *mlink
+}
+
+// sysAlloc obtains a large chunk of zeroed memory from the
+// operating system, typically on the order of a hundred kilobytes
+// or a megabyte.
+// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysUnused notifies the operating system that the contents
+// of the memory region are no longer needed and can be reused
+// for other purposes.
+// SysUsed notifies the operating system that the contents
+// of the memory region are needed again.
+//
+// SysFree returns it unconditionally; this is only used if
+// an out-of-memory error has been detected midway through
+// an allocation.  It is okay if SysFree is a no-op.
+//
+// SysReserve reserves address space without allocating memory.
+// If the pointer passed to it is non-nil, the caller wants the
+// reservation there, but SysReserve can still choose another
+// location if that one is unavailable.  On some systems and in some
+// cases SysReserve will simply check that the address space is
+// available and not actually reserve it.  If SysReserve returns
+// non-nil, it sets *reserved to true if the address space is
+// reserved, false if it has merely been checked.
+// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysMap maps previously reserved address space for use.
+// The reserved argument is true if the address space was really
+// reserved, not merely checked.
+//
+// SysFault marks an (already sysAlloc'd) region to fault
+// if accessed.  Used only for debugging the runtime.
+
+// FixAlloc is a simple free-list allocator for fixed size objects.
+// Malloc uses a FixAlloc wrapped around sysAlloc to manage its
+// MCache and MSpan objects.
+//
+// Memory returned by FixAlloc_Alloc is not zeroed.
+// The caller is responsible for locking around FixAlloc calls.
+// Callers can keep state in the object but the first word is
+// smashed by freeing and reallocating.
+type fixalloc struct {
+	size   uintptr
+	first  unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned
+	arg    unsafe.Pointer
+	list   *mlink
+	chunk  *byte
+	nchunk uint32
+	inuse  uintptr // in-use bytes now
+	stat   *uint64
+}
+
+// Statistics.
+// Shared with Go: if you edit this structure, also edit type MemStats in mem.go.
+type mstats struct {
+	// General statistics.
+	alloc       uint64 // bytes allocated and still in use
+	total_alloc uint64 // bytes allocated (even if freed)
+	sys         uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate)
+	nlookup     uint64 // number of pointer lookups
+	nmalloc     uint64 // number of mallocs
+	nfree       uint64 // number of frees
+
+	// Statistics about malloc heap.
+	// protected by mheap.lock
+	heap_alloc    uint64 // bytes allocated and still in use
+	heap_sys      uint64 // bytes obtained from system
+	heap_idle     uint64 // bytes in idle spans
+	heap_inuse    uint64 // bytes in non-idle spans
+	heap_released uint64 // bytes released to the os
+	heap_objects  uint64 // total number of allocated objects
+
+	// Statistics about allocation of low-level fixed-size structures.
+	// Protected by FixAlloc locks.
+	stacks_inuse uint64 // this number is included in heap_inuse above
+	stacks_sys   uint64 // always 0 in mstats
+	mspan_inuse  uint64 // mspan structures
+	mspan_sys    uint64
+	mcache_inuse uint64 // mcache structures
+	mcache_sys   uint64
+	buckhash_sys uint64 // profiling bucket hash table
+	gc_sys       uint64
+	other_sys    uint64
+
+	// Statistics about garbage collector.
+	// Protected by mheap or stopping the world during GC.
+	next_gc        uint64 // next gc (in heap_alloc time)
+	last_gc        uint64 // last gc (in absolute time)
+	pause_total_ns uint64
+	pause_ns       [256]uint64 // circular buffer of recent gc pause lengths
+	pause_end      [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
+	numgc          uint32
+	enablegc       bool
+	debuggc        bool
+
+	// Statistics about allocation size classes.
+
+	by_size [_NumSizeClasses]struct {
+		size    uint32
+		nmalloc uint64
+		nfree   uint64
+	}
+
+	tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+}
+
+var memstats mstats
+
+// Size classes.  Computed and initialized by InitSizes.
+//
+// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
+//	1 <= sizeclass < NumSizeClasses, for n.
+//	Size class 0 is reserved to mean "not small".
+//
+// class_to_size[i] = largest size in class i
+// class_to_allocnpages[i] = number of pages to allocate when
+//	making new objects in class i
+
+var class_to_size [_NumSizeClasses]int32
+var class_to_allocnpages [_NumSizeClasses]int32
+var size_to_class8 [1024/8 + 1]int8
+var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
+
+type mcachelist struct {
+	list  *mlink
+	nlist uint32
+}
+
+type stackfreelist struct {
+	list *mlink  // linked list of free stacks
+	size uintptr // total size of stacks in list
+}
+
+// Per-thread (in Go, per-P) cache for small objects.
+// No locking needed because it is per-thread (per-P).
+type mcache struct {
+	// The following members are accessed on every malloc,
+	// so they are grouped here for better caching.
+	next_sample      int32  // trigger heap sample after allocating this many bytes
+	local_cachealloc intptr // bytes allocated (or freed) from cache since last lock of heap
+	// Allocator cache for tiny objects w/o pointers.
+	// See "Tiny allocator" comment in malloc.goc.
+	tiny             *byte
+	tinysize         uintptr
+	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
+
+	// The rest is not accessed on every malloc.
+	alloc [_NumSizeClasses]*mspan // spans to allocate from
+
+	stackcache [_NumStackOrders]stackfreelist
+
+	sudogcache *sudog
+
+	gcworkbuf unsafe.Pointer
+
+	// Local allocator stats, flushed during GC.
+	local_nlookup    uintptr                  // number of pointer lookups
+	local_largefree  uintptr                  // bytes freed for large objects (>maxsmallsize)
+	local_nlargefree uintptr                  // number of frees for large objects (>maxsmallsize)
+	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+}
+
+const (
+	_KindSpecialFinalizer = 1
+	_KindSpecialProfile   = 2
+	// Note: The finalizer special must be first because if we're freeing
+	// an object, a finalizer special will cause the freeing operation
+	// to abort, and we want to keep the other special records around
+	// if that happens.
+)
+
+type special struct {
+	next   *special // linked list in span
+	offset uint16   // span offset of object
+	kind   byte     // kind of special
+}
+
+// The described object has a finalizer set for it.
+type specialfinalizer struct {
+	special special
+	fn      *funcval
+	nret    uintptr
+	fint    *_type
+	ot      *ptrtype
+}
+
+// The described object is being heap profiled.
+type specialprofile struct {
+	special special
+	b       *bucket
+}
+
+// An MSpan is a run of pages.
+const (
+	_MSpanInUse = iota // allocated for garbage collected heap
+	_MSpanStack        // allocated for use by stack allocator
+	_MSpanFree
+	_MSpanListHead
+	_MSpanDead
+)
+
+type mspan struct {
+	next     *mspan  // in a span linked list
+	prev     *mspan  // in a span linked list
+	start    pageID  // starting page number
+	npages   uintptr // number of pages in span
+	freelist *mlink  // list of free objects
+	// sweep generation:
+	// if sweepgen == h->sweepgen - 2, the span needs sweeping
+	// if sweepgen == h->sweepgen - 1, the span is currently being swept
+	// if sweepgen == h->sweepgen, the span is swept and ready to use
+	// h->sweepgen is incremented by 2 after every GC
+	sweepgen    uint32
+	ref         uint16   // capacity - number of objects in freelist
+	sizeclass   uint8    // size class
+	incache     bool     // being used by an mcache
+	state       uint8    // mspaninuse etc
+	needzero    uint8    // needs to be zeroed before allocation
+	elemsize    uintptr  // computed from sizeclass or from npages
+	unusedsince int64    // first time spotted by gc in mspanfree state
+	npreleased  uintptr  // number of pages released to the os
+	limit       uintptr  // end of data in span
+	speciallock mutex    // guards specials list
+	specials    *special // linked list of special records sorted by offset.
+}
+
+// Every MSpan is in one doubly-linked list,
+// either one of the MHeap's free lists or one of the
+// MCentral's span lists.  We use empty MSpan structures as list heads.
+
+// Central list of free objects of a given size.
+type mcentral struct {
+	lock      mutex
+	sizeclass int32
+	nonempty  mspan // list of spans with a free object
+	empty     mspan // list of spans with no free objects (or cached in an mcache)
+}
+
+// Main malloc heap.
+// The heap itself is the "free[]" and "large" arrays,
+// but all the other global data is here too.
+type mheap struct {
+	lock      mutex
+	free      [_MaxMHeapList]mspan // free lists of given length
+	freelarge mspan                // free lists length >= _MaxMHeapList
+	busy      [_MaxMHeapList]mspan // busy lists of large objects of given length
+	busylarge mspan                // busy lists of large objects length >= _MaxMHeapList
+	allspans  **mspan              // all spans out there
+	gcspans   **mspan              // copy of allspans referenced by gc marker or sweeper
+	nspan     uint32
+	sweepgen  uint32 // sweep generation, see comment in mspan
+	sweepdone uint32 // all spans are swept
+
+	// span lookup
+	spans        **mspan
+	spans_mapped uintptr
+
+	// range of addresses we might see in the heap
+	bitmap         uintptr
+	bitmap_mapped  uintptr
+	arena_start    uintptr
+	arena_used     uintptr
+	arena_end      uintptr
+	arena_reserved bool
+
+	// central free lists for small size classes.
+	// the padding makes sure that the MCentrals are
+	// spaced CacheLineSize bytes apart, so that each MCentral.lock
+	// gets its own cache line.
+	central [_NumSizeClasses]struct {
+		mcentral mcentral
+		pad      [_CacheLineSize]byte
+	}
+
+	spanalloc             fixalloc // allocator for span*
+	cachealloc            fixalloc // allocator for mcache*
+	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
+	specialprofilealloc   fixalloc // allocator for specialprofile*
+	speciallock           mutex    // lock for special record allocators.
+
+	// Malloc stats.
+	largefree  uint64                  // bytes freed for large objects (>maxsmallsize)
+	nlargefree uint64                  // number of frees for large objects (>maxsmallsize)
+	nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+}
+
+var mheap_ mheap
+
+const (
+	// flags to malloc
+	_FlagNoScan = 1 << 0 // GC doesn't have to scan object
+	_FlagNoZero = 1 << 1 // don't zero memory
+)
+
+// NOTE: Layout known to queuefinalizer.
+type finalizer struct {
+	fn   *funcval       // function to call
+	arg  unsafe.Pointer // ptr to object
+	nret uintptr        // bytes of return values from fn
+	fint *_type         // type of first argument of fn
+	ot   *ptrtype       // type of ptr to object
+}
+
+type finblock struct {
+	alllink *finblock
+	next    *finblock
+	cnt     int32
+	cap     int32
+	fin     [1]finalizer
+}
+
+// Information from the compiler about the layout of stack frames.
+type bitvector struct {
+	n        int32 // # of bits
+	bytedata *uint8
+}
+
+type stackmap struct {
+	n        int32   // number of bitmaps
+	nbit     int32   // number of bits in each bitmap
+	bytedata [0]byte // bitmaps, each starting on a 32-bit boundary
+}
+
+// Returns pointer map data for the given stackmap index
+// (the index is encoded in PCDATA_StackMapIndex).
+
+// defined in mgc0.go
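
malloc2.go replaces malloc.h's #ifdef ladder with constant arithmetic: _64bit is derived from the width of uintptr, and _MHeapMap_TotalBits picks 35, 37, or 32 bits by multiplying mutually exclusive 0/1 factors. A stand-alone sketch of the same trick; _Windows here is a hypothetical 0/1 flag standing in for the GOOS-derived constant the runtime actually uses:

	package main

	import "fmt"

	const (
		// 1 on 64-bit systems, 0 on 32-bit systems: the top bit of ^uintptr(0)
		// shifts down to 1 or 0, so this reduces to (1<<1)/2 or (1<<0)/2.
		_64bit = 1 << (^uintptr(0) >> 63) / 2

		_Windows = 0 // stand-in: 1 when GOOS=windows, 0 otherwise

		_PageShift = 13

		// Exactly one of the three products is nonzero, selecting
		// 35 (64-bit Windows), 37 (other 64-bit), or 32 (32-bit).
		_MHeapMap_TotalBits = (_64bit*_Windows)*35 + (_64bit*(1-_Windows))*37 + (1-_64bit)*32
		_MHeapMap_Bits      = _MHeapMap_TotalBits - _PageShift
	)

	func main() {
		fmt.Println("total bits:", _MHeapMap_TotalBits, "map bits:", _MHeapMap_Bits)
	}
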
diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c
deleted file mode 100644
index 5fdbe32..0000000
--- a/src/runtime/mcache.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Per-P malloc cache for small objects.
-//
-// See malloc.h for an overview.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-
-extern volatile intgo runtime·MemProfileRate;
-
-// dummy MSpan that contains no free objects.
-MSpan runtime·emptymspan;
-
-MCache*
-runtime·allocmcache(void)
-{
-	intgo rate;
-	MCache *c;
-	int32 i;
-
-	runtime·lock(&runtime·mheap.lock);
-	c = runtime·FixAlloc_Alloc(&runtime·mheap.cachealloc);
-	runtime·unlock(&runtime·mheap.lock);
-	runtime·memclr((byte*)c, sizeof(*c));
-	for(i = 0; i < NumSizeClasses; i++)
-		c->alloc[i] = &runtime·emptymspan;
-
-	// Set first allocation sample size.
-	rate = runtime·MemProfileRate;
-	if(rate > 0x3fffffff)	// make 2*rate not overflow
-		rate = 0x3fffffff;
-	if(rate != 0)
-		c->next_sample = runtime·fastrand1() % (2*rate);
-
-	return c;
-}
-
-static void
-freemcache(MCache *c)
-{
-	runtime·MCache_ReleaseAll(c);
-	runtime·stackcache_clear(c);
-	runtime·gcworkbuffree(c->gcworkbuf);
-	runtime·lock(&runtime·mheap.lock);
-	runtime·purgecachedstats(c);
-	runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c);
-	runtime·unlock(&runtime·mheap.lock);
-}
-
-static void
-freemcache_m(void)
-{
-	MCache *c;
-
-	c = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-	freemcache(c);
-}
-
-void
-runtime·freemcache(MCache *c)
-{
-	void (*fn)(void);
-
-	g->m->ptrarg[0] = c;
-	fn = freemcache_m;
-	runtime·onM(&fn);
-}
-
-// Gets a span that has a free object in it and assigns it
-// to be the cached span for the given sizeclass.  Returns this span.
-MSpan*
-runtime·MCache_Refill(MCache *c, int32 sizeclass)
-{
-	MSpan *s;
-
-	g->m->locks++;
-	// Return the current cached span to the central lists.
-	s = c->alloc[sizeclass];
-	if(s->freelist != nil)
-		runtime·throw("refill on a nonempty span");
-	if(s != &runtime·emptymspan)
-		s->incache = false;
-
-	// Get a new cached span from the central lists.
-	s = runtime·MCentral_CacheSpan(&runtime·mheap.central[sizeclass].mcentral);
-	if(s == nil)
-		runtime·throw("out of memory");
-	if(s->freelist == nil) {
-		runtime·printf("%d %d\n", s->ref, (int32)((s->npages << PageShift) / s->elemsize));
-		runtime·throw("empty span");
-	}
-	c->alloc[sizeclass] = s;
-	g->m->locks--;
-	return s;
-}
-
-void
-runtime·MCache_ReleaseAll(MCache *c)
-{
-	int32 i;
-	MSpan *s;
-
-	for(i=0; i<NumSizeClasses; i++) {
-		s = c->alloc[i];
-		if(s != &runtime·emptymspan) {
-			runtime·MCentral_UncacheSpan(&runtime·mheap.central[i].mcentral, s);
-			c->alloc[i] = &runtime·emptymspan;
-		}
-	}
-}
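
allocmcache (in both the C above and the Go below) seeds next_sample with a value uniform in [0, 2*rate), so heap-profile samples land on average every MemProfileRate bytes of allocation; the 0x3fffffff cap keeps 2*rate from overflowing. A stand-alone sketch of that seeding, with math/rand standing in for the runtime's fastrand1 (nextSample is illustrative):

	package main

	import (
		"fmt"
		"math/rand"
	)

	// nextSample mirrors the seeding in allocmcache: clamp the rate so 2*rate
	// cannot overflow, then pick a uniform value in [0, 2*rate).
	func nextSample(memProfileRate int) int32 {
		rate := memProfileRate
		if rate > 0x3fffffff { // make 2*rate not overflow
			rate = 0x3fffffff
		}
		if rate == 0 {
			return 0 // profiling disabled
		}
		return int32(rand.Intn(2 * rate))
	}

	func main() {
		// With the default MemProfileRate of 512*1024, samples land on average
		// every 512 KB of allocation.
		const rate = 512 * 1024
		sum := 0
		for i := 0; i < 1000; i++ {
			sum += int(nextSample(rate))
		}
		fmt.Println("mean sample distance ~", sum/1000, "bytes (expect ~", rate, ")")
	}
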
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
new file mode 100644
index 0000000..7482bc0
--- /dev/null
+++ b/src/runtime/mcache.go
@@ -0,0 +1,86 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Per-P malloc cache for small objects.
+//
+// See malloc.h for an overview.
+
+package runtime
+
+import "unsafe"
+
+// dummy MSpan that contains no free objects.
+var emptymspan mspan
+
+func allocmcache() *mcache {
+	lock(&mheap_.lock)
+	c := (*mcache)(fixAlloc_Alloc(&mheap_.cachealloc))
+	unlock(&mheap_.lock)
+	memclr(unsafe.Pointer(c), unsafe.Sizeof(*c))
+	for i := 0; i < _NumSizeClasses; i++ {
+		c.alloc[i] = &emptymspan
+	}
+
+	// Set first allocation sample size.
+	rate := MemProfileRate
+	if rate > 0x3fffffff { // make 2*rate not overflow
+		rate = 0x3fffffff
+	}
+	if rate != 0 {
+		c.next_sample = int32(int(fastrand1()) % (2 * rate))
+	}
+
+	return c
+}
+
+func freemcache(c *mcache) {
+	onM(func() {
+		mCache_ReleaseAll(c)
+		stackcache_clear(c)
+		gcworkbuffree(c.gcworkbuf)
+		lock(&mheap_.lock)
+		purgecachedstats(c)
+		fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c))
+		unlock(&mheap_.lock)
+	})
+}
+
+// Gets a span that has a free object in it and assigns it
+// to be the cached span for the given sizeclass.  Returns this span.
+func mCache_Refill(c *mcache, sizeclass int32) *mspan {
+	_g_ := getg()
+
+	_g_.m.locks++
+	// Return the current cached span to the central lists.
+	s := c.alloc[sizeclass]
+	if s.freelist != nil {
+		gothrow("refill on a nonempty span")
+	}
+	if s != &emptymspan {
+		s.incache = false
+	}
+
+	// Get a new cached span from the central lists.
+	s = mCentral_CacheSpan(&mheap_.central[sizeclass].mcentral)
+	if s == nil {
+		gothrow("out of memory")
+	}
+	if s.freelist == nil {
+		println(s.ref, (s.npages<<_PageShift)/s.elemsize)
+		gothrow("empty span")
+	}
+	c.alloc[sizeclass] = s
+	_g_.m.locks--
+	return s
+}
+
+func mCache_ReleaseAll(c *mcache) {
+	for i := 0; i < _NumSizeClasses; i++ {
+		s := c.alloc[i]
+		if s != &emptymspan {
+			mCentral_UncacheSpan(&mheap_.central[i].mcentral, s)
+			c.alloc[i] = &emptymspan
+		}
+	}
+}
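
mCache_Refill above rotates the per-P cached span for a size class: the exhausted span is released back toward the central list (incache = false) and a span that still has free objects is fetched in its place. A simplified stand-alone model of that rotation; the span/central types and their methods are stand-ins, not the runtime's:

	package main

	import "fmt"

	// Simplified stand-ins for mspan and mcentral.
	type span struct {
		free    int  // objects left on the freelist
		incache bool // currently owned by an mcache
	}

	type central struct {
		nonempty []*span // spans that still have free objects
	}

	// cacheSpan hands out a span with free objects, like mCentral_CacheSpan.
	func (c *central) cacheSpan() *span {
		if len(c.nonempty) == 0 {
			return nil // the real code would grow the central list from the heap
		}
		s := c.nonempty[0]
		c.nonempty = c.nonempty[1:]
		s.incache = true
		return s
	}

	// refill models mCache_Refill: the cached span must be exhausted before
	// it is swapped for a fresh one.
	func refill(cached *span, c *central) *span {
		if cached.free != 0 {
			panic("refill on a nonempty span")
		}
		cached.incache = false
		s := c.cacheSpan()
		if s == nil {
			panic("out of memory")
		}
		return s
	}

	func main() {
		c := &central{nonempty: []*span{{free: 64}}}
		cached := &span{free: 0, incache: true}
		cached = refill(cached, c)
		fmt.Println("new cached span with", cached.free, "free objects")
	}
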
diff --git a/src/runtime/mcentral.c b/src/runtime/mcentral.c
deleted file mode 100644
index fe6bcfe..0000000
--- a/src/runtime/mcentral.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Central free lists.
-//
-// See malloc.h for an overview.
-//
-// The MCentral doesn't actually contain the list of free objects; the MSpan does.
-// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
-// and those that are completely allocated (c->empty).
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-
-static MSpan* MCentral_Grow(MCentral *c);
-
-// Initialize a single central free list.
-void
-runtime·MCentral_Init(MCentral *c, int32 sizeclass)
-{
-	c->sizeclass = sizeclass;
-	runtime·MSpanList_Init(&c->nonempty);
-	runtime·MSpanList_Init(&c->empty);
-}
-
-// Allocate a span to use in an MCache.
-MSpan*
-runtime·MCentral_CacheSpan(MCentral *c)
-{
-	MSpan *s;
-	int32 cap, n;
-	uint32 sg;
-
-	runtime·lock(&c->lock);
-	sg = runtime·mheap.sweepgen;
-retry:
-	for(s = c->nonempty.next; s != &c->nonempty; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
-			runtime·MSpanList_Remove(s);
-			runtime·MSpanList_InsertBack(&c->empty, s);
-			runtime·unlock(&c->lock);
-			runtime·MSpan_Sweep(s, true);
-			goto havespan;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by background sweeper, skip
-			continue;
-		}
-		// we have a nonempty span that does not require sweeping, allocate from it
-		runtime·MSpanList_Remove(s);
-		runtime·MSpanList_InsertBack(&c->empty, s);
-		runtime·unlock(&c->lock);
-		goto havespan;
-	}
-
-	for(s = c->empty.next; s != &c->empty; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
-			// we have an empty span that requires sweeping,
-			// sweep it and see if we can free some space in it
-			runtime·MSpanList_Remove(s);
-			// swept spans are at the end of the list
-			runtime·MSpanList_InsertBack(&c->empty, s);
-			runtime·unlock(&c->lock);
-			runtime·MSpan_Sweep(s, true);
-			if(s->freelist != nil)
-				goto havespan;
-			runtime·lock(&c->lock);
-			// the span is still empty after sweep
-			// it is already in the empty list, so just retry
-			goto retry;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by background sweeper, skip
-			continue;
-		}
-		// already swept empty span,
-		// all subsequent ones must also be either swept or in process of sweeping
-		break;
-	}
-	runtime·unlock(&c->lock);
-
-	// Replenish central list if empty.
-	s = MCentral_Grow(c);
-	if(s == nil)
-		return nil;
-	runtime·lock(&c->lock);
-	runtime·MSpanList_InsertBack(&c->empty, s);
-	runtime·unlock(&c->lock);
-
-havespan:
-	// At this point s is a non-empty span, queued at the end of the empty list,
-	// c is unlocked.
-	cap = (s->npages << PageShift) / s->elemsize;
-	n = cap - s->ref;
-	if(n == 0)
-		runtime·throw("empty span");
-	if(s->freelist == nil)
-		runtime·throw("freelist empty");
-	s->incache = true;
-	return s;
-}
-
-// Return span from an MCache.
-void
-runtime·MCentral_UncacheSpan(MCentral *c, MSpan *s)
-{
-	int32 cap, n;
-
-	runtime·lock(&c->lock);
-
-	s->incache = false;
-
-	if(s->ref == 0)
-		runtime·throw("uncaching full span");
-
-	cap = (s->npages << PageShift) / s->elemsize;
-	n = cap - s->ref;
-	if(n > 0) {
-		runtime·MSpanList_Remove(s);
-		runtime·MSpanList_Insert(&c->nonempty, s);
-	}
-	runtime·unlock(&c->lock);
-}
-
-// Free n objects from a span s back into the central free list c.
-// Called during sweep.
-// Returns true if the span was returned to heap.  Sets sweepgen to
-// the latest generation.
-// If preserve=true, don't return the span to heap nor relink in MCentral lists;
-// caller takes care of it.
-bool
-runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end, bool preserve)
-{
-	bool wasempty;
-
-	if(s->incache)
-		runtime·throw("freespan into cached span");
-
-	// Add the objects back to s's free list.
-	wasempty = s->freelist == nil;
-	end->next = s->freelist;
-	s->freelist = start;
-	s->ref -= n;
-
-	if(preserve) {
-		// preserve is set only when called from MCentral_CacheSpan above,
-		// the span must be in the empty list.
-		if(s->next == nil)
-			runtime·throw("can't preserve unlinked span");
-		runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
-		return false;
-	}
-
-	runtime·lock(&c->lock);
-
-	// Move to nonempty if necessary.
-	if(wasempty) {
-		runtime·MSpanList_Remove(s);
-		runtime·MSpanList_Insert(&c->nonempty, s);
-	}
-
-	// delay updating sweepgen until here.  This is the signal that
-	// the span may be used in an MCache, so it must come after the
-	// linked list operations above (actually, just after the
-	// lock of c above.)
-	runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
-
-	if(s->ref != 0) {
-		runtime·unlock(&c->lock);
-		return false;
-	}
-
-	// s is completely freed, return it to the heap.
-	runtime·MSpanList_Remove(s);
-	s->needzero = 1;
-	s->freelist = nil;
-	runtime·unlock(&c->lock);
-	runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
-	runtime·MHeap_Free(&runtime·mheap, s, 0);
-	return true;
-}
-
-// Fetch a new span from the heap and carve into objects for the free list.
-static MSpan*
-MCentral_Grow(MCentral *c)
-{
-	uintptr size, npages, i, n;
-	MLink **tailp, *v;
-	byte *p;
-	MSpan *s;
-
-	npages = runtime·class_to_allocnpages[c->sizeclass];
-	size = runtime·class_to_size[c->sizeclass];
-	n = (npages << PageShift) / size;
-	s = runtime·MHeap_Alloc(&runtime·mheap, npages, c->sizeclass, 0, 1);
-	if(s == nil)
-		return nil;
-
-	// Carve span into sequence of blocks.
-	tailp = &s->freelist;
-	p = (byte*)(s->start << PageShift);
-	s->limit = p + size*n;
-	for(i=0; i<n; i++) {
-		v = (MLink*)p;
-		*tailp = v;
-		tailp = &v->next;
-		p += size;
-	}
-	*tailp = nil;
-	runtime·markspan((byte*)(s->start<<PageShift), size, n, size*n < (s->npages<<PageShift));
-	return s;
-}
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
new file mode 100644
index 0000000..0d172a0
--- /dev/null
+++ b/src/runtime/mcentral.go
@@ -0,0 +1,199 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Central free lists.
+//
+// See malloc.h for an overview.
+//
+// The MCentral doesn't actually contain the list of free objects; the MSpan does.
+// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
+// and those that are completely allocated (c->empty).
+
+package runtime
+
+import "unsafe"
+
+// Initialize a single central free list.
+func mCentral_Init(c *mcentral, sizeclass int32) {
+	c.sizeclass = sizeclass
+	mSpanList_Init(&c.nonempty)
+	mSpanList_Init(&c.empty)
+}
+
+// Allocate a span to use in an MCache.
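+//
+// The sweepgen tests below follow the span sweep convention used throughout
+// the heap: sweepgen == mheap_.sweepgen-2 means the span needs sweeping,
+// sweepgen == mheap_.sweepgen-1 means it is currently being swept, and
+// sweepgen == mheap_.sweepgen means it is swept and ready to use;
+// mheap_.sweepgen is incremented by 2 after every GC.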
+func mCentral_CacheSpan(c *mcentral) *mspan {
+	lock(&c.lock)
+	sg := mheap_.sweepgen
+retry:
+	var s *mspan
+	for s = c.nonempty.next; s != &c.nonempty; s = s.next {
+		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+			mSpanList_Remove(s)
+			mSpanList_InsertBack(&c.empty, s)
+			unlock(&c.lock)
+			mSpan_Sweep(s, true)
+			goto havespan
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// we have a nonempty span that does not require sweeping, allocate from it
+		mSpanList_Remove(s)
+		mSpanList_InsertBack(&c.empty, s)
+		unlock(&c.lock)
+		goto havespan
+	}
+
+	for s = c.empty.next; s != &c.empty; s = s.next {
+		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+			// we have an empty span that requires sweeping,
+			// sweep it and see if we can free some space in it
+			mSpanList_Remove(s)
+			// swept spans are at the end of the list
+			mSpanList_InsertBack(&c.empty, s)
+			unlock(&c.lock)
+			mSpan_Sweep(s, true)
+			if s.freelist != nil {
+				goto havespan
+			}
+			lock(&c.lock)
+			// the span is still empty after sweep
+			// it is already in the empty list, so just retry
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in process of sweeping
+		break
+	}
+	unlock(&c.lock)
+
+	// Replenish central list if empty.
+	s = mCentral_Grow(c)
+	if s == nil {
+		return nil
+	}
+	lock(&c.lock)
+	mSpanList_InsertBack(&c.empty, s)
+	unlock(&c.lock)
+
+	// At this point s is a non-empty span, queued at the end of the empty list,
+	// c is unlocked.
+havespan:
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.ref)
+	if n == 0 {
+		gothrow("empty span")
+	}
+	if s.freelist == nil {
+		gothrow("freelist empty")
+	}
+	s.incache = true
+	return s
+}
+
+// Return span from an MCache.
+func mCentral_UncacheSpan(c *mcentral, s *mspan) {
+	lock(&c.lock)
+
+	s.incache = false
+
+	if s.ref == 0 {
+		gothrow("uncaching full span")
+	}
+
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.ref)
+	if n > 0 {
+		mSpanList_Remove(s)
+		mSpanList_Insert(&c.nonempty, s)
+	}
+	unlock(&c.lock)
+}
+
+// Free n objects from a span s back into the central free list c.
+// Called during sweep.
+// Returns true if the span was returned to heap.  Sets sweepgen to
+// the latest generation.
+// If preserve=true, don't return the span to heap nor relink in MCentral lists;
+// caller takes care of it.
+func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start *mlink, end *mlink, preserve bool) bool {
+	if s.incache {
+		gothrow("freespan into cached span")
+	}
+
+	// Add the objects back to s's free list.
+	wasempty := s.freelist == nil
+	end.next = s.freelist
+	s.freelist = start
+	s.ref -= uint16(n)
+
+	if preserve {
+		// preserve is set only when called from MCentral_CacheSpan above,
+		// the span must be in the empty list.
+		if s.next == nil {
+			gothrow("can't preserve unlinked span")
+		}
+		atomicstore(&s.sweepgen, mheap_.sweepgen)
+		return false
+	}
+
+	lock(&c.lock)
+
+	// Move to nonempty if necessary.
+	if wasempty {
+		mSpanList_Remove(s)
+		mSpanList_Insert(&c.nonempty, s)
+	}
+
+	// delay updating sweepgen until here.  This is the signal that
+	// the span may be used in an MCache, so it must come after the
+	// linked list operations above (actually, just after the
+	// lock of c above.)
+	atomicstore(&s.sweepgen, mheap_.sweepgen)
+
+	if s.ref != 0 {
+		unlock(&c.lock)
+		return false
+	}
+
+	// s is completely freed, return it to the heap.
+	mSpanList_Remove(s)
+	s.needzero = 1
+	s.freelist = nil
+	unlock(&c.lock)
+	unmarkspan(uintptr(s.start)<<_PageShift, s.npages<<_PageShift)
+	mHeap_Free(&mheap_, s, 0)
+	return true
+}
+
+// Fetch a new span from the heap and carve into objects for the free list.
+func mCentral_Grow(c *mcentral) *mspan {
+	npages := uintptr(class_to_allocnpages[c.sizeclass])
+	size := uintptr(class_to_size[c.sizeclass])
+	n := (npages << _PageShift) / size
+
+	s := mHeap_Alloc(&mheap_, npages, c.sizeclass, false, true)
+	if s == nil {
+		return nil
+	}
+
+	// Carve span into sequence of blocks.
+	tailp := &s.freelist
+	p := uintptr(s.start << _PageShift)
+	s.limit = p + size*n
+	for i := uintptr(0); i < n; i++ {
+		v := (*mlink)(unsafe.Pointer(p))
+		*tailp = v
+		tailp = &v.next
+		p += size
+	}
+	*tailp = nil
+	markspan(unsafe.Pointer(uintptr(s.start)<<_PageShift), size, n, size*n < s.npages<<_PageShift)
+	return s
+}
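
As the header comment says, an MCentral is just two MSpan lists: nonempty (spans with free objects) and empty (fully allocated, or handed to a cache that will fill them). A rough sketch of only that bookkeeping, using a toy span type and container/list instead of the runtime's intrusive lists, purely for illustration:

	package main

	import (
		"container/list"
		"fmt"
	)

	// span stands in for mspan; it only records how many of its objects are free.
	type span struct{ free int }

	// central mirrors the two-list invariant: spans with free objects wait on
	// nonempty, spans handed out (or fully allocated) live on empty.
	type central struct{ nonempty, empty *list.List }

	// cacheSpan mimics mCentral_CacheSpan: take a span that still has free
	// objects and park it on the empty list, because the caller will drain it.
	func (c *central) cacheSpan() *span {
		e := c.nonempty.Front()
		if e == nil {
			return nil // the real code would grow a new span from the heap here
		}
		s := c.nonempty.Remove(e).(*span)
		c.empty.PushBack(s)
		return s
	}

	func main() {
		c := &central{nonempty: list.New(), empty: list.New()}
		c.nonempty.PushBack(&span{free: 3})
		s := c.cacheSpan()
		fmt.Println("free:", s.free, "nonempty:", c.nonempty.Len(), "empty:", c.empty.Len())
	}
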
diff --git a/src/runtime/mem.go b/src/runtime/mem.go
index e6f1eb0..6bd250d 100644
--- a/src/runtime/mem.go
+++ b/src/runtime/mem.go
@@ -59,7 +59,11 @@
 	}
 }
 
-var sizeof_C_MStats uintptr // filled in by malloc.goc
+// Size of the trailing by_size array differs between Go and C,
+// and all data after by_size is local to runtime, not exported.
+// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
+// sizeof_C_MStats is what C thinks the size of the Go struct is.
+var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0])
 
 func init() {
 	var memStats MemStats
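
The new sizeof_C_MStats expression is the usual unsafe.Offsetof + n*unsafe.Sizeof way of measuring a struct prefix that ends partway through a trailing array. A standalone illustration of the same arithmetic on a hypothetical struct (the field names and the 67/61 element counts are only for the example, not the real MemStats layout):

	package main

	import (
		"fmt"
		"unsafe"
	)

	// stats is a hypothetical layout: an exported prefix, a trailing
	// per-size-class array, then fields that are not part of the shared prefix.
	type stats struct {
		Alloc   uint64
		Sys     uint64
		BySize  [67]struct{ Size, Mallocs, Frees uint64 }
		private uint64
	}

	func main() {
		var s stats
		// Bytes up to and including the first 61 BySize entries: the same
		// Offsetof+n*Sizeof arithmetic used for sizeof_C_MStats above.
		prefix := unsafe.Offsetof(s.BySize) + 61*unsafe.Sizeof(s.BySize[0])
		fmt.Println("prefix bytes:", prefix, "whole struct:", unsafe.Sizeof(s))
	}
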
diff --git a/src/runtime/mem_darwin.c b/src/runtime/mem_darwin.c
deleted file mode 100644
index bf3ede5..0000000
--- a/src/runtime/mem_darwin.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "defs_GOOS_GOARCH.h"
-#include "os_GOOS.h"
-#include "malloc.h"
-#include "textflag.h"
-
-#pragma textflag NOSPLIT
-void*
-runtime·sysAlloc(uintptr n, uint64 *stat)
-{
-	void *v;
-
-	v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
-	if(v < (void*)4096)
-		return nil;
-	runtime·xadd64(stat, n);
-	return v;
-}
-
-void
-runtime·SysUnused(void *v, uintptr n)
-{
-	// Linux's MADV_DONTNEED is like BSD's MADV_FREE.
-	runtime·madvise(v, n, MADV_FREE);
-}
-
-void
-runtime·SysUsed(void *v, uintptr n)
-{
-	USED(v);
-	USED(n);
-}
-
-void
-runtime·SysFree(void *v, uintptr n, uint64 *stat)
-{
-	runtime·xadd64(stat, -(uint64)n);
-	runtime·munmap(v, n);
-}
-
-void
-runtime·SysFault(void *v, uintptr n)
-{
-	runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0);
-}
-
-void*
-runtime·SysReserve(void *v, uintptr n, bool *reserved)
-{
-	void *p;
-
-	*reserved = true;
-	p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0);
-	if(p < (void*)4096)
-		return nil;
-	return p;
-}
-
-enum
-{
-	ENOMEM = 12,
-};
-
-void
-runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat)
-{
-	void *p;
-	
-	USED(reserved);
-
-	runtime·xadd64(stat, n);
-	p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0);
-	if(p == (void*)ENOMEM)
-		runtime·throw("runtime: out of memory");
-	if(p != v)
-		runtime·throw("runtime: cannot map pages in arena address space");
-}
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
new file mode 100644
index 0000000..1bee933
--- /dev/null
+++ b/src/runtime/mem_darwin.go
@@ -0,0 +1,58 @@
+// Copyright 2010 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+//go:nosplit
+func sysAlloc(n uintptr, stat *uint64) unsafe.Pointer {
+	v := (unsafe.Pointer)(mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
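+	// On failure the runtime's mmap wrapper returns the errno as a small
+	// positive value (and page zero is never a valid mapping), so any
+	// result below 4096 is treated as an error.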
+	if uintptr(v) < 4096 {
+		return nil
+	}
+	xadd64(stat, int64(n))
+	return v
+}
+
+func sysUnused(v unsafe.Pointer, n uintptr) {
+	// Linux's MADV_DONTNEED is like BSD's MADV_FREE.
+	madvise(v, n, _MADV_FREE)
+}
+
+func sysUsed(v unsafe.Pointer, n uintptr) {
+}
+
+func sysFree(v unsafe.Pointer, n uintptr, stat *uint64) {
+	xadd64(stat, -int64(n))
+	munmap(v, n)
+}
+
+func sysFault(v unsafe.Pointer, n uintptr) {
+	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0)
+}
+
+func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
+	*reserved = true
+	p := (unsafe.Pointer)(mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
+	if uintptr(p) < 4096 {
+		return nil
+	}
+	return p
+}
+
+const (
+	_ENOMEM = 12
+)
+
+func sysMap(v unsafe.Pointer, n uintptr, reserved bool, stat *uint64) {
+	xadd64(stat, int64(n))
+	p := (unsafe.Pointer)(mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0))
+	if uintptr(p) == _ENOMEM {
+		gothrow("runtime: out of memory")
+	}
+	if p != v {
+		gothrow("runtime: cannot map pages in arena address space")
+	}
+}
diff --git a/src/runtime/mem_linux.c b/src/runtime/mem_linux.c
deleted file mode 100644
index bfb4056..0000000
--- a/src/runtime/mem_linux.c
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "defs_GOOS_GOARCH.h"
-#include "os_GOOS.h"
-#include "malloc.h"
-#include "textflag.h"
-
-enum
-{
-	_PAGE_SIZE = 4096,
-	EACCES = 13,
-};
-
-static int32
-addrspace_free(void *v, uintptr n)
-{
-	int32 errval;
-	uintptr chunk;
-	uintptr off;
-	
-	// NOTE: vec must be just 1 byte long here.
-	// Mincore returns ENOMEM if any of the pages are unmapped,
-	// but we want to know that all of the pages are unmapped.
-	// To make these the same, we can only ask about one page
-	// at a time. See golang.org/issue/7476.
-	static byte vec[1];
-
-	for(off = 0; off < n; off += chunk) {
-		chunk = _PAGE_SIZE * sizeof vec;
-		if(chunk > (n - off))
-			chunk = n - off;
-		errval = runtime·mincore((int8*)v + off, chunk, vec);
-		// ENOMEM means unmapped, which is what we want.
-		// Anything else we assume means the pages are mapped.
-		if (errval != -ENOMEM)
-			return 0;
-	}
-	return 1;
-}
-
-static void *
-mmap_fixed(byte *v, uintptr n, int32 prot, int32 flags, int32 fd, uint32 offset)
-{
-	void *p;
-
-	p = runtime·mmap(v, n, prot, flags, fd, offset);
-	if(p != v && addrspace_free(v, n)) {
-		// On some systems, mmap ignores v without
-		// MAP_FIXED, so retry if the address space is free.
-		if(p > (void*)4096)
-			runtime·munmap(p, n);
-		p = runtime·mmap(v, n, prot, flags|MAP_FIXED, fd, offset);
-	}
-	return p;
-}
-
-#pragma textflag NOSPLIT
-void*
-runtime·sysAlloc(uintptr n, uint64 *stat)
-{
-	void *p;
-
-	p = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
-	if(p < (void*)4096) {
-		if(p == (void*)EACCES) {
-			runtime·printf("runtime: mmap: access denied\n");
-			runtime·printf("if you're running SELinux, enable execmem for this process.\n");
-			runtime·exit(2);
-		}
-		if(p == (void*)EAGAIN) {
-			runtime·printf("runtime: mmap: too much locked memory (check 'ulimit -l').\n");
-			runtime·exit(2);
-		}
-		return nil;
-	}
-	runtime·xadd64(stat, n);
-	return p;
-}
-
-void
-runtime·SysUnused(void *v, uintptr n)
-{
-	runtime·madvise(v, n, MADV_DONTNEED);
-}
-
-void
-runtime·SysUsed(void *v, uintptr n)
-{
-	USED(v);
-	USED(n);
-}
-
-void
-runtime·SysFree(void *v, uintptr n, uint64 *stat)
-{
-	runtime·xadd64(stat, -(uint64)n);
-	runtime·munmap(v, n);
-}
-
-void
-runtime·SysFault(void *v, uintptr n)
-{
-	runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0);
-}
-
-void*
-runtime·SysReserve(void *v, uintptr n, bool *reserved)
-{
-	void *p;
-
-	// On 64-bit, people with ulimit -v set complain if we reserve too
-	// much address space.  Instead, assume that the reservation is okay
-	// if we can reserve at least 64K and check the assumption in SysMap.
-	// Only user-mode Linux (UML) rejects these requests.
-	if(sizeof(void*) == 8 && n > 1LL<<32) {
-		p = mmap_fixed(v, 64<<10, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0);
-		if (p != v) {
-			if(p >= (void*)4096)
-				runtime·munmap(p, 64<<10);
-			return nil;
-		}
-		runtime·munmap(p, 64<<10);
-		*reserved = false;
-		return v;
-	}
-
-	p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0);
-	if((uintptr)p < 4096)
-		return nil;
-	*reserved = true;
-	return p;
-}
-
-void
-runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat)
-{
-	void *p;
-	
-	runtime·xadd64(stat, n);
-
-	// On 64-bit, we don't actually have v reserved, so tread carefully.
-	if(!reserved) {
-		p = mmap_fixed(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
-		if(p == (void*)ENOMEM)
-			runtime·throw("runtime: out of memory");
-		if(p != v) {
-			runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p);
-			runtime·throw("runtime: address space conflict");
-		}
-		return;
-	}
-
-	p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0);
-	if(p == (void*)ENOMEM)
-		runtime·throw("runtime: out of memory");
-	if(p != v)
-		runtime·throw("runtime: cannot map pages in arena address space");
-}
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
new file mode 100644
index 0000000..0ef6eea
--- /dev/null
+++ b/src/runtime/mem_linux.go
@@ -0,0 +1,135 @@
+// Copyright 2010 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+const (
+	_PAGE_SIZE = 4096
+	_EACCES    = 13
+)
+
+// NOTE: vec must be just 1 byte long here.
+// Mincore returns ENOMEM if any of the pages are unmapped,
+// but we want to know that all of the pages are unmapped.
+// To make these the same, we can only ask about one page
+// at a time. See golang.org/issue/7476.
+var addrspace_vec [1]byte
+
+func addrspace_free(v unsafe.Pointer, n uintptr) bool {
+	var chunk uintptr
+	for off := uintptr(0); off < n; off += chunk {
+		chunk = _PAGE_SIZE * uintptr(len(addrspace_vec))
+		if chunk > (n - off) {
+			chunk = n - off
+		}
+		errval := mincore(unsafe.Pointer(uintptr(v)+off), chunk, &addrspace_vec[0])
+		// ENOMEM means unmapped, which is what we want.
+		// Anything else we assume means the pages are mapped.
+		if errval != -_ENOMEM {
+			return false
+		}
+	}
+	return true
+}
+
+func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uint32) unsafe.Pointer {
+	p := mmap(v, n, prot, flags, fd, offset)
+	if p != v && addrspace_free(v, n) {
+		// On some systems, mmap ignores v without
+		// MAP_FIXED, so retry if the address space is free.
+		if uintptr(p) > 4096 {
+			munmap(p, n)
+		}
+		p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset)
+	}
+	return p
+}
+
+//go:nosplit
+func sysAlloc(n uintptr, stat *uint64) unsafe.Pointer {
+	p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
+	if uintptr(p) < 4096 {
+		if uintptr(p) == _EACCES {
+			print("runtime: mmap: access denied\n")
+			print("if you're running SELinux, enable execmem for this process.\n")
+			exit(2)
+		}
+		if uintptr(p) == _EAGAIN {
+			print("runtime: mmap: too much locked memory (check 'ulimit -l').\n")
+			exit(2)
+		}
+		return nil
+	}
+	xadd64(stat, int64(n))
+	return p
+}
+
+func sysUnused(v unsafe.Pointer, n uintptr) {
+	madvise(v, n, _MADV_DONTNEED)
+}
+
+func sysUsed(v unsafe.Pointer, n uintptr) {
+}
+
+func sysFree(v unsafe.Pointer, n uintptr, stat *uint64) {
+	xadd64(stat, -int64(n))
+	munmap(v, n)
+}
+
+func sysFault(v unsafe.Pointer, n uintptr) {
+	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0)
+}
+
+func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
+	// On 64-bit, people with ulimit -v set complain if we reserve too
+	// much address space.  Instead, assume that the reservation is okay
+	// if we can reserve at least 64K and check the assumption in SysMap.
+	// Only user-mode Linux (UML) rejects these requests.
+	if ptrSize == 8 && uint64(n) > 1<<32 {
+		p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
+		if p != v {
+			if uintptr(p) >= 4096 {
+				munmap(p, 64<<10)
+			}
+			return nil
+		}
+		munmap(p, 64<<10)
+		*reserved = false
+		return v
+	}
+
+	p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
+	if uintptr(p) < 4096 {
+		return nil
+	}
+	*reserved = true
+	return p
+}
+
+func sysMap(v unsafe.Pointer, n uintptr, reserved bool, stat *uint64) {
+	xadd64(stat, int64(n))
+
+	// On 64-bit, we don't actually have v reserved, so tread carefully.
+	if !reserved {
+		p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
+		if uintptr(p) == _ENOMEM {
+			gothrow("runtime: out of memory")
+		}
+		if p != v {
+			print("runtime: address space conflict: map(", v, ") = ", p, "\n")
+			gothrow("runtime: address space conflict")
+		}
+		return
+	}
+
+	p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
+	if uintptr(p) == _ENOMEM {
+		gothrow("runtime: out of memory")
+	}
+	if p != v {
+		gothrow("runtime: cannot map pages in arena address space")
+	}
+}
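
As the NOTE above explains, addrspace_free has to probe one page per mincore call, because a multi-page probe cannot distinguish "all pages unmapped" from "some pages unmapped" (golang.org/issue/7476). A stripped-down sketch of that per-page walk, with a hypothetical mapped() stub standing in for the mincore syscall:

	package main

	import "fmt"

	const pageSize = 4096

	// mapped is a stand-in for the mincore probe: it reports whether the single
	// page starting at addr is already mapped. Here only one page is "mapped".
	func mapped(addr uintptr) bool {
		return addr == 0x300000
	}

	// addrspaceFree mirrors the shape of addrspace_free: the range is free only
	// if every page in it is unmapped, so each page is checked individually.
	func addrspaceFree(v, n uintptr) bool {
		for off := uintptr(0); off < n; off += pageSize {
			if mapped(v + off) {
				return false
			}
		}
		return true
	}

	func main() {
		fmt.Println(addrspaceFree(0x100000, 4*pageSize)) // true: no page there is mapped
		fmt.Println(addrspaceFree(0x2ff000, 4*pageSize)) // false: the range covers 0x300000
	}
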
diff --git a/src/runtime/mfixalloc.c b/src/runtime/mfixalloc.c
deleted file mode 100644
index d670629..0000000
--- a/src/runtime/mfixalloc.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Fixed-size object allocator.  Returned memory is not zeroed.
-//
-// See malloc.h for overview.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-
-// Initialize f to allocate objects of the given size,
-// using the allocator to obtain chunks of memory.
-void
-runtime·FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat)
-{
-	f->size = size;
-	f->first = first;
-	f->arg = arg;
-	f->list = nil;
-	f->chunk = nil;
-	f->nchunk = 0;
-	f->inuse = 0;
-	f->stat = stat;
-}
-
-void*
-runtime·FixAlloc_Alloc(FixAlloc *f)
-{
-	void *v;
-	
-	if(f->size == 0) {
-		runtime·printf("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n");
-		runtime·throw("runtime: internal error");
-	}
-
-	if(f->list) {
-		v = f->list;
-		f->list = *(void**)f->list;
-		f->inuse += f->size;
-		return v;
-	}
-	if(f->nchunk < f->size) {
-		f->chunk = runtime·persistentalloc(FixAllocChunk, 0, f->stat);
-		f->nchunk = FixAllocChunk;
-	}
-	v = f->chunk;
-	if(f->first)
-		f->first(f->arg, v);
-	f->chunk += f->size;
-	f->nchunk -= f->size;
-	f->inuse += f->size;
-	return v;
-}
-
-void
-runtime·FixAlloc_Free(FixAlloc *f, void *p)
-{
-	f->inuse -= f->size;
-	*(void**)p = f->list;
-	f->list = p;
-}
-
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
new file mode 100644
index 0000000..b66a17e
--- /dev/null
+++ b/src/runtime/mfixalloc.go
@@ -0,0 +1,59 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fixed-size object allocator.  Returned memory is not zeroed.
+//
+// See malloc.h for overview.
+
+package runtime
+
+import "unsafe"
+
+// Initialize f to allocate objects of the given size,
+// using the allocator to obtain chunks of memory.
+func fixAlloc_Init(f *fixalloc, size uintptr, first func(unsafe.Pointer, unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+	f.size = size
+	f.first = *(*unsafe.Pointer)(unsafe.Pointer(&first))
+	f.arg = arg
+	f.list = nil
+	f.chunk = nil
+	f.nchunk = 0
+	f.inuse = 0
+	f.stat = stat
+}
+
+func fixAlloc_Alloc(f *fixalloc) unsafe.Pointer {
+	if f.size == 0 {
+		print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n")
+		gothrow("runtime: internal error")
+	}
+
+	if f.list != nil {
+		v := unsafe.Pointer(f.list)
+		f.list = f.list.next
+		f.inuse += f.size
+		return v
+	}
+	if uintptr(f.nchunk) < f.size {
+		f.chunk = (*uint8)(persistentalloc(_FixAllocChunk, 0, f.stat))
+		f.nchunk = _FixAllocChunk
+	}
+
+	v := (unsafe.Pointer)(f.chunk)
+	if f.first != nil {
+		fn := *(*func(unsafe.Pointer, unsafe.Pointer))(unsafe.Pointer(&f.first))
+		fn(f.arg, v)
+	}
+	f.chunk = (*byte)(add(unsafe.Pointer(f.chunk), f.size))
+	f.nchunk -= uint32(f.size)
+	f.inuse += f.size
+	return v
+}
+
+func fixAlloc_Free(f *fixalloc, p unsafe.Pointer) {
+	f.inuse -= f.size
+	v := (*mlink)(p)
+	v.next = f.list
+	f.list = v
+}
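
fixalloc is the classic fixed-size allocator: carve a big chunk into equal objects and recycle freed objects through an intrusive free list (the mlink trick above, where the first word of a free object doubles as the list pointer). A garbage-collected toy with the same shape, purely for illustration; the real allocator works on raw, non-GC-managed memory:

	package main

	import "fmt"

	// node is the fixed-size object; next doubles as the free-list link while
	// the object sits on the free list, like mlink in the runtime.
	type node struct {
		next  *node
		value int
	}

	// fixAlloc prefers recycled objects from its free list and otherwise carves
	// new ones out of a preallocated chunk, refilling the chunk when it runs out.
	type fixAlloc struct {
		list  *node  // free list of returned objects
		chunk []node // backing chunk, consumed front to back
		inuse int
	}

	func (f *fixAlloc) alloc() *node {
		if f.list != nil {
			v := f.list
			f.list = v.next
			f.inuse++
			return v
		}
		if len(f.chunk) == 0 {
			f.chunk = make([]node, 16) // grab a new chunk, like FixAllocChunk
		}
		v := &f.chunk[0]
		f.chunk = f.chunk[1:]
		f.inuse++
		return v
	}

	func (f *fixAlloc) free(v *node) {
		v.next = f.list
		f.list = v
		f.inuse--
	}

	func main() {
		var f fixAlloc
		a := f.alloc()
		f.free(a)
		b := f.alloc() // reuses a's memory via the free list
		fmt.Println(a == b, f.inuse)
	}
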
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
new file mode 100644
index 0000000..569bf5d
--- /dev/null
+++ b/src/runtime/mgc.go
@@ -0,0 +1,1827 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(rsc): The code having to do with the heap bitmap needs very serious cleanup.
+// It has gotten completely out of control.
+
+// Garbage collector (GC).
+//
+// GC is:
+// - mark&sweep
+// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
+// - parallel (up to MaxGcproc threads)
+// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
+// - non-moving/non-compacting
+// - full (non-partial)
+//
+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (and also the amount of extra memory used).
+//
+// Concurrent sweep.
+// The sweep phase proceeds concurrently with normal program execution.
+// The heap is swept span-by-span both lazily (when a goroutine needs another span)
+// and concurrently in a background goroutine (this helps programs that are not CPU bound).
+// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
+// and so next_gc calculation is tricky and happens as follows.
+// At the end of the stop-the-world phase next_gc is conservatively set based on total
+// heap size; all spans are marked as "needs sweeping".
+// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
+// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
+// closer to the target value. However, this is not enough to avoid over-allocating memory.
+// Consider that a goroutine wants to allocate a new span for a large object and
+// there are no free swept spans, but there are small-object unswept spans.
+// If the goroutine naively allocates a new span, it can surpass the yet-unknown
+// target next_gc value. In order to prevent such cases (1) when a goroutine needs
+// to allocate a new small-object span, it sweeps small-object spans for the same
+// object size until it frees at least one object; (2) when a goroutine needs to
+// allocate large-object span from heap, it sweeps spans until it frees at least
+// that many pages into heap. Together these two measures ensure that we don't surpass
+// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
+// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
+// but there can still be other one-page unswept spans which could be combined into a two-page span.
+// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
+// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
+// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
+// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
+// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
+// The finalizer goroutine is kicked off only when all spans are swept.
+// When the next GC starts, it sweeps all not-yet-swept spans (if any).
+
+package runtime
+
+import "unsafe"
+
+const (
+	_DebugGC         = 0
+	_DebugGCPtrs     = false // if true, print trace of every pointer load during GC
+	_ConcurrentSweep = true
+
+	_WorkbufSize     = 4 * 1024
+	_FinBlockSize    = 4 * 1024
+	_RootData        = 0
+	_RootBss         = 1
+	_RootFinalizers  = 2
+	_RootSpans       = 3
+	_RootFlushCaches = 4
+	_RootCount       = 5
+)
+
+// ptrmask for an allocation containing a single pointer.
+var oneptr = [...]uint8{bitsPointer}
+
+// Initialized from $GOGC.  GOGC=off means no gc.
+var gcpercent int32
+
+// Holding worldsema grants an M the right to try to stop the world.
+// The procedure is:
+//
+//	semacquire(&worldsema);
+//	m.gcing = 1;
+//	stoptheworld();
+//
+//	... do stuff ...
+//
+//	m.gcing = 0;
+//	semrelease(&worldsema);
+//	starttheworld();
+//
+var worldsema uint32 = 1
+
+type workbuf struct {
+	node lfnode // must be first
+	nobj uintptr
+	obj  [(_WorkbufSize - unsafe.Sizeof(lfnode{}) - ptrSize) / ptrSize]uintptr
+}
+
+var data, edata, bss, ebss, gcdata, gcbss struct{}
+
+var finlock mutex  // protects the following variables
+var fing *g        // goroutine that runs finalizers
+var finq *finblock // list of finalizers that are to be executed
+var finc *finblock // cache of free blocks
+var finptrmask [_FinBlockSize / ptrSize / pointersPerByte]byte
+var fingwait bool
+var fingwake bool
+var allfin *finblock // list of all blocks
+
+var gcdatamask bitvector
+var gcbssmask bitvector
+
+var gclock mutex
+
+var badblock [1024]uintptr
+var nbadblock int32
+
+type workdata struct {
+	full    uint64                // lock-free list of full blocks
+	empty   uint64                // lock-free list of empty blocks
+	pad0    [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+	nproc   uint32
+	tstart  int64
+	nwait   uint32
+	ndone   uint32
+	alldone note
+	markfor *parfor
+
+	// Copy of mheap.allspans for marker or sweeper.
+	spans []*mspan
+}
+
+var work workdata
+
+//go:linkname weak_cgo_allocate go.weak.runtime._cgo_allocate_internal
+var weak_cgo_allocate byte
+
+// Is _cgo_allocate linked into the binary?
+func have_cgo_allocate() bool {
+	return &weak_cgo_allocate != nil
+}
+
+// scanblock scans a block of n bytes starting at pointer b for references
+// to other objects, scanning any it finds recursively until there are no
+// unscanned objects left.  Instead of using an explicit recursion, it keeps
+// a work list in the Workbuf* structures and loops in the main function
+// body.  Keeping an explicit work list is easier on the stack allocator and
+// more efficient.
+func scanblock(b, n uintptr, ptrmask *uint8) {
+	// Cache memory arena parameters in local vars.
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	wbuf := getempty(nil)
+	nobj := wbuf.nobj
+	wp := &wbuf.obj[nobj]
+	keepworking := b == 0
+
+	var ptrbitp unsafe.Pointer
+
+	// ptrmask can have 2 possible values:
+	// 1. nil - obtain pointer mask from GC bitmap.
+	// 2. pointer to a compact mask (for stacks and data).
+	goto_scanobj := b != 0
+
+	for {
+		if goto_scanobj {
+			goto_scanobj = false
+		} else {
+			if nobj == 0 {
+				// Out of work in workbuf.
+				if !keepworking {
+					putempty(wbuf)
+					return
+				}
+
+				// Refill workbuf from global queue.
+				wbuf = getfull(wbuf)
+				if wbuf == nil {
+					return
+				}
+				nobj = wbuf.nobj
+				if nobj < uintptr(len(wbuf.obj)) {
+					wp = &wbuf.obj[nobj]
+				} else {
+					wp = nil
+				}
+			}
+
+			// If another proc wants a pointer, give it some.
+			if work.nwait > 0 && nobj > 4 && work.full == 0 {
+				wbuf.nobj = nobj
+				wbuf = handoff(wbuf)
+				nobj = wbuf.nobj
+				if nobj < uintptr(len(wbuf.obj)) {
+					wp = &wbuf.obj[nobj]
+				} else {
+					wp = nil
+				}
+			}
+
+			nobj--
+			wp = &wbuf.obj[nobj]
+			b = *wp
+			n = arena_used - uintptr(b)
+			ptrmask = nil // use GC bitmap for pointer info
+		}
+
+		if _DebugGCPtrs {
+			print("scanblock ", b, " +", hex(n), " ", ptrmask, "\n")
+		}
+
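+		// In the heap bitmap each word of memory is described by gcBits (4)
+		// bits: the low bit marks an object boundary, the next bit is the
+		// mark bit, and the top two bits encode the pointer kind, with
+		// bitsDead meaning the rest of the object contains no pointers.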
+		// Find bits of the beginning of the object.
+		if ptrmask == nil {
+			off := (uintptr(b) - arena_start) / ptrSize
+			ptrbitp = unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)
+		}
+
+		var i uintptr
+		for i = 0; i < n; i += ptrSize {
+			// Find bits for this word.
+			var bits uintptr
+			if ptrmask == nil {
+				// Check if we have reached end of span.
+				if (uintptr(b)+i)%_PageSize == 0 &&
+					h_spans[(uintptr(b)-arena_start)>>_PageShift] != h_spans[(uintptr(b)+i-arena_start)>>_PageShift] {
+					break
+				}
+
+				// Consult GC bitmap.
+				bits = uintptr(*(*byte)(ptrbitp))
+
+				if wordsPerBitmapByte != 2 {
+					gothrow("alg doesn't work for wordsPerBitmapByte != 2")
+				}
+				j := (uintptr(b) + i) / ptrSize & 1
+				ptrbitp = add(ptrbitp, -j)
+				bits >>= gcBits * j
+
+				if bits&bitBoundary != 0 && i != 0 {
+					break // reached beginning of the next object
+				}
+				bits = (bits >> 2) & bitsMask
+				if bits == bitsDead {
+					break // reached no-scan part of the object
+				}
+			} else {
+				// dense mask (stack or data)
+				bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask
+			}
+
+			if bits <= _BitsScalar { // BitsScalar || BitsDead
+				continue
+			}
+
+			if bits != _BitsPointer {
+				gothrow("unexpected garbage collection bits")
+			}
+
+			obj := *(*uintptr)(unsafe.Pointer(b + i))
+			obj0 := obj
+
+		markobj:
+			var s *mspan
+			var off, bitp, shift, xbits uintptr
+
+			// At this point we have extracted the next potential pointer.
+			// Check if it points into heap.
+			if obj == 0 {
+				continue
+			}
+			if obj < arena_start || arena_used <= obj {
+				if uintptr(obj) < _PhysPageSize && invalidptr != 0 {
+					s = nil
+					goto badobj
+				}
+				continue
+			}
+
+			// Mark the object.
+			obj &^= ptrSize - 1
+			off = (obj - arena_start) / ptrSize
+			bitp = arena_start - off/wordsPerBitmapByte - 1
+			shift = (off % wordsPerBitmapByte) * gcBits
+			xbits = uintptr(*(*byte)(unsafe.Pointer(bitp)))
+			bits = (xbits >> shift) & bitMask
+			if (bits & bitBoundary) == 0 {
+				// Not a beginning of a block, consult span table to find the block beginning.
+				k := pageID(obj >> _PageShift)
+				x := k
+				x -= pageID(arena_start >> _PageShift)
+				s = h_spans[x]
+				if s == nil || k < s.start || s.limit <= obj || s.state != mSpanInUse {
+					// Stack pointers lie within the arena bounds but are not part of the GC heap.
+					// Ignore them.
+					if s != nil && s.state == _MSpanStack {
+						continue
+					}
+					goto badobj
+				}
+				p := uintptr(s.start) << _PageShift
+				if s.sizeclass != 0 {
+					size := s.elemsize
+					idx := (obj - p) / size
+					p = p + idx*size
+				}
+				if p == obj {
+					print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
+					gothrow("failed to find block beginning")
+				}
+				obj = p
+				goto markobj
+			}
+
+			if _DebugGCPtrs {
+				print("scan *", hex(b+i), " = ", hex(obj0), " => base ", hex(obj), "\n")
+			}
+
+			if nbadblock > 0 && obj == badblock[nbadblock-1] {
+				// Running garbage collection again because
+				// we want to find the path from a root to a bad pointer.
+				// Found possible next step; extend or finish path.
+				for j := int32(0); j < nbadblock; j++ {
+					if badblock[j] == b {
+						goto AlreadyBad
+					}
+				}
+				print("runtime: found *(", hex(b), "+", hex(i), ") = ", hex(obj0), "+", hex(obj-obj0), "\n")
+				if ptrmask != nil {
+					gothrow("bad pointer")
+				}
+				if nbadblock >= int32(len(badblock)) {
+					gothrow("badblock trace too long")
+				}
+				badblock[nbadblock] = uintptr(b)
+				nbadblock++
+			AlreadyBad:
+			}
+
+			// Now we have bits, bitp, and shift correct for
+			// obj pointing at the base of the object.
+			// Only care about not marked objects.
+			if bits&bitMarked != 0 {
+				continue
+			}
+
+			// If obj size is greater than 8, then each byte of GC bitmap
+			// contains info for at most one object. In such case we use
+			// non-atomic byte store to mark the object. This can lead
+			// to double enqueue of the object for scanning, but scanning
+			// is an idempotent operation, so it is OK. This cannot lead
+			// to bitmap corruption because the single marked bit is the
+			// only thing that can change in the byte.
+			// For 8-byte objects we use non-atomic store, if the other
+			// quadruple is already marked. Otherwise we resort to CAS
+			// loop for marking.
+			if xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
+				*(*byte)(unsafe.Pointer(bitp)) = uint8(xbits | bitMarked<<shift)
+			} else {
+				atomicor8((*byte)(unsafe.Pointer(bitp)), bitMarked<<shift)
+			}
+
+			if (xbits>>(shift+2))&bitsMask == bitsDead {
+				continue // noscan object
+			}
+
+			// Queue the obj for scanning.
+			// TODO: PREFETCH here.
+
+			// If workbuf is full, obtain an empty one.
+			if nobj >= uintptr(len(wbuf.obj)) {
+				wbuf.nobj = nobj
+				wbuf = getempty(wbuf)
+				nobj = wbuf.nobj
+				wp = &wbuf.obj[nobj]
+			}
+			*wp = obj
+			nobj++
+			if nobj < uintptr(len(wbuf.obj)) {
+				wp = &wbuf.obj[nobj]
+			} else {
+				wp = nil
+			}
+			continue
+
+		badobj:
+			// If cgo_allocate is linked into the binary, it can allocate
+			// memory as []unsafe.Pointer that may not contain actual
+			// pointers and must be scanned conservatively.
+			// In this case alone, allow the bad pointer.
+			if have_cgo_allocate() && ptrmask == nil {
+				continue
+			}
+
+			// Anything else indicates a bug somewhere.
+			// If we're in the middle of chasing down a different bad pointer,
+			// don't confuse the trace by printing about this one.
+			if nbadblock > 0 {
+				continue
+			}
+
+			print("runtime: garbage collector found invalid heap pointer *(", hex(b), "+", hex(i), ")=", hex(obj))
+			if s == nil {
+				print(" s=nil\n")
+			} else {
+				print(" span=", uintptr(s.start)<<_PageShift, "-", s.limit, "-", (uintptr(s.start)+s.npages)<<_PageShift, " state=", s.state, "\n")
+			}
+			if ptrmask != nil {
+				gothrow("invalid heap pointer")
+			}
+			// Add to badblock list, which will cause the garbage collection
+			// to keep repeating until it has traced the chain of pointers
+			// leading to obj all the way back to a root.
+			if nbadblock == 0 {
+				badblock[nbadblock] = uintptr(b)
+				nbadblock++
+			}
+		}
+		if _DebugGCPtrs {
+			print("end scanblock ", hex(b), " +", hex(n), " ", ptrmask, "\n")
+		}
+		if _DebugGC > 0 && ptrmask == nil {
+			// For heap objects ensure that we did not overscan.
+			var p, n uintptr
+			if mlookup(b, &p, &n, nil) == 0 || b != p || i > n {
+				print("runtime: scanned (", hex(b), "+", hex(i), "), heap object (", hex(p), "+", hex(n), ")\n")
+				gothrow("scanblock: scanned invalid object")
+			}
+		}
+	}
+}
+
+func markroot(desc *parfor, i uint32) {
+	// Note: if you add a case here, please also update heapdump.go:dumproots.
+	switch i {
+	case _RootData:
+		scanblock(uintptr(unsafe.Pointer(&data)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)), gcdatamask.bytedata)
+
+	case _RootBss:
+		scanblock(uintptr(unsafe.Pointer(&bss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)), gcbssmask.bytedata)
+
+	case _RootFinalizers:
+		for fb := allfin; fb != nil; fb = fb.alllink {
+			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0])
+		}
+
+	case _RootSpans:
+		// mark MSpan.specials
+		sg := mheap_.sweepgen
+		for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ {
+			s := work.spans[spanidx]
+			if s.state != mSpanInUse {
+				continue
+			}
+			if s.sweepgen != sg {
+				print("sweep ", s.sweepgen, " ", sg, "\n")
+				gothrow("gc: unswept span")
+			}
+			for sp := s.specials; sp != nil; sp = sp.next {
+				if sp.kind != _KindSpecialFinalizer {
+					continue
+				}
+				// don't mark finalized object, but scan it so we
+				// retain everything it points to.
+				spf := (*specialfinalizer)(unsafe.Pointer(sp))
+				// A finalizer can be set for an inner byte of an object, find object beginning.
+				p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
+				scanblock(p, s.elemsize, nil)
+				scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0])
+			}
+		}
+
+	case _RootFlushCaches:
+		flushallmcaches()
+
+	default:
+		// the rest is scanning goroutine stacks
+		if uintptr(i-_RootCount) >= allglen {
+			gothrow("markroot: bad index")
+		}
+		gp := allgs[i-_RootCount]
+		// remember when we've first observed the G blocked
+		// needed only to output in traceback
+		status := readgstatus(gp)
+		if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
+			gp.waitsince = work.tstart
+		}
+		// Shrink a stack if not much of it is being used.
+		shrinkstack(gp)
+		if readgstatus(gp) == _Gdead {
+			gp.gcworkdone = true
+		} else {
+			gp.gcworkdone = false
+		}
+		restart := stopg(gp)
+		scanstack(gp)
+		if restart {
+			restartg(gp)
+		}
+	}
+}
+
+// Get an empty work buffer off the work.empty list,
+// allocating new buffers as needed.
+func getempty(b *workbuf) *workbuf {
+	_g_ := getg()
+	if b != nil {
+		lfstackpush(&work.full, &b.node)
+	}
+	b = nil
+	c := _g_.m.mcache
+	if c.gcworkbuf != nil {
+		b = (*workbuf)(c.gcworkbuf)
+		c.gcworkbuf = nil
+	}
+	if b == nil {
+		b = (*workbuf)(lfstackpop(&work.empty))
+	}
+	if b == nil {
+		b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys))
+	}
+	b.nobj = 0
+	return b
+}
+
+func putempty(b *workbuf) {
+	_g_ := getg()
+	c := _g_.m.mcache
+	if c.gcworkbuf == nil {
+		c.gcworkbuf = (unsafe.Pointer)(b)
+		return
+	}
+	lfstackpush(&work.empty, &b.node)
+}
+
+func gcworkbuffree(b unsafe.Pointer) {
+	if b != nil {
+		putempty((*workbuf)(b))
+	}
+}
+
+// Get a full work buffer off the work.full list, or return nil.
+func getfull(b *workbuf) *workbuf {
+	if b != nil {
+		lfstackpush(&work.empty, &b.node)
+	}
+	b = (*workbuf)(lfstackpop(&work.full))
+	if b != nil || work.nproc == 1 {
+		return b
+	}
+
+	xadd(&work.nwait, +1)
+	for i := 0; ; i++ {
+		if work.full != 0 {
+			xadd(&work.nwait, -1)
+			b = (*workbuf)(lfstackpop(&work.full))
+			if b != nil {
+				return b
+			}
+			xadd(&work.nwait, +1)
+		}
+		if work.nwait == work.nproc {
+			return nil
+		}
+		_g_ := getg()
+		if i < 10 {
+			_g_.m.gcstats.nprocyield++
+			procyield(20)
+		} else if i < 20 {
+			_g_.m.gcstats.nosyield++
+			osyield()
+		} else {
+			_g_.m.gcstats.nsleep++
+			usleep(100)
+		}
+	}
+}
+
+func handoff(b *workbuf) *workbuf {
+	// Make new buffer with half of b's pointers.
+	b1 := getempty(nil)
+	n := b.nobj / 2
+	b.nobj -= n
+	b1.nobj = n
+	memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), n*unsafe.Sizeof(b1.obj[0]))
+	_g_ := getg()
+	_g_.m.gcstats.nhandoff++
+	_g_.m.gcstats.nhandoffcnt += uint64(n)
+
+	// Put b on full list - let first half of b get stolen.
+	lfstackpush(&work.full, &b.node)
+	return b1
+}
+
+func stackmapdata(stkmap *stackmap, n int32) bitvector {
+	if n < 0 || n >= stkmap.n {
+		gothrow("stackmapdata: index out of range")
+	}
+	return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+31)/32*4))))}
+}
+
+// Scan a stack frame: local variables and function arguments/results.
+func scanframe(frame *stkframe, unused unsafe.Pointer) bool {
+
+	f := frame.fn
+	targetpc := frame.continpc
+	if targetpc == 0 {
+		// Frame is dead.
+		return true
+	}
+	if _DebugGC > 1 {
+		print("scanframe ", gofuncname(f), "\n")
+	}
+	if targetpc != f.entry {
+		targetpc--
+	}
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	if pcdata == -1 {
+		// We do not have a valid pcdata value but there might be a
+		// stackmap for this function.  It is likely that we are looking
+		// at the function prologue, assume so and hope for the best.
+		pcdata = 0
+	}
+
+	// Scan local variables if stack frame has been allocated.
+	size := frame.varp - frame.sp
+	var minsize uintptr
+	if thechar != '6' && thechar != '8' {
+		minsize = ptrSize
+	} else {
+		minsize = 0
+	}
+	if size > minsize {
+		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+		if stkmap == nil || stkmap.n <= 0 {
+			print("runtime: frame ", gofuncname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
+			gothrow("missing stackmap")
+		}
+
+		// Locals bitmap information, scan just the pointers in locals.
+		if pcdata < 0 || pcdata >= stkmap.n {
+			// don't know where we are
+			print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n")
+			gothrow("scanframe: bad symbol table")
+		}
+		bv := stackmapdata(stkmap, pcdata)
+		size = (uintptr(bv.n) * ptrSize) / bitsPerPointer
+		scanblock(frame.varp-size, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata)
+	}
+
+	// Scan arguments.
+	if frame.arglen > 0 {
+		var bv bitvector
+		if frame.argmap != nil {
+			bv = *frame.argmap
+		} else {
+			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
+			if stkmap == nil || stkmap.n <= 0 {
+				print("runtime: frame ", gofuncname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
+				gothrow("missing stackmap")
+			}
+			if pcdata < 0 || pcdata >= stkmap.n {
+				// don't know where we are
+				print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n")
+				gothrow("scanframe: bad symbol table")
+			}
+			bv = stackmapdata(stkmap, pcdata)
+		}
+		scanblock(frame.argp, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata)
+	}
+	return true
+}
+
+func scanstack(gp *g) {
+	// TODO(rsc): Due to a precedence error, this was never checked in the original C version.
+	// If you enable the check, the gothrow happens.
+	/*
+		if readgstatus(gp)&_Gscan == 0 {
+			print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+			gothrow("mark - bad status")
+		}
+	*/
+
+	switch readgstatus(gp) &^ _Gscan {
+	default:
+		print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+		gothrow("mark - bad status")
+	case _Gdead:
+		return
+	case _Grunning:
+		print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+		gothrow("mark - world not stopped")
+	case _Grunnable, _Gsyscall, _Gwaiting:
+		// ok
+	}
+
+	if gp == getg() {
+		gothrow("can't scan our own stack")
+	}
+	mp := gp.m
+	if mp != nil && mp.helpgc != 0 {
+		gothrow("can't scan gchelper stack")
+	}
+
+	gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
+	tracebackdefers(gp, scanframe, nil)
+}
+
+// The gp has been moved to a gc safepoint. If there is gcphase specific
+// work it is done here.
+func gcphasework(gp *g) {
+	switch gcphase {
+	default:
+		gothrow("gcphasework in bad gcphase")
+	case _GCoff, _GCquiesce, _GCstw, _GCsweep:
+		// No work for now.
+	case _GCmark:
+		// Disabled until concurrent GC is implemented
+		// but indicate the scan has been done.
+		// scanstack(gp);
+	}
+	gp.gcworkdone = true
+}
+
+var finalizer1 = [...]byte{
+	// Each Finalizer is 5 words, ptr ptr uintptr ptr ptr.
+	// Each byte describes 4 words.
+	// Need 4 Finalizers described by 5 bytes before pattern repeats:
+	//	ptr ptr uintptr ptr ptr
+	//	ptr ptr uintptr ptr ptr
+	//	ptr ptr uintptr ptr ptr
+	//	ptr ptr uintptr ptr ptr
+	// aka
+	//	ptr ptr uintptr ptr
+	//	ptr ptr ptr uintptr
+	//	ptr ptr ptr ptr
+	//	uintptr ptr ptr ptr
+	//	ptr uintptr ptr ptr
+	// Assumptions about Finalizer layout checked below.
+	bitsPointer | bitsPointer<<2 | bitsScalar<<4 | bitsPointer<<6,
+	bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsScalar<<6,
+	bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6,
+	bitsScalar | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6,
+	bitsPointer | bitsScalar<<2 | bitsPointer<<4 | bitsPointer<<6,
+}
+
+func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) {
+	lock(&finlock)
+	if finq == nil || finq.cnt == finq.cap {
+		if finc == nil {
+			finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+			finc.cap = int32((_FinBlockSize-unsafe.Sizeof(finblock{}))/unsafe.Sizeof(finalizer{}) + 1)
+			finc.alllink = allfin
+			allfin = finc
+			if finptrmask[0] == 0 {
+				// Build pointer mask for Finalizer array in block.
+				// Check assumptions made in finalizer1 array above.
+				if (unsafe.Sizeof(finalizer{}) != 5*ptrSize ||
+					unsafe.Offsetof(finalizer{}.fn) != 0 ||
+					unsafe.Offsetof(finalizer{}.arg) != ptrSize ||
+					unsafe.Offsetof(finalizer{}.nret) != 2*ptrSize ||
+					unsafe.Offsetof(finalizer{}.fint) != 3*ptrSize ||
+					unsafe.Offsetof(finalizer{}.ot) != 4*ptrSize ||
+					bitsPerPointer != 2) {
+					gothrow("finalizer out of sync")
+				}
+				for i := range finptrmask {
+					finptrmask[i] = finalizer1[i%len(finalizer1)]
+				}
+			}
+		}
+		block := finc
+		finc = block.next
+		block.next = finq
+		finq = block
+	}
+	f := (*finalizer)(add(unsafe.Pointer(&finq.fin[0]), uintptr(finq.cnt)*unsafe.Sizeof(finq.fin[0])))
+	finq.cnt++
+	f.fn = fn
+	f.nret = nret
+	f.fint = fint
+	f.ot = ot
+	f.arg = p
+	fingwake = true
+	unlock(&finlock)
+}
+
+func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrtype)) {
+	for fb := allfin; fb != nil; fb = fb.alllink {
+		for i := int32(0); i < fb.cnt; i++ {
+			f := &fb.fin[i]
+			callback(f.fn, f.arg, f.nret, f.fint, f.ot)
+		}
+	}
+}
+
+func mSpan_EnsureSwept(s *mspan) {
+	// Caller must disable preemption.
+	// Otherwise when this function returns the span can become unswept again
+	// (if GC is triggered on another goroutine).
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		gothrow("MSpan_EnsureSwept: m is not locked")
+	}
+
+	sg := mheap_.sweepgen
+	if atomicload(&s.sweepgen) == sg {
+		return
+	}
+	if cas(&s.sweepgen, sg-2, sg-1) {
+		mSpan_Sweep(s, false)
+		return
+	}
+	// unfortunate condition, and we don't have efficient means to wait
+	for atomicload(&s.sweepgen) != sg {
+		osyield()
+	}
+}
+
+// Sweep frees or collects finalizers for blocks not marked in the mark phase.
+// It clears the mark bits in preparation for the next GC round.
+// Returns true if the span was returned to heap.
+// If preserve=true, don't return it to heap nor relink in MCentral lists;
+// caller takes care of it.
+func mSpan_Sweep(s *mspan, preserve bool) bool {
+	// It's critical that we enter this function with preemption disabled,
+	// GC must not start while we are in the middle of this function.
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		gothrow("MSpan_Sweep: m is not locked")
+	}
+	sweepgen := mheap_.sweepgen
+	if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		gothrow("MSpan_Sweep: bad span state")
+	}
+	arena_start := mheap_.arena_start
+	cl := s.sizeclass
+	size := s.elemsize
+	var n int32
+	var npages int32
+	if cl == 0 {
+		n = 1
+	} else {
+		// Chunk full of small blocks.
+		npages = class_to_allocnpages[cl]
+		n = (npages << _PageShift) / int32(size)
+	}
+	res := false
+	nfree := 0
+	var head mlink
+	end := &head
+	c := _g_.m.mcache
+	sweepgenset := false
+
+	// Mark any free objects in this span so we don't collect them.
+	for link := s.freelist; link != nil; link = link.next {
+		off := (uintptr(unsafe.Pointer(link)) - arena_start) / ptrSize
+		bitp := arena_start - off/wordsPerBitmapByte - 1
+		shift := (off % wordsPerBitmapByte) * gcBits
+		*(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift
+	}
+
+	// Unlink & free special records for any objects we're about to free.
+	specialp := &s.specials
+	special := *specialp
+	for special != nil {
+		// A finalizer can be set for an inner byte of an object, find object beginning.
+		p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
+		off := (p - arena_start) / ptrSize
+		bitp := arena_start - off/wordsPerBitmapByte - 1
+		shift := (off % wordsPerBitmapByte) * gcBits
+		bits := (*(*byte)(unsafe.Pointer(bitp)) >> shift) & bitMask
+		if bits&bitMarked == 0 {
+			// Find the exact byte for which the special was setup
+			// (as opposed to object beginning).
+			p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
+			// about to free object: splice out special record
+			y := special
+			special = special.next
+			*specialp = special
+			if !freespecial(y, unsafe.Pointer(p), size, false) {
+				// stop freeing of object if it has a finalizer
+				*(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift
+			}
+		} else {
+			// object is still live: keep special record
+			specialp = &special.next
+			special = *specialp
+		}
+	}
+
+	// Sweep through n objects of given size starting at p.
+	// This thread owns the span now, so it can manipulate
+	// the block bitmap without atomic operations.
+	p := uintptr(s.start << _PageShift)
+	off := (p - arena_start) / ptrSize
+	bitp := arena_start - off/wordsPerBitmapByte - 1
+	shift := uint(0)
+	step := size / (ptrSize * wordsPerBitmapByte)
+	// Rewind to the previous quadruple, since we step to the next
+	// one at the beginning of each loop iteration.
+	bitp += step
+	if step == 0 {
+		// 8-byte objects.
+		bitp++
+		shift = gcBits
+	}
+	for ; n > 0; n, p = n-1, p+size {
+		bitp -= step
+		if step == 0 {
+			if shift != 0 {
+				bitp--
+			}
+			shift = gcBits - shift
+		}
+
+		xbits := *(*byte)(unsafe.Pointer(bitp))
+		bits := (xbits >> shift) & bitMask
+
+		// Allocated and marked object, reset bits to allocated.
+		if bits&bitMarked != 0 {
+			*(*byte)(unsafe.Pointer(bitp)) &^= bitMarked << shift
+			continue
+		}
+
+		// At this point we know that we are looking at garbage object
+		// that needs to be collected.
+		if debug.allocfreetrace != 0 {
+			tracefree(unsafe.Pointer(p), size)
+		}
+
+		// Reset to allocated+noscan.
+		*(*byte)(unsafe.Pointer(bitp)) = uint8(uintptr(xbits&^((bitMarked|bitsMask<<2)<<shift)) | uintptr(bitsDead)<<(shift+2))
+		if cl == 0 {
+			// Free large span.
+			if preserve {
+				gothrow("can't preserve large span")
+			}
+			unmarkspan(p, s.npages<<_PageShift)
+			s.needzero = 1
+
+			// important to set sweepgen before returning it to heap
+			atomicstore(&s.sweepgen, sweepgen)
+			sweepgenset = true
+
+			// NOTE(rsc,dvyukov): The original implementation of efence
+			// in CL 22060046 used SysFree instead of SysFault, so that
+			// the operating system would eventually give the memory
+			// back to us again, so that an efence program could run
+			// longer without running out of memory. Unfortunately,
+			// calling SysFree here without any kind of adjustment of the
+			// heap data structures means that when the memory does
+			// come back to us, we have the wrong metadata for it, either in
+			// the MSpan structures or in the garbage collection bitmap.
+			// Using SysFault here means that the program will run out of
+			// memory fairly quickly in efence mode, but at least it won't
+			// have mysterious crashes due to confused memory reuse.
+			// It should be possible to switch back to SysFree if we also
+			// implement and then call some kind of MHeap_DeleteSpan.
+			if debug.efence > 0 {
+				s.limit = 0 // prevent mlookup from finding this span
+				sysFault(unsafe.Pointer(p), size)
+			} else {
+				mHeap_Free(&mheap_, s, 1)
+			}
+			c.local_nlargefree++
+			c.local_largefree += size
+			xadd64(&memstats.next_gc, -int64(size)*int64(gcpercent+100)/100)
+			res = true
+		} else {
+			// Free small object.
+			if size > 2*ptrSize {
+				*(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
+			} else if size > ptrSize {
+				*(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
+			}
+			end.next = (*mlink)(unsafe.Pointer(p))
+			end = end.next
+			nfree++
+		}
+	}
+
+	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
+	// because of the potential for a concurrent free/SetFinalizer.
+	// But we need to set it before we make the span available for allocation
+	// (return it to heap or mcentral), because allocation code assumes that a
+	// span is already swept if available for allocation.
+	if !sweepgenset && nfree == 0 {
+		// The span must be in our exclusive ownership until we update sweepgen,
+		// check for potential races.
+		if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+			print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+			gothrow("MSpan_Sweep: bad span state after sweep")
+		}
+		atomicstore(&s.sweepgen, sweepgen)
+	}
+	if nfree > 0 {
+		c.local_nsmallfree[cl] += uintptr(nfree)
+		c.local_cachealloc -= intptr(uintptr(nfree) * size)
+		xadd64(&memstats.next_gc, -int64(nfree)*int64(size)*int64(gcpercent+100)/100)
+		res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head.next, end, preserve)
+		// MCentral_FreeSpan updates sweepgen
+	}
+	return res
+}
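
A note on the bitmap arithmetic that mSpan_Sweep (and much of this file) repeats: the heap bitmap sits just below arena_start and grows downward, with one byte describing two heap words at four bits each. Below is a minimal sketch of that addressing, assuming the 64-bit values this CL implies (ptrSize = 8, wordsPerBitmapByte = 2, gcBits = 4); the names are invented for illustration and are not part of the CL.

	const (
		sketchPtrSize            = 8 // assumed 64-bit
		sketchWordsPerBitmapByte = 2 // one bitmap byte covers two heap words
		sketchGCBits             = 4 // bitmap bits per heap word
	)

	// sketchHeapBitsFor returns the address of the bitmap byte and the bit
	// shift for the heap word at p, mirroring the off/bitp/shift pattern above.
	func sketchHeapBitsFor(arenaStart, p uintptr) (bitp, shift uintptr) {
		off := (p - arenaStart) / sketchPtrSize
		bitp = arenaStart - off/sketchWordsPerBitmapByte - 1
		shift = (off % sketchWordsPerBitmapByte) * sketchGCBits
		return bitp, shift
	}
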
+
+// State of background sweep.
+// Protected by gclock.
+type sweepdata struct {
+	g       *g
+	parked  bool
+	started bool
+
+	spanidx uint32 // background sweeper position
+
+	nbgsweep    uint32
+	npausesweep uint32
+}
+
+var sweep sweepdata
+
+// sweepone sweeps one span.
+// It returns the number of pages returned to the heap, or ^uintptr(0) if there is nothing to sweep.
+func sweepone() uintptr {
+	_g_ := getg()
+
+	// Increment locks to ensure that the goroutine is not preempted
+	// in the middle of the sweep, which would leave the span in an inconsistent state for the next GC.
+	_g_.m.locks++
+	sg := mheap_.sweepgen
+	for {
+		idx := xadd(&sweep.spanidx, 1) - 1
+		if idx >= uint32(len(work.spans)) {
+			mheap_.sweepdone = 1
+			_g_.m.locks--
+			return ^uintptr(0)
+		}
+		s := work.spans[idx]
+		if s.state != mSpanInUse {
+			s.sweepgen = sg
+			continue
+		}
+		if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
+			continue
+		}
+		npages := s.npages
+		if !mSpan_Sweep(s, false) {
+			npages = 0
+		}
+		_g_.m.locks--
+		return npages
+	}
+}
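
For readers new to the sweep machinery, the sweepgen values that sweepone checks and CASes encode a small state machine; the sketch below restates it (needsSweep is a hypothetical helper, not part of this CL).

	// Relative to the heap's sweep generation h.sweepgen:
	//	s.sweepgen == h.sweepgen-2: the span needs sweeping
	//	s.sweepgen == h.sweepgen-1: the span is being swept
	//	s.sweepgen == h.sweepgen:   the span is swept and ready for use
	// A sweeper claims a span by CASing its sweepgen from h.sweepgen-2 to h.sweepgen-1.
	func needsSweep(spanSweepgen, heapSweepgen uint32) bool {
		return spanSweepgen == heapSweepgen-2
	}
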
+
+func gosweepone() uintptr {
+	var ret uintptr
+	onM(func() {
+		ret = sweepone()
+	})
+	return ret
+}
+
+func gosweepdone() bool {
+	return mheap_.sweepdone != 0
+}
+
+func gchelper() {
+	_g_ := getg()
+	_g_.m.traceback = 2
+	gchelperstart()
+
+	// run the parallel mark (work.markfor) over the GC roots
+	parfordo(work.markfor)
+
+	// help other threads scan secondary blocks
+	scanblock(0, 0, nil)
+
+	nproc := work.nproc // work.nproc can change right after we increment work.ndone
+	if xadd(&work.ndone, +1) == nproc-1 {
+		notewakeup(&work.alldone)
+	}
+	_g_.m.traceback = 0
+}
+
+func cachestats() {
+	for i := 0; ; i++ {
+		p := allp[i]
+		if p == nil {
+			break
+		}
+		c := p.mcache
+		if c == nil {
+			continue
+		}
+		purgecachedstats(c)
+	}
+}
+
+func flushallmcaches() {
+	for i := 0; ; i++ {
+		p := allp[i]
+		if p == nil {
+			break
+		}
+		c := p.mcache
+		if c == nil {
+			continue
+		}
+		mCache_ReleaseAll(c)
+		stackcache_clear(c)
+	}
+}
+
+func updatememstats(stats *gcstats) {
+	if stats != nil {
+		*stats = gcstats{}
+	}
+	for mp := allm; mp != nil; mp = mp.alllink {
+		if stats != nil {
+			src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
+			dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
+			for i, v := range src {
+				dst[i] += v
+			}
+			mp.gcstats = gcstats{}
+		}
+	}
+
+	memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
+	memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
+	memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
+		memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
+
+	// Calculate memory allocator stats.
+	// During program execution we only count the number of frees and the amount of freed memory.
+	// The current number of live objects in the heap and the amount of live heap memory
+	// are calculated by scanning all spans.
+	// The total number of mallocs is calculated as the number of frees plus the number of live objects.
+	// Similarly, the total amount of allocated memory is calculated as the amount of freed memory
+	// plus the amount of live heap memory.
+	memstats.alloc = 0
+	memstats.total_alloc = 0
+	memstats.nmalloc = 0
+	memstats.nfree = 0
+	for i := 0; i < len(memstats.by_size); i++ {
+		memstats.by_size[i].nmalloc = 0
+		memstats.by_size[i].nfree = 0
+	}
+
+	// Flush MCaches to MCentral.
+	onM(flushallmcaches)
+
+	// Aggregate local stats.
+	cachestats()
+
+	// Scan all spans and count the number of live objects.
+	lock(&mheap_.lock)
+	for i := uint32(0); i < mheap_.nspan; i++ {
+		s := h_allspans[i]
+		if s.state != mSpanInUse {
+			continue
+		}
+		if s.sizeclass == 0 {
+			memstats.nmalloc++
+			memstats.alloc += uint64(s.elemsize)
+		} else {
+			memstats.nmalloc += uint64(s.ref)
+			memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
+			memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
+		}
+	}
+	unlock(&mheap_.lock)
+
+	// Aggregate by size class.
+	smallfree := uint64(0)
+	memstats.nfree = mheap_.nlargefree
+	for i := 0; i < len(memstats.by_size); i++ {
+		memstats.nfree += mheap_.nsmallfree[i]
+		memstats.by_size[i].nfree = mheap_.nsmallfree[i]
+		memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
+		smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i])
+	}
+	memstats.nfree += memstats.tinyallocs
+	memstats.nmalloc += memstats.nfree
+
+	// Calculate derived stats.
+	memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree
+	memstats.heap_alloc = memstats.alloc
+	memstats.heap_objects = memstats.nmalloc - memstats.nfree
+}
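
The accounting that updatememstats performs reduces to two identities, restated here as a sketch with hypothetical names (illustrative only, not runtime API).

	// nmalloc     = nfree + number of live objects
	// total_alloc = freed bytes + live heap bytes
	func sketchDerivedStats(liveObjects, liveBytes, nfree, freedBytes uint64) (nmalloc, totalAlloc uint64) {
		return nfree + liveObjects, freedBytes + liveBytes
	}
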
+
+// Structure of arguments passed to function gc().
+// This allows the arguments to be passed via mcall.
+type gc_args struct {
+	start_time int64 // start time of GC in ns (just before stoptheworld)
+	eagersweep bool
+}
+
+func gcinit() {
+	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
+		gothrow("runtime: size of Workbuf is suboptimal")
+	}
+
+	work.markfor = parforalloc(_MaxGcproc)
+	gcpercent = readgogc()
+	gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcdata)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)))
+	gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
+}
+
+func gc_m() {
+	_g_ := getg()
+	gp := _g_.m.curg
+	casgstatus(gp, _Grunning, _Gwaiting)
+	gp.waitreason = "garbage collection"
+
+	var a gc_args
+	a.start_time = int64(_g_.m.scalararg[0]) | int64(uintptr(_g_.m.scalararg[1]))<<32
+	a.eagersweep = _g_.m.scalararg[2] != 0
+	gc(&a)
+
+	if nbadblock > 0 {
+		// Work out path from root to bad block.
+		for {
+			gc(&a)
+			if nbadblock >= int32(len(badblock)) {
+				gothrow("cannot find path to bad pointer")
+			}
+		}
+	}
+
+	casgstatus(gp, _Gwaiting, _Grunning)
+}
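
gc_m reassembles its 64-bit start time from two scalararg slots: the low 32 bits travel in scalararg[0] and the high 32 bits in scalararg[1]. A sketch of a matching encode/decode pair, with invented helper names:

	func packStartTime(t int64) (lo, hi uintptr) {
		return uintptr(uint32(t)), uintptr(uint32(t >> 32))
	}

	func unpackStartTime(lo, hi uintptr) int64 {
		return int64(uint32(lo)) | int64(uint32(hi))<<32
	}
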
+
+func gc(args *gc_args) {
+	if _DebugGCPtrs {
+		print("GC start\n")
+	}
+
+	if debug.allocfreetrace > 0 {
+		tracegc()
+	}
+
+	_g_ := getg()
+	_g_.m.traceback = 2
+	t0 := args.start_time
+	work.tstart = args.start_time
+
+	var t1 int64
+	if debug.gctrace > 0 {
+		t1 = nanotime()
+	}
+
+	// Sweep whatever is not swept by bgsweep.
+	for sweepone() != ^uintptr(0) {
+		sweep.npausesweep++
+	}
+
+	// Cache runtime.mheap_.allspans in work.spans to avoid conflicts with
+	// resizing/freeing allspans.
+	// New spans can be created while GC progresses, but they are not garbage for
+	// this round:
+	//  - new stack spans can be created even while the world is stopped.
+	//  - new malloc spans can be created during the concurrent sweep.
+
+	// Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from the heap.
+	lock(&mheap_.lock)
+	// Free the old cached sweep array if necessary.
+	if work.spans != nil && &work.spans[0] != &h_allspans[0] {
+		sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys)
+	}
+	// Cache the current array for marking.
+	mheap_.gcspans = mheap_.allspans
+	work.spans = h_allspans
+	unlock(&mheap_.lock)
+
+	work.nwait = 0
+	work.ndone = 0
+	work.nproc = uint32(gcprocs())
+	parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), nil, false, markroot)
+	if work.nproc > 1 {
+		noteclear(&work.alldone)
+		helpgc(int32(work.nproc))
+	}
+
+	var t2 int64
+	if debug.gctrace > 0 {
+		t2 = nanotime()
+	}
+
+	gchelperstart()
+	parfordo(work.markfor)
+	scanblock(0, 0, nil)
+
+	var t3 int64
+	if debug.gctrace > 0 {
+		t3 = nanotime()
+	}
+
+	if work.nproc > 1 {
+		notesleep(&work.alldone)
+	}
+
+	shrinkfinish()
+
+	cachestats()
+	// The next_gc calculation is tricky with concurrent sweep since we don't know the size of the live heap.
+	// Estimate what the live heap size was after the previous GC (for printing only).
+	heap0 := memstats.next_gc * 100 / (uint64(gcpercent) + 100)
+	// Conservatively set next_gc to a high value assuming that everything is live;
+	// the concurrent/lazy sweep will reduce this number while discovering new garbage.
+	memstats.next_gc = memstats.heap_alloc + memstats.heap_alloc*uint64(gcpercent)/100
+
+	t4 := nanotime()
+	atomicstore64(&memstats.last_gc, uint64(unixnanotime())) // must be Unix time to make sense to user
+	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(t4 - t0)
+	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(t4)
+	memstats.pause_total_ns += uint64(t4 - t0)
+	memstats.numgc++
+	if memstats.debuggc {
+		print("pause ", t4-t0, "\n")
+	}
+
+	if debug.gctrace > 0 {
+		heap1 := memstats.heap_alloc
+		var stats gcstats
+		updatememstats(&stats)
+		if heap1 != memstats.heap_alloc {
+			print("runtime: mstats skew: heap=", heap1, "/", memstats.heap_alloc, "\n")
+			gothrow("mstats skew")
+		}
+		obj := memstats.nmalloc - memstats.nfree
+
+		stats.nprocyield += work.markfor.nprocyield
+		stats.nosyield += work.markfor.nosyield
+		stats.nsleep += work.markfor.nsleep
+
+		print("gc", memstats.numgc, "(", work.nproc, "): ",
+			(t1-t0)/1000, "+", (t2-t1)/1000, "+", (t3-t2)/1000, "+", (t4-t3)/1000, " us, ",
+			heap0>>20, " -> ", heap1>>20, " MB, ",
+			obj, " (", memstats.nmalloc, "-", memstats.nfree, ") objects, ",
+			gcount(), " goroutines, ",
+			len(work.spans), "/", sweep.nbgsweep, "/", sweep.npausesweep, " sweeps, ",
+			stats.nhandoff, "(", stats.nhandoffcnt, ") handoff, ",
+			work.markfor.nsteal, "(", work.markfor.nstealcnt, ") steal, ",
+			stats.nprocyield, "/", stats.nosyield, "/", stats.nsleep, " yields\n")
+		sweep.nbgsweep = 0
+		sweep.npausesweep = 0
+	}
+
+	// See the comment at the beginning of this function as to why we need the following.
+	// Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from the heap.
+	lock(&mheap_.lock)
+	// Free the old cached mark array if necessary.
+	if work.spans != nil && &work.spans[0] != &h_allspans[0] {
+		sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys)
+	}
+
+	// Cache the current array for sweeping.
+	mheap_.gcspans = mheap_.allspans
+	mheap_.sweepgen += 2
+	mheap_.sweepdone = 0
+	work.spans = h_allspans
+	sweep.spanidx = 0
+	unlock(&mheap_.lock)
+
+	if _ConcurrentSweep && !args.eagersweep {
+		lock(&gclock)
+		if !sweep.started {
+			go bgsweep()
+			sweep.started = true
+		} else if sweep.parked {
+			sweep.parked = false
+			ready(sweep.g)
+		}
+		unlock(&gclock)
+	} else {
+		// Sweep all spans eagerly.
+		for sweepone() != ^uintptr(0) {
+			sweep.npausesweep++
+		}
+		// Do an additional mProf_GC, because all 'free' events are now real as well.
+		mProf_GC()
+	}
+
+	mProf_GC()
+	_g_.m.traceback = 0
+
+	if _DebugGCPtrs {
+		print("GC end\n")
+	}
+}
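
The pacing computed inside gc() follows GOGC: with gcpercent = 100 and, say, 4 MB of live heap, the next collection is triggered around 8 MB. The two formulas used above, restated as standalone sketches (names invented for illustration):

	// next_gc after a collection: allow the heap to grow by gcpercent%.
	func sketchNextGC(heapAlloc uint64, gcpercent int32) uint64 {
		return heapAlloc + heapAlloc*uint64(gcpercent)/100
	}

	// Estimated live heap after the previous collection (used for the gctrace line).
	func sketchLiveHeapEstimate(nextGC uint64, gcpercent int32) uint64 {
		return nextGC * 100 / (uint64(gcpercent) + 100)
	}
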
+
+func readmemstats_m() {
+	_g_ := getg()
+	stats := (*mstats)(_g_.m.ptrarg[0])
+	_g_.m.ptrarg[0] = nil
+
+	updatememstats(nil)
+
+	// The size of the trailing by_size array differs between Go and C
+	// (NumSizeClasses was changed), but we cannot change the Go struct because of backward compatibility.
+	memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
+
+	// Stack numbers are part of the heap numbers; separate those out for user consumption.
+	stats.stacks_sys = stats.stacks_inuse
+	stats.heap_inuse -= stats.stacks_inuse
+	stats.heap_sys -= stats.stacks_inuse
+}
+
+//go:linkname readGCStats runtime/debug.readGCStats
+func readGCStats(pauses *[]uint64) {
+	onM(func() {
+		readGCStats_m(pauses)
+	})
+}
+
+func readGCStats_m(pauses *[]uint64) {
+	p := *pauses
+	// Calling code in runtime/debug should make the slice large enough.
+	if cap(p) < len(memstats.pause_ns)+3 {
+		gothrow("runtime: short slice passed to readGCStats")
+	}
+
+	// Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns.
+	lock(&mheap_.lock)
+
+	n := memstats.numgc
+	if n > uint32(len(memstats.pause_ns)) {
+		n = uint32(len(memstats.pause_ns))
+	}
+
+	// The pause buffer is circular. The most recent pause is at
+	// pause_ns[(numgc-1)%len(pause_ns)], and earlier pauses follow
+	// backward (wrapping around) from there. We deliver the times
+	// most recent first (in p[0]).
+	p = p[:cap(p)]
+	for i := uint32(0); i < n; i++ {
+		j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns))
+		p[i] = memstats.pause_ns[j]
+		p[n+i] = memstats.pause_end[j]
+	}
+
+	p[n+n] = memstats.last_gc
+	p[n+n+1] = uint64(memstats.numgc)
+	p[n+n+2] = memstats.pause_total_ns
+	unlock(&mheap_.lock)
+	*pauses = p[:n+n+3]
+}
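
The slice handed back by readGCStats_m has a fixed layout: for n recorded pauses, p[0:n] holds pause durations (most recent first), p[n:2n] holds the matching end times, and the last three entries are last_gc, numgc and pause_total_ns. A small sketch of reading the most recent pause out of the circular buffer (hypothetical helper, not part of this CL):

	func mostRecentPause(pauseNS []uint64, numgc uint32) uint64 {
		if numgc == 0 {
			return 0 // no collections yet
		}
		return pauseNS[(numgc-1)%uint32(len(pauseNS))]
	}
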
+
+func setGCPercent(in int32) (out int32) {
+	lock(&mheap_.lock)
+	out = gcpercent
+	if in < 0 {
+		in = -1
+	}
+	gcpercent = in
+	unlock(&mheap_.lock)
+	return out
+}
+
+func gchelperstart() {
+	_g_ := getg()
+
+	if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
+		gothrow("gchelperstart: bad m->helpgc")
+	}
+	if _g_ != _g_.m.g0 {
+		gothrow("gchelper not running on g0 stack")
+	}
+}
+
+func wakefing() *g {
+	var res *g
+	lock(&finlock)
+	if fingwait && fingwake {
+		fingwait = false
+		fingwake = false
+		res = fing
+	}
+	unlock(&finlock)
+	return res
+}
+
+func addb(p *byte, n uintptr) *byte {
+	return (*byte)(add(unsafe.Pointer(p), n))
+}
+
+// Recursively unrolls the GC program in prog.
+// mask is where to store the result.
+// ppos is a pointer to the current position in mask, in bits.
+// sparse says to generate a 4-bit-per-word mask for the heap (2 bits per word for data/bss otherwise).
+func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
+	arena_start := mheap_.arena_start
+	pos := *ppos
+	mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
+	for {
+		switch *prog {
+		default:
+			gothrow("unrollgcprog: unknown instruction")
+
+		case insData:
+			prog = addb(prog, 1)
+			siz := int(*prog)
+			prog = addb(prog, 1)
+			p := (*[1 << 30]byte)(unsafe.Pointer(prog))
+			for i := 0; i < siz; i++ {
+				v := p[i/_PointersPerByte]
+				v >>= (uint(i) % _PointersPerByte) * _BitsPerPointer
+				v &= _BitsMask
+				if inplace {
+					// Store directly into GC bitmap.
+					off := (uintptr(unsafe.Pointer(&mask[pos])) - arena_start) / ptrSize
+					bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+					shift := (off % wordsPerBitmapByte) * gcBits
+					if shift == 0 {
+						*bitp = 0
+					}
+					*bitp |= v << (shift + 2)
+					pos += ptrSize
+				} else if sparse {
+					// 4-bits per word
+					v <<= (pos % 8) + 2
+					mask[pos/8] |= v
+					pos += gcBits
+				} else {
+					// 2-bits per word
+					v <<= pos % 8
+					mask[pos/8] |= v
+					pos += _BitsPerPointer
+				}
+			}
+			prog = addb(prog, round(uintptr(siz)*_BitsPerPointer, 8)/8)
+
+		case insArray:
+			prog = (*byte)(add(unsafe.Pointer(prog), 1))
+			siz := uintptr(0)
+			for i := uintptr(0); i < ptrSize; i++ {
+				siz = (siz << 8) + uintptr(*(*byte)(add(unsafe.Pointer(prog), ptrSize-i-1)))
+			}
+			prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
+			var prog1 *byte
+			for i := uintptr(0); i < siz; i++ {
+				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+			}
+			if *prog1 != insArrayEnd {
+				gothrow("unrollgcprog: array does not end with insArrayEnd")
+			}
+			prog = (*byte)(add(unsafe.Pointer(prog1), 1))
+
+		case insArrayEnd, insEnd:
+			*ppos = pos
+			return prog
+		}
+	}
+}
+
+// Unrolls the GC program prog for data/bss and returns a dense GC mask.
+func unrollglobgcprog(prog *byte, size uintptr) bitvector {
+	masksize := round(round(size, ptrSize)/ptrSize*bitsPerPointer, 8) / 8
+	mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
+	mask[masksize] = 0xa1
+	pos := uintptr(0)
+	prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
+	if pos != size/ptrSize*bitsPerPointer {
+		print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize*bitsPerPointer, "\n")
+		gothrow("unrollglobgcprog: bad program size")
+	}
+	if *prog != insEnd {
+		gothrow("unrollglobgcprog: program does not end with insEnd")
+	}
+	if mask[masksize] != 0xa1 {
+		gothrow("unrollglobgcprog: overflow")
+	}
+	return bitvector{int32(masksize * 8), &mask[0]}
+}
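
unrollglobgcprog produces a dense mask that packs two bits of pointer information per word. A sketch of reading word i's bits back out of such a mask, mirroring the data/bss path of getgcmask later in this file (the constants are the values this CL implies; the helper is illustrative):

	const (
		sketchBitsPerPointer  = 2
		sketchPointersPerByte = 4 // 8 bits per byte / 2 bits per word
		sketchBitsMask        = 1<<sketchBitsPerPointer - 1
	)

	func sketchDenseMaskBits(mask []byte, word uintptr) byte {
		b := mask[word/sketchPointersPerByte]
		return (b >> ((word % sketchPointersPerByte) * sketchBitsPerPointer)) & sketchBitsMask
	}
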
+
+func unrollgcproginplace_m() {
+	_g_ := getg()
+
+	v := _g_.m.ptrarg[0]
+	typ := (*_type)(_g_.m.ptrarg[1])
+	size := _g_.m.scalararg[0]
+	size0 := _g_.m.scalararg[1]
+	_g_.m.ptrarg[0] = nil
+	_g_.m.ptrarg[1] = nil
+
+	pos := uintptr(0)
+	prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+	for pos != size0 {
+		unrollgcprog1((*byte)(v), prog, &pos, true, true)
+	}
+
+	// Mark the first word as allocated (bitBoundary).
+	arena_start := mheap_.arena_start
+	off := (uintptr(v) - arena_start) / ptrSize
+	bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+	shift := (off % wordsPerBitmapByte) * gcBits
+	*bitp |= bitBoundary << shift
+
+	// Mark word after last as BitsDead.
+	if size0 < size {
+		off := (uintptr(v) + size0 - arena_start) / ptrSize
+		bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+		shift := (off % wordsPerBitmapByte) * gcBits
+		*bitp &= uint8(^(bitPtrMask << shift) | uintptr(bitsDead)<<(shift+2))
+	}
+}
+
+var unroll mutex
+
+// Unrolls the GC program in typ.gc[1] into typ.gc[0].
+func unrollgcprog_m() {
+	_g_ := getg()
+
+	typ := (*_type)(_g_.m.ptrarg[0])
+	_g_.m.ptrarg[0] = nil
+
+	lock(&unroll)
+	mask := (*byte)(unsafe.Pointer(uintptr(typ.gc[0])))
+	if *mask == 0 {
+		pos := uintptr(8) // skip the unroll flag
+		prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+		prog = unrollgcprog1(mask, prog, &pos, false, true)
+		if *prog != insEnd {
+			gothrow("unrollgcprog: program does not end with insEnd")
+		}
+		if typ.size/ptrSize%2 != 0 {
+			// repeat the program
+			prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+			unrollgcprog1(mask, prog, &pos, false, true)
+		}
+		// atomic way to say mask[0] = 1
+		x := *(*uintptr)(unsafe.Pointer(mask))
+		*(*byte)(unsafe.Pointer(&x)) = 1
+		atomicstoreuintptr((*uintptr)(unsafe.Pointer(mask)), x)
+	}
+	unlock(&unroll)
+}
+
+// markspan marks the span of memory at v as having n blocks of the given size.
+// If leftover is true, there is leftover space at the end of the span.
+func markspan(v unsafe.Pointer, size uintptr, n uintptr, leftover bool) {
+	if uintptr(v)+size*n > mheap_.arena_used || uintptr(v) < mheap_.arena_start {
+		gothrow("markspan: bad pointer")
+	}
+
+	// Find bits of the beginning of the span.
+	off := (uintptr(v) - uintptr(mheap_.arena_start)) / ptrSize
+	if off%wordsPerBitmapByte != 0 {
+		gothrow("markspan: unaligned length")
+	}
+	b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+
+	// Okay to use non-atomic ops here, because we control
+	// the entire span, and each bitmap byte has bits for only
+	// one span, so no other goroutines are changing these bitmap words.
+
+	if size == ptrSize {
+		// Possible only on 64-bit systems (the minimal size class is 8 bytes).
+		// Set memory to 0x11.
+		if (bitBoundary|bitsDead)<<gcBits|bitBoundary|bitsDead != 0x11 {
+			gothrow("markspan: bad bits")
+		}
+		if n%(wordsPerBitmapByte*ptrSize) != 0 {
+			gothrow("markspan: unaligned length")
+		}
+		b = b - n/wordsPerBitmapByte + 1 // find first byte
+		if b%ptrSize != 0 {
+			gothrow("markspan: unaligned pointer")
+		}
+		for i := uintptr(0); i < n; i, b = i+wordsPerBitmapByte*ptrSize, b+ptrSize {
+			*(*uintptr)(unsafe.Pointer(b)) = uintptrMask & 0x1111111111111111 // bitBoundary | bitsDead, repeated
+		}
+		return
+	}
+
+	if leftover {
+		n++ // mark a boundary just past the end of the last block too
+	}
+	step := size / (ptrSize * wordsPerBitmapByte)
+	for i := uintptr(0); i < n; i, b = i+1, b-step {
+		*(*byte)(unsafe.Pointer(b)) = bitBoundary | bitsDead<<2
+	}
+}
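
The 0x11 pattern that markspan writes for 8-byte size classes follows from the bit layout: each word's four bitmap bits are bitBoundary (1) plus type bits BitsDead (0), and one bitmap byte covers two words. A sketch, assuming those constant values:

	func sketchEightByteBitmapByte() byte {
		const bitBoundary, bitsDead, gcBits = 1, 0, 4 // assumed values for this layout
		per := byte(bitBoundary | bitsDead<<2)        // bits for one 8-byte object
		return per<<gcBits | per                      // two objects per byte -> 0x11
	}
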
+
+// unmarkspan unmarks the span of memory at v of length n bytes.
+func unmarkspan(v, n uintptr) {
+	if v+n > mheap_.arena_used || v < mheap_.arena_start {
+		gothrow("markspan: bad pointer")
+	}
+
+	off := (v - mheap_.arena_start) / ptrSize // word offset
+	if off%(ptrSize*wordsPerBitmapByte) != 0 {
+		gothrow("markspan: unaligned pointer")
+	}
+
+	b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+	n /= ptrSize
+	if n%(ptrSize*wordsPerBitmapByte) != 0 {
+		gothrow("unmarkspan: unaligned length")
+	}
+
+	// Okay to use non-atomic ops here, because we control
+	// the entire span, and each bitmap word has bits for only
+	// one span, so no other goroutines are changing these
+	// bitmap words.
+	n /= wordsPerBitmapByte
+	memclr(unsafe.Pointer(b-n+1), n)
+}
+
+func mHeap_MapBits(h *mheap) {
+	// Caller has added extra mappings to the arena.
+	// Add extra mappings of bitmap words as needed.
+	// We allocate extra bitmap pieces in chunks of bitmapChunk.
+	const bitmapChunk = 8192
+
+	n := (h.arena_used - h.arena_start) / (ptrSize * wordsPerBitmapByte)
+	n = round(n, bitmapChunk)
+	n = round(n, _PhysPageSize)
+	if h.bitmap_mapped >= n {
+		return
+	}
+
+	sysMap(unsafe.Pointer(h.arena_start-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
+	h.bitmap_mapped = n
+}
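
mHeap_MapBits sizes the bitmap as one byte per ptrSize*wordsPerBitmapByte bytes of arena, i.e. 1/16 of the arena on 64-bit, then rounds up to bitmapChunk and the physical page size. The raw ratio, as a sketch under those assumptions:

	func sketchBitmapBytes(arenaBytes uintptr) uintptr {
		const ptrSize, wordsPerBitmapByte = 8, 2           // assumed 64-bit
		return arenaBytes / (ptrSize * wordsPerBitmapByte) // one bitmap byte per 16 arena bytes
	}
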
+
+func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
+	target := (*stkframe)(ctxt)
+	if frame.sp <= target.sp && target.sp < frame.varp {
+		*target = *frame
+		return false
+	}
+	return true
+}
+
+// Returns GC type info for object p for testing.
+func getgcmask(p unsafe.Pointer, t *_type, mask **byte, len *uintptr) {
+	*mask = nil
+	*len = 0
+
+	// data
+	if uintptr(unsafe.Pointer(&data)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&edata)) {
+		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+		*len = n / ptrSize
+		*mask = &make([]byte, *len)[0]
+		for i := uintptr(0); i < n; i += ptrSize {
+			off := (uintptr(p) + i - uintptr(unsafe.Pointer(&data))) / ptrSize
+			bits := (*(*byte)(add(unsafe.Pointer(gcdatamask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask
+			*(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+		}
+		return
+	}
+
+	// bss
+	if uintptr(unsafe.Pointer(&bss)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&ebss)) {
+		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+		*len = n / ptrSize
+		*mask = &make([]byte, *len)[0]
+		for i := uintptr(0); i < n; i += ptrSize {
+			off := (uintptr(p) + i - uintptr(unsafe.Pointer(&bss))) / ptrSize
+			bits := (*(*byte)(add(unsafe.Pointer(gcbssmask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask
+			*(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+		}
+		return
+	}
+
+	// heap
+	var n uintptr
+	var base uintptr
+	if mlookup(uintptr(p), &base, &n, nil) != 0 {
+		*len = n / ptrSize
+		*mask = &make([]byte, *len)[0]
+		for i := uintptr(0); i < n; i += ptrSize {
+			off := (uintptr(base) + i - mheap_.arena_start) / ptrSize
+			b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+			shift := (off % wordsPerBitmapByte) * gcBits
+			bits := (*(*byte)(unsafe.Pointer(b)) >> (shift + 2)) & bitsMask
+			*(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+		}
+		return
+	}
+
+	// stack
+	var frame stkframe
+	frame.sp = uintptr(p)
+	_g_ := getg()
+	gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
+	if frame.fn != nil {
+		f := frame.fn
+		targetpc := frame.continpc
+		if targetpc == 0 {
+			return
+		}
+		if targetpc != f.entry {
+			targetpc--
+		}
+		pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+		if pcdata == -1 {
+			return
+		}
+		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+		if stkmap == nil || stkmap.n <= 0 {
+			return
+		}
+		bv := stackmapdata(stkmap, pcdata)
+		size := uintptr(bv.n) / bitsPerPointer * ptrSize
+		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+		*len = n / ptrSize
+		*mask = &make([]byte, *len)[0]
+		for i := uintptr(0); i < n; i += ptrSize {
+			off := (uintptr(p) + i - frame.varp + size) / ptrSize
+			bits := ((*(*byte)(add(unsafe.Pointer(bv.bytedata), off*bitsPerPointer/8))) >> ((off * bitsPerPointer) % 8)) & bitsMask
+			*(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+		}
+	}
+}
+
+func unixnanotime() int64 {
+	var now int64
+	gc_unixnanotime(&now)
+	return now
+}
diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c
deleted file mode 100644
index 7754bad..0000000
--- a/src/runtime/mgc0.c
+++ /dev/null
@@ -1,2010 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Garbage collector (GC).
-//
-// GC is:
-// - mark&sweep
-// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
-// - parallel (up to MaxGcproc threads)
-// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
-// - non-moving/non-compacting
-// - full (non-partial)
-//
-// GC rate.
-// Next GC is after we've allocated an extra amount of memory proportional to
-// the amount already in use. The proportion is controlled by GOGC environment variable
-// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
-// (and also the amount of extra memory used).
-//
-// Concurrent sweep.
-// The sweep phase proceeds concurrently with normal program execution.
-// The heap is swept span-by-span both lazily (when a goroutine needs another span)
-// and concurrently in a background goroutine (this helps programs that are not CPU bound).
-// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
-// and so next_gc calculation is tricky and happens as follows.
-// At the end of the stop-the-world phase next_gc is conservatively set based on total
-// heap size; all spans are marked as "needs sweeping".
-// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
-// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
-// closer to the target value. However, this is not enough to avoid over-allocating memory.
-// Consider that a goroutine wants to allocate a new span for a large object and
-// there are no free swept spans, but there are small-object unswept spans.
-// If the goroutine naively allocates a new span, it can surpass the yet-unknown
-// target next_gc value. In order to prevent such cases (1) when a goroutine needs
-// to allocate a new small-object span, it sweeps small-object spans for the same
-// object size until it frees at least one object; (2) when a goroutine needs to
-// allocate large-object span from heap, it sweeps spans until it frees at least
-// that many pages into heap. Together these two measures ensure that we don't surpass
-// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
-// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
-// but there can still be other one-page unswept spans which could be combined into a two-page span.
-// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
-// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
-// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
-// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
-// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
-// The finalizer goroutine is kicked off only when all spans are swept.
-// When the next GC starts, it sweeps all not-yet-swept spans (if any).
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-#include "stack.h"
-#include "mgc0.h"
-#include "chan.h"
-#include "race.h"
-#include "type.h"
-#include "typekind.h"
-#include "funcdata.h"
-#include "textflag.h"
-
-enum {
-	Debug		= 0,
-	DebugPtrs	= 0, // if 1, print trace of every pointer load during GC
-	ConcurrentSweep	= 1,
-
-	WorkbufSize	= 4*1024,
-	FinBlockSize	= 4*1024,
-	RootData	= 0,
-	RootBss		= 1,
-	RootFinalizers	= 2,
-	RootSpans	= 3,
-	RootFlushCaches = 4,
-	RootCount	= 5,
-};
-
-// ptrmask for an allocation containing a single pointer.
-static byte oneptr[] = {BitsPointer};
-
-// Initialized from $GOGC.  GOGC=off means no gc.
-extern int32 runtime·gcpercent;
-
-// Holding worldsema grants an M the right to try to stop the world.
-// The procedure is:
-//
-//	runtime·semacquire(&runtime·worldsema);
-//	m->gcing = 1;
-//	runtime·stoptheworld();
-//
-//	... do stuff ...
-//
-//	m->gcing = 0;
-//	runtime·semrelease(&runtime·worldsema);
-//	runtime·starttheworld();
-//
-uint32 runtime·worldsema = 1;
-
-typedef struct Workbuf Workbuf;
-struct Workbuf
-{
-	LFNode	node; // must be first
-	uintptr	nobj;
-	byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
-};
-
-extern byte runtime·data[];
-extern byte runtime·edata[];
-extern byte runtime·bss[];
-extern byte runtime·ebss[];
-
-extern byte runtime·gcdata[];
-extern byte runtime·gcbss[];
-
-Mutex	runtime·finlock;	// protects the following variables
-G*	runtime·fing;		// goroutine that runs finalizers
-FinBlock*	runtime·finq;	// list of finalizers that are to be executed
-FinBlock*	runtime·finc;	// cache of free blocks
-static byte finptrmask[FinBlockSize/PtrSize/PointersPerByte];
-bool	runtime·fingwait;
-bool	runtime·fingwake;
-FinBlock	*runtime·allfin;	// list of all blocks
-
-BitVector	runtime·gcdatamask;
-BitVector	runtime·gcbssmask;
-
-Mutex	runtime·gclock;
-
-static	uintptr	badblock[1024];
-static	int32	nbadblock;
-
-static Workbuf* getempty(Workbuf*);
-static Workbuf* getfull(Workbuf*);
-static void	putempty(Workbuf*);
-static Workbuf* handoff(Workbuf*);
-static void	gchelperstart(void);
-static void	flushallmcaches(void);
-static bool	scanframe(Stkframe *frame, void *unused);
-static void	scanstack(G *gp);
-static BitVector	unrollglobgcprog(byte *prog, uintptr size);
-
-void runtime·bgsweep(void);
-static FuncVal bgsweepv = {runtime·bgsweep};
-
-typedef struct WorkData WorkData;
-struct WorkData {
-	uint64	full;  // lock-free list of full blocks
-	uint64	empty; // lock-free list of empty blocks
-	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
-	uint32	nproc;
-	int64	tstart;
-	volatile uint32	nwait;
-	volatile uint32	ndone;
-	Note	alldone;
-	ParFor*	markfor;
-
-	// Copy of mheap.allspans for marker or sweeper.
-	MSpan**	spans;
-	uint32	nspan;
-};
-WorkData runtime·work;
-
-// Is _cgo_allocate linked into the binary?
-static bool
-have_cgo_allocate(void)
-{
-	extern	byte	go·weak·runtime·_cgo_allocate_internal[1];
-	return go·weak·runtime·_cgo_allocate_internal != nil;
-}
-
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left.  Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body.  Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
-static void
-scanblock(byte *b, uintptr n, byte *ptrmask)
-{
-	byte *obj, *obj0, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp;
-	uintptr i, j, nobj, size, idx, x, off, scanbufpos, bits, xbits, shift;
-	Workbuf *wbuf;
-	Iface *iface;
-	Eface *eface;
-	Type *typ;
-	MSpan *s;
-	pageID k;
-	bool keepworking;
-
-	// Cache memory arena parameters in local vars.
-	arena_start = runtime·mheap.arena_start;
-	arena_used = runtime·mheap.arena_used;
-
-	wbuf = getempty(nil);
-	nobj = wbuf->nobj;
-	wp = &wbuf->obj[nobj];
-	keepworking = b == nil;
-	scanbufpos = 0;
-	for(i = 0; i < nelem(scanbuf); i++)
-		scanbuf[i] = nil;
-
-	ptrbitp = nil;
-
-	// ptrmask can have 2 possible values:
-	// 1. nil - obtain pointer mask from GC bitmap.
-	// 2. pointer to a compact mask (for stacks and data).
-	if(b != nil)
-		goto scanobj;
-	for(;;) {
-		if(nobj == 0) {
-			// Out of work in workbuf.
-			// First, see is there is any work in scanbuf.
-			for(i = 0; i < nelem(scanbuf); i++) {
-				b = scanbuf[scanbufpos];
-				scanbuf[scanbufpos++] = nil;
-				scanbufpos %= nelem(scanbuf);
-				if(b != nil) {
-					n = arena_used - b; // scan until bitBoundary or BitsDead
-					ptrmask = nil; // use GC bitmap for pointer info
-					goto scanobj;
-				}
-			}
-			if(!keepworking) {
-				putempty(wbuf);
-				return;
-			}
-			// Refill workbuf from global queue.
-			wbuf = getfull(wbuf);
-			if(wbuf == nil)
-				return;
-			nobj = wbuf->nobj;
-			wp = &wbuf->obj[nobj];
-		}
-
-		// If another proc wants a pointer, give it some.
-		if(runtime·work.nwait > 0 && nobj > 4 && runtime·work.full == 0) {
-			wbuf->nobj = nobj;
-			wbuf = handoff(wbuf);
-			nobj = wbuf->nobj;
-			wp = &wbuf->obj[nobj];
-		}
-
-		wp--;
-		nobj--;
-		b = *wp;
-		n = arena_used - b; // scan until next bitBoundary or BitsDead
-		ptrmask = nil; // use GC bitmap for pointer info
-
-	scanobj:
-		if(DebugPtrs)
-			runtime·printf("scanblock %p +%p %p\n", b, n, ptrmask);
-		// Find bits of the beginning of the object.
-		if(ptrmask == nil) {
-			off = (uintptr*)b - (uintptr*)arena_start;
-			ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
-		}
-		for(i = 0; i < n; i += PtrSize) {
-			obj = nil;
-			// Find bits for this word.
-			if(ptrmask == nil) {
-				// Check is we have reached end of span.
-				if((((uintptr)b+i)%PageSize) == 0 &&
-					runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift])
-					break;
-				// Consult GC bitmap.
-				bits = *ptrbitp;
-
-				if(wordsPerBitmapByte != 2)
-					runtime·throw("alg doesn't work for wordsPerBitmapByte != 2");
-				j = ((uintptr)b+i)/PtrSize & 1;
-				ptrbitp -= j;
-				bits >>= gcBits*j;
-
-				if((bits&bitBoundary) != 0 && i != 0)
-					break; // reached beginning of the next object
-				bits = (bits>>2)&BitsMask;
-				if(bits == BitsDead)
-					break; // reached no-scan part of the object
-			} else // dense mask (stack or data)
-				bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask;
-
-			if(bits <= BitsScalar) // BitsScalar || BitsDead
-				continue;
-			if(bits == BitsPointer) {
-				obj = *(byte**)(b+i);
-				obj0 = obj;
-				goto markobj;
-			}
-
-			// With those three out of the way, must be multi-word.
-			if(Debug && bits != BitsMultiWord)
-				runtime·throw("unexpected garbage collection bits");
-			// Find the next pair of bits.
-			if(ptrmask == nil) {
-				bits = *ptrbitp;
-				j = ((uintptr)b+i+PtrSize)/PtrSize & 1;
-				ptrbitp -= j;
-				bits >>= gcBits*j;
-				bits = (bits>>2)&BitsMask;
-			} else
-				bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask;
-
-			if(Debug && bits != BitsIface && bits != BitsEface)
-				runtime·throw("unexpected garbage collection bits");
-
-			if(bits == BitsIface) {
-				iface = (Iface*)(b+i);
-				if(iface->tab != nil) {
-					typ = iface->tab->type;
-					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-						obj = iface->data;
-				}
-			} else {
-				eface = (Eface*)(b+i);
-				typ = eface->type;
-				if(typ != nil) {
-					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-						obj = eface->data;
-				}
-			}
-
-			i += PtrSize;
-
-			obj0 = obj;
-		markobj:
-			// At this point we have extracted the next potential pointer.
-			// Check if it points into heap.
-			if(obj == nil)
-				continue;
-			if(obj < arena_start || obj >= arena_used) {
-				if((uintptr)obj < PhysPageSize && runtime·invalidptr) {
-					s = nil;
-					goto badobj;
-				}
-				continue;
-			}
-			// Mark the object.
-			obj = (byte*)((uintptr)obj & ~(PtrSize-1));
-			off = (uintptr*)obj - (uintptr*)arena_start;
-			bitp = arena_start - off/wordsPerBitmapByte - 1;
-			shift = (off % wordsPerBitmapByte) * gcBits;
-			xbits = *bitp;
-			bits = (xbits >> shift) & bitMask;
-			if((bits&bitBoundary) == 0) {
-				// Not a beginning of a block, consult span table to find the block beginning.
-				k = (uintptr)obj>>PageShift;
-				x = k;
-				x -= (uintptr)arena_start>>PageShift;
-				s = runtime·mheap.spans[x];
-				if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) {
-					// Stack pointers lie within the arena bounds but are not part of the GC heap.
-					// Ignore them.
-					if(s != nil && s->state == MSpanStack)
-						continue;
-				
-				badobj:
-					// If cgo_allocate is linked into the binary, it can allocate
-					// memory as []unsafe.Pointer that may not contain actual
-					// pointers and must be scanned conservatively.
-					// In this case alone, allow the bad pointer.
-					if(have_cgo_allocate() && ptrmask == nil)
-						continue;
-
-					// Anything else indicates a bug somewhere.
-					// If we're in the middle of chasing down a different bad pointer,
-					// don't confuse the trace by printing about this one.
-					if(nbadblock > 0)
-						continue;
-
-					runtime·printf("runtime: garbage collector found invalid heap pointer *(%p+%p)=%p", b, i, obj);
-					if(s == nil)
-						runtime·printf(" s=nil\n");
-					else
-						runtime·printf(" span=%p-%p-%p state=%d\n", (uintptr)s->start<<PageShift, s->limit, (uintptr)(s->start+s->npages)<<PageShift, s->state);
-					if(ptrmask != nil)
-						runtime·throw("invalid heap pointer");
-					// Add to badblock list, which will cause the garbage collection
-					// to keep repeating until it has traced the chain of pointers
-					// leading to obj all the way back to a root.
-					if(nbadblock == 0)
-						badblock[nbadblock++] = (uintptr)b;
-					continue;
-				}
-				p = (byte*)((uintptr)s->start<<PageShift);
-				if(s->sizeclass != 0) {
-					size = s->elemsize;
-					idx = ((byte*)obj - p)/size;
-					p = p+idx*size;
-				}
-				if(p == obj) {
-					runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
-						p, s->start*PageSize, s->limit);
-					runtime·throw("failed to find block beginning");
-				}
-				obj = p;
-				goto markobj;
-			}
-			if(DebugPtrs)
-				runtime·printf("scan *%p = %p => base %p\n", b+i, obj0, obj);
-
-			if(nbadblock > 0 && (uintptr)obj == badblock[nbadblock-1]) {
-				// Running garbage collection again because
-				// we want to find the path from a root to a bad pointer.
-				// Found possible next step; extend or finish path.
-				for(j=0; j<nbadblock; j++)
-					if(badblock[j] == (uintptr)b)
-						goto AlreadyBad;
-				runtime·printf("runtime: found *(%p+%p) = %p+%p\n", b, i, obj0, (uintptr)(obj-obj0));
-				if(ptrmask != nil)
-					runtime·throw("bad pointer");
-				if(nbadblock >= nelem(badblock))
-					runtime·throw("badblock trace too long");
-				badblock[nbadblock++] = (uintptr)b;
-			AlreadyBad:;
-			}
-
-			// Now we have bits, bitp, and shift correct for
-			// obj pointing at the base of the object.
-			// Only care about not marked objects.
-			if((bits&bitMarked) != 0)
-				continue;
-			// If obj size is greater than 8, then each byte of GC bitmap
-			// contains info for at most one object. In such case we use
-			// non-atomic byte store to mark the object. This can lead
-			// to double enqueue of the object for scanning, but scanning
-			// is an idempotent operation, so it is OK. This cannot lead
-			// to bitmap corruption because the single marked bit is the
-			// only thing that can change in the byte.
-			// For 8-byte objects we use non-atomic store, if the other
-			// quadruple is already marked. Otherwise we resort to CAS
-			// loop for marking.
-			if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
-				runtime·work.nproc == 1)
-				*bitp = xbits | (bitMarked<<shift);
-			else
-				runtime·atomicor8(bitp, bitMarked<<shift);
-
-			if(((xbits>>(shift+2))&BitsMask) == BitsDead)
-				continue;  // noscan object
-
-			// Queue the obj for scanning.
-			PREFETCH(obj);
-			p = scanbuf[scanbufpos];
-			scanbuf[scanbufpos++] = obj;
-			scanbufpos %= nelem(scanbuf);
-			if(p == nil)
-				continue;
-
-			// If workbuf is full, obtain an empty one.
-			if(nobj >= nelem(wbuf->obj)) {
-				wbuf->nobj = nobj;
-				wbuf = getempty(wbuf);
-				nobj = wbuf->nobj;
-				wp = &wbuf->obj[nobj];
-			}
-			*wp = p;
-			wp++;
-			nobj++;
-		}
-		if(DebugPtrs)
-			runtime·printf("end scanblock %p +%p %p\n", b, n, ptrmask);
-
-		if(Debug && ptrmask == nil) {
-			// For heap objects ensure that we did not overscan.
-			n = 0;
-			p = nil;
-			if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) {
-				runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n);
-				runtime·throw("scanblock: scanned invalid object");
-			}
-		}
-	}
-}
-
-static void
-markroot(ParFor *desc, uint32 i)
-{
-	FinBlock *fb;
-	MSpan *s;
-	uint32 spanidx, sg;
-	G *gp;
-	void *p;
-	uint32 status;
-	bool restart;
-
-	USED(&desc);
-	// Note: if you add a case here, please also update heapdump.c:dumproots.
-	switch(i) {
-	case RootData:
-		scanblock(runtime·data, runtime·edata - runtime·data, runtime·gcdatamask.bytedata);
-		break;
-
-	case RootBss:
-		scanblock(runtime·bss, runtime·ebss - runtime·bss, runtime·gcbssmask.bytedata);
-		break;
-
-	case RootFinalizers:
-		for(fb=runtime·allfin; fb; fb=fb->alllink)
-			scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), finptrmask);
-		break;
-
-	case RootSpans:
-		// mark MSpan.specials
-		sg = runtime·mheap.sweepgen;
-		for(spanidx=0; spanidx<runtime·work.nspan; spanidx++) {
-			Special *sp;
-			SpecialFinalizer *spf;
-
-			s = runtime·work.spans[spanidx];
-			if(s->state != MSpanInUse)
-				continue;
-			if(s->sweepgen != sg) {
-				runtime·printf("sweep %d %d\n", s->sweepgen, sg);
-				runtime·throw("gc: unswept span");
-			}
-			for(sp = s->specials; sp != nil; sp = sp->next) {
-				if(sp->kind != KindSpecialFinalizer)
-					continue;
-				// don't mark finalized object, but scan it so we
-				// retain everything it points to.
-				spf = (SpecialFinalizer*)sp;
-				// A finalizer can be set for an inner byte of an object, find object beginning.
-				p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize);
-				scanblock(p, s->elemsize, nil);
-				scanblock((void*)&spf->fn, PtrSize, oneptr);
-			}
-		}
-		break;
-
-	case RootFlushCaches:
-		flushallmcaches();
-		break;
-
-	default:
-		// the rest is scanning goroutine stacks
-		if(i - RootCount >= runtime·allglen)
-			runtime·throw("markroot: bad index");
-		gp = runtime·allg[i - RootCount];
-		// remember when we've first observed the G blocked
-		// needed only to output in traceback
-		status = runtime·readgstatus(gp);
-		if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0)
-			gp->waitsince = runtime·work.tstart;
-		// Shrink a stack if not much of it is being used.
-		runtime·shrinkstack(gp);
-		if(runtime·readgstatus(gp) == Gdead) 
-			gp->gcworkdone = true;
-		else 
-			gp->gcworkdone = false; 
-		restart = runtime·stopg(gp);
-		scanstack(gp);
-		if(restart)
-			runtime·restartg(gp);
-		break;
-	}
-}
-
-// Get an empty work buffer off the work.empty list,
-// allocating new buffers as needed.
-static Workbuf*
-getempty(Workbuf *b)
-{
-	MCache *c;
-
-	if(b != nil)
-		runtime·lfstackpush(&runtime·work.full, &b->node);
-	b = nil;
-	c = g->m->mcache;
-	if(c->gcworkbuf != nil) {
-		b = c->gcworkbuf;
-		c->gcworkbuf = nil;
-	}
-	if(b == nil)
-		b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty);
-	if(b == nil)
-		b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys);
-	b->nobj = 0;
-	return b;
-}
-
-static void
-putempty(Workbuf *b)
-{
-	MCache *c;
-
-	c = g->m->mcache;
-	if(c->gcworkbuf == nil) {
-		c->gcworkbuf = b;
-		return;
-	}
-	runtime·lfstackpush(&runtime·work.empty, &b->node);
-}
-
-void
-runtime·gcworkbuffree(void *b)
-{
-	if(b != nil)
-		putempty(b);
-}
-
-// Get a full work buffer off the work.full list, or return nil.
-static Workbuf*
-getfull(Workbuf *b)
-{
-	int32 i;
-
-	if(b != nil)
-		runtime·lfstackpush(&runtime·work.empty, &b->node);
-	b = (Workbuf*)runtime·lfstackpop(&runtime·work.full);
-	if(b != nil || runtime·work.nproc == 1)
-		return b;
-
-	runtime·xadd(&runtime·work.nwait, +1);
-	for(i=0;; i++) {
-		if(runtime·work.full != 0) {
-			runtime·xadd(&runtime·work.nwait, -1);
-			b = (Workbuf*)runtime·lfstackpop(&runtime·work.full);
-			if(b != nil)
-				return b;
-			runtime·xadd(&runtime·work.nwait, +1);
-		}
-		if(runtime·work.nwait == runtime·work.nproc)
-			return nil;
-		if(i < 10) {
-			g->m->gcstats.nprocyield++;
-			runtime·procyield(20);
-		} else if(i < 20) {
-			g->m->gcstats.nosyield++;
-			runtime·osyield();
-		} else {
-			g->m->gcstats.nsleep++;
-			runtime·usleep(100);
-		}
-	}
-}
-
-static Workbuf*
-handoff(Workbuf *b)
-{
-	int32 n;
-	Workbuf *b1;
-
-	// Make new buffer with half of b's pointers.
-	b1 = getempty(nil);
-	n = b->nobj/2;
-	b->nobj -= n;
-	b1->nobj = n;
-	runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-	g->m->gcstats.nhandoff++;
-	g->m->gcstats.nhandoffcnt += n;
-
-	// Put b on full list - let first half of b get stolen.
-	runtime·lfstackpush(&runtime·work.full, &b->node);
-	return b1;
-}
-
-BitVector
-runtime·stackmapdata(StackMap *stackmap, int32 n)
-{
-	if(n < 0 || n >= stackmap->n)
-		runtime·throw("stackmapdata: index out of range");
-	return (BitVector){stackmap->nbit, stackmap->bytedata + n*((stackmap->nbit+31)/32*4)};
-}
-
-// Scan a stack frame: local variables and function arguments/results.
-static bool
-scanframe(Stkframe *frame, void *unused)
-{
-	Func *f;
-	StackMap *stackmap;
-	BitVector bv;
-	uintptr size, minsize;
-	uintptr targetpc;
-	int32 pcdata;
-
-	USED(unused);
-	f = frame->fn;
-	targetpc = frame->continpc;
-	if(targetpc == 0) {
-		// Frame is dead.
-		return true;
-	}
-	if(Debug > 1)
-		runtime·printf("scanframe %s\n", runtime·funcname(f));
-	if(targetpc != f->entry)
-		targetpc--;
-	pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
-	if(pcdata == -1) {
-		// We do not have a valid pcdata value but there might be a
-		// stackmap for this function.  It is likely that we are looking
-		// at the function prologue, assume so and hope for the best.
-		pcdata = 0;
-	}
-
-	// Scan local variables if stack frame has been allocated.
-	size = frame->varp - frame->sp;
-	if(thechar != '6' && thechar != '8')
-		minsize = sizeof(uintptr);
-	else
-		minsize = 0;
-	if(size > minsize) {
-		stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
-		if(stackmap == nil || stackmap->n <= 0) {
-			runtime·printf("runtime: frame %s untyped locals %p+%p\n", runtime·funcname(f), (byte*)(frame->varp-size), size);
-			runtime·throw("missing stackmap");
-		}
-
-		// Locals bitmap information, scan just the pointers in locals.
-		if(pcdata < 0 || pcdata >= stackmap->n) {
-			// don't know where we are
-			runtime·printf("runtime: pcdata is %d and %d locals stack map entries for %s (targetpc=%p)\n",
-				pcdata, stackmap->n, runtime·funcname(f), targetpc);
-			runtime·throw("scanframe: bad symbol table");
-		}
-		bv = runtime·stackmapdata(stackmap, pcdata);
-		size = (bv.n * PtrSize) / BitsPerPointer;
-		scanblock((byte*)(frame->varp - size), bv.n/BitsPerPointer*PtrSize, bv.bytedata);
-	}
-
-	// Scan arguments.
-	if(frame->arglen > 0) {
-		if(frame->argmap != nil)
-			bv = *frame->argmap;
-		else {
-			stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps);
-			if(stackmap == nil || stackmap->n <= 0) {
-				runtime·printf("runtime: frame %s untyped args %p+%p\n", runtime·funcname(f), frame->argp, (uintptr)frame->arglen);
-				runtime·throw("missing stackmap");
-			}
-			if(pcdata < 0 || pcdata >= stackmap->n) {
-				// don't know where we are
-				runtime·printf("runtime: pcdata is %d and %d args stack map entries for %s (targetpc=%p)\n",
-					pcdata, stackmap->n, runtime·funcname(f), targetpc);
-				runtime·throw("scanframe: bad symbol table");
-			}
- 			bv = runtime·stackmapdata(stackmap, pcdata);
-		}
- 		scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata);
- 	}
- 	return true;
-}
-
-static void
-scanstack(G *gp)
-{
-	M *mp;
-	bool (*fn)(Stkframe*, void*);
-
-	if(runtime·readgstatus(gp)&Gscan == 0) {
-		runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp));
-		runtime·throw("mark - bad status");
-	}
-
-	switch(runtime·readgstatus(gp)&~Gscan) {
-	default:
-		runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp));
-		runtime·throw("mark - bad status");
-	case Gdead:
-		return;
-	case Grunning:
-		runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp));
-		runtime·throw("mark - world not stopped");
-	case Grunnable:
-	case Gsyscall:
-	case Gwaiting:
-		break;
-	}
-
-	if(gp == g)
-		runtime·throw("can't scan our own stack");
-	if((mp = gp->m) != nil && mp->helpgc)
-		runtime·throw("can't scan gchelper stack");
-
-	fn = scanframe;
-	runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, &fn, nil, 0);
-	runtime·tracebackdefers(gp, &fn, nil);
-}
-
-// The gp has been moved to a gc safepoint. If there is gcphase specific
-// work it is done here. 
-void
-runtime·gcphasework(G *gp)
-{
-	switch(runtime·gcphase) {
-	default:
-		runtime·throw("gcphasework in bad gcphase");
-	case GCoff:
-	case GCquiesce:
-	case GCstw:
-	case GCsweep:
-		// No work for now.
-		break;
-	case GCmark:
-		// Disabled until concurrent GC is implemented
-		// but indicate the scan has been done. 
-		// scanstack(gp);
-		break;
-	}
-	gp->gcworkdone = true;
-}
-
-#pragma dataflag NOPTR
-static byte finalizer1[] = {
-	// Each Finalizer is 5 words, ptr ptr uintptr ptr ptr.
-	// Each byte describes 4 words.
-	// Need 4 Finalizers described by 5 bytes before pattern repeats:
-	//	ptr ptr uintptr ptr ptr
-	//	ptr ptr uintptr ptr ptr
-	//	ptr ptr uintptr ptr ptr
-	//	ptr ptr uintptr ptr ptr
-	// aka
-	//	ptr ptr uintptr ptr
-	//	ptr ptr ptr uintptr
-	//	ptr ptr ptr ptr
-	//	uintptr ptr ptr ptr
-	//	ptr uintptr ptr ptr
-	// Assumptions about Finalizer layout checked below.
-	BitsPointer | BitsPointer<<2 | BitsScalar<<4 | BitsPointer<<6,
-	BitsPointer | BitsPointer<<2 | BitsPointer<<4 | BitsScalar<<6,
-	BitsPointer | BitsPointer<<2 | BitsPointer<<4 | BitsPointer<<6,
-	BitsScalar | BitsPointer<<2 | BitsPointer<<4 | BitsPointer<<6,
-	BitsPointer | BitsScalar<<2 | BitsPointer<<4 | BitsPointer<<6,
-};
-
-void
-runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot)
-{
-	FinBlock *block;
-	Finalizer *f;
-	int32 i;
-
-	runtime·lock(&runtime·finlock);
-	if(runtime·finq == nil || runtime·finq->cnt == runtime·finq->cap) {
-		if(runtime·finc == nil) {
-			runtime·finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
-			runtime·finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
-			runtime·finc->alllink = runtime·allfin;
-			runtime·allfin = runtime·finc;
-			if(finptrmask[0] == 0) {
-				// Build pointer mask for Finalizer array in block.
-				// Check assumptions made in finalizer1 array above.
-				if(sizeof(Finalizer) != 5*PtrSize ||
-					offsetof(Finalizer, fn) != 0 ||
-					offsetof(Finalizer, arg) != PtrSize ||
-					offsetof(Finalizer, nret) != 2*PtrSize ||
-					offsetof(Finalizer, fint) != 3*PtrSize ||
-					offsetof(Finalizer, ot) != 4*PtrSize ||
-					BitsPerPointer != 2) {
-					runtime·throw("finalizer out of sync");
-				}
-				for(i=0; i<nelem(finptrmask); i++)
-					finptrmask[i] = finalizer1[i%nelem(finalizer1)];
-			}
-		}
-		block = runtime·finc;
-		runtime·finc = block->next;
-		block->next = runtime·finq;
-		runtime·finq = block;
-	}
-	f = &runtime·finq->fin[runtime·finq->cnt];
-	runtime·finq->cnt++;
-	f->fn = fn;
-	f->nret = nret;
-	f->fint = fint;
-	f->ot = ot;
-	f->arg = p;
-	runtime·fingwake = true;
-	runtime·unlock(&runtime·finlock);
-}
-
-void
-runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*))
-{
-	FinBlock *fb;
-	Finalizer *f;
-	uintptr i;
-
-	for(fb = runtime·allfin; fb; fb = fb->alllink) {
-		for(i = 0; i < fb->cnt; i++) {
-			f = &fb->fin[i];
-			callback(f->fn, f->arg, f->nret, f->fint, f->ot);
-		}
-	}
-}
-
-void
-runtime·MSpan_EnsureSwept(MSpan *s)
-{
-	uint32 sg;
-
-	// Caller must disable preemption.
-	// Otherwise when this function returns the span can become unswept again
-	// (if GC is triggered on another goroutine).
-	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
-		runtime·throw("MSpan_EnsureSwept: m is not locked");
-
-	sg = runtime·mheap.sweepgen;
-	if(runtime·atomicload(&s->sweepgen) == sg)
-		return;
-	if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
-		runtime·MSpan_Sweep(s, false);
-		return;
-	}
-	// unfortunate condition, and we don't have efficient means to wait
-	while(runtime·atomicload(&s->sweepgen) != sg)
-		runtime·osyield();
-}
-
-// Sweep frees or collects finalizers for blocks not marked in the mark phase.
-// It clears the mark bits in preparation for the next GC round.
-// Returns true if the span was returned to heap.
-// If preserve=true, don't return it to heap nor relink in MCentral lists;
-// caller takes care of it.
-bool
-runtime·MSpan_Sweep(MSpan *s, bool preserve)
-{
-	int32 cl, n, npages, nfree;
-	uintptr size, off, step;
-	uint32 sweepgen;
-	byte *p, *bitp, shift, xbits, bits;
-	MCache *c;
-	byte *arena_start;
-	MLink head, *end, *link;
-	Special *special, **specialp, *y;
-	bool res, sweepgenset;
-
-	// It's critical that we enter this function with preemption disabled,
-	// GC must not start while we are in the middle of this function.
-	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
-		runtime·throw("MSpan_Sweep: m is not locked");
-	sweepgen = runtime·mheap.sweepgen;
-	if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
-		runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
-			s->state, s->sweepgen, sweepgen);
-		runtime·throw("MSpan_Sweep: bad span state");
-	}
-	arena_start = runtime·mheap.arena_start;
-	cl = s->sizeclass;
-	size = s->elemsize;
-	if(cl == 0) {
-		n = 1;
-	} else {
-		// Chunk full of small blocks.
-		npages = runtime·class_to_allocnpages[cl];
-		n = (npages << PageShift) / size;
-	}
-	res = false;
-	nfree = 0;
-	end = &head;
-	c = g->m->mcache;
-	sweepgenset = false;
-
-	// Mark any free objects in this span so we don't collect them.
-	for(link = s->freelist; link != nil; link = link->next) {
-		off = (uintptr*)link - (uintptr*)arena_start;
-		bitp = arena_start - off/wordsPerBitmapByte - 1;
-		shift = (off % wordsPerBitmapByte) * gcBits;
-		*bitp |= bitMarked<<shift;
-	}
-
-	// Unlink & free special records for any objects we're about to free.
-	specialp = &s->specials;
-	special = *specialp;
-	while(special != nil) {
-		// A finalizer can be set for an inner byte of an object, find object beginning.
-		p = (byte*)(s->start << PageShift) + special->offset/size*size;
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = arena_start - off/wordsPerBitmapByte - 1;
-		shift = (off % wordsPerBitmapByte) * gcBits;
-		bits = (*bitp>>shift) & bitMask;
-		if((bits&bitMarked) == 0) {
-			// Find the exact byte for which the special was setup
-			// (as opposed to object beginning).
-			p = (byte*)(s->start << PageShift) + special->offset;
-			// about to free object: splice out special record
-			y = special;
-			special = special->next;
-			*specialp = special;
-			if(!runtime·freespecial(y, p, size, false)) {
-				// stop freeing of object if it has a finalizer
-				*bitp |= bitMarked << shift;
-			}
-		} else {
-			// object is still live: keep special record
-			specialp = &special->next;
-			special = *specialp;
-		}
-	}
-
-	// Sweep through n objects of given size starting at p.
-	// This thread owns the span now, so it can manipulate
-	// the block bitmap without atomic operations.
-	p = (byte*)(s->start << PageShift);
-	// Find bits for the beginning of the span.
-	off = (uintptr*)p - (uintptr*)arena_start;
-	bitp = arena_start - off/wordsPerBitmapByte - 1;
-	shift = 0;
-	step = size/(PtrSize*wordsPerBitmapByte);
-	// Rewind to the previous quadruple, since we advance to the next
-	// one at the beginning of the loop.
-	bitp += step;
-	if(step == 0) {
-		// 8-byte objects.
-		bitp++;
-		shift = gcBits;
-	}
-	for(; n > 0; n--, p += size) {
-		bitp -= step;
-		if(step == 0) {
-			if(shift != 0)
-				bitp--;
-			shift = gcBits - shift;
-		}
-
-		xbits = *bitp;
-		bits = (xbits>>shift) & bitMask;
-
-		// Allocated and marked object, reset bits to allocated.
-		if((bits&bitMarked) != 0) {
-			*bitp &= ~(bitMarked<<shift);
-			continue;
-		}
-		// At this point we know that we are looking at garbage object
-		// that needs to be collected.
-		if(runtime·debug.allocfreetrace)
-			runtime·tracefree(p, size);
-		// Reset to allocated+noscan.
-		*bitp = (xbits & ~((bitMarked|(BitsMask<<2))<<shift)) | ((uintptr)BitsDead<<(shift+2));
-		if(cl == 0) {
-			// Free large span.
-			if(preserve)
-				runtime·throw("can't preserve large span");
-			runtime·unmarkspan(p, s->npages<<PageShift);
-			s->needzero = 1;
-			// important to set sweepgen before returning it to heap
-			runtime·atomicstore(&s->sweepgen, sweepgen);
-			sweepgenset = true;
-			// NOTE(rsc,dvyukov): The original implementation of efence
-			// in CL 22060046 used SysFree instead of SysFault, so that
-			// the operating system would eventually give the memory
-			// back to us again, so that an efence program could run
-			// longer without running out of memory. Unfortunately,
-			// calling SysFree here without any kind of adjustment of the
-			// heap data structures means that when the memory does
-			// come back to us, we have the wrong metadata for it, either in
-			// the MSpan structures or in the garbage collection bitmap.
-			// Using SysFault here means that the program will run out of
-			// memory fairly quickly in efence mode, but at least it won't
-			// have mysterious crashes due to confused memory reuse.
-			// It should be possible to switch back to SysFree if we also
-			// implement and then call some kind of MHeap_DeleteSpan.
-			if(runtime·debug.efence) {
-				s->limit = nil;	// prevent mlookup from finding this span
-				runtime·SysFault(p, size);
-			} else
-				runtime·MHeap_Free(&runtime·mheap, s, 1);
-			c->local_nlargefree++;
-			c->local_largefree += size;
-			runtime·xadd64(&mstats.next_gc, -(uint64)(size * (runtime·gcpercent + 100)/100));
-			res = true;
-		} else {
-			// Free small object.
-			if(size > 2*sizeof(uintptr))
-				((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;	// mark as "needs to be zeroed"
-			else if(size > sizeof(uintptr))
-				((uintptr*)p)[1] = 0;
-
-			end->next = (MLink*)p;
-			end = (MLink*)p;
-			nfree++;
-		}
-	}
-
-	// We need to set s->sweepgen = h->sweepgen only when all blocks are swept,
-	// because of the potential for a concurrent free/SetFinalizer.
-	// But we need to set it before we make the span available for allocation
-	// (return it to heap or mcentral), because allocation code assumes that a
-	// span is already swept if available for allocation.
-
-	if(!sweepgenset && nfree == 0) {
-		// The span must be in our exclusive ownership until we update sweepgen,
-		// check for potential races.
-		if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
-			runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
-				s->state, s->sweepgen, sweepgen);
-			runtime·throw("MSpan_Sweep: bad span state after sweep");
-		}
-		runtime·atomicstore(&s->sweepgen, sweepgen);
-	}
-	if(nfree > 0) {
-		c->local_nsmallfree[cl] += nfree;
-		c->local_cachealloc -= nfree * size;
-		runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (runtime·gcpercent + 100)/100));
-		res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl].mcentral, s, nfree, head.next, end, preserve);
-		// MCentral_FreeSpan updates sweepgen
-	}
-	return res;
-}
-
-// State of background runtime·sweep.
-// Protected by runtime·gclock.
-typedef struct SweepData SweepData;
-struct SweepData
-{
-	G*	g;
-	bool	parked;
-
-	uint32	spanidx;	// background sweeper position
-
-	uint32	nbgsweep;
-	uint32	npausesweep;
-};
-SweepData runtime·sweep;
-
-// sweeps one span
-// returns number of pages returned to heap, or -1 if there is nothing to sweep
-uintptr
-runtime·sweepone(void)
-{
-	MSpan *s;
-	uint32 idx, sg;
-	uintptr npages;
-
-	// increment locks to ensure that the goroutine is not preempted
-	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
-	g->m->locks++;
-	sg = runtime·mheap.sweepgen;
-	for(;;) {
-		idx = runtime·xadd(&runtime·sweep.spanidx, 1) - 1;
-		if(idx >= runtime·work.nspan) {
-			runtime·mheap.sweepdone = true;
-			g->m->locks--;
-			return -1;
-		}
-		s = runtime·work.spans[idx];
-		if(s->state != MSpanInUse) {
-			s->sweepgen = sg;
-			continue;
-		}
-		if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1))
-			continue;
-		npages = s->npages;
-		if(!runtime·MSpan_Sweep(s, false))
-			npages = 0;
-		g->m->locks--;
-		return npages;
-	}
-}
-
-static void
-sweepone_m(void)
-{
-	g->m->scalararg[0] = runtime·sweepone();
-}
-
-#pragma textflag NOSPLIT
-uintptr
-runtime·gosweepone(void)
-{
-	void (*fn)(void);
-	
-	fn = sweepone_m;
-	runtime·onM(&fn);
-	return g->m->scalararg[0];
-}
-
-#pragma textflag NOSPLIT
-bool
-runtime·gosweepdone(void)
-{
-	return runtime·mheap.sweepdone;
-}
-
-void
-runtime·gchelper(void)
-{
-	uint32 nproc;
-
-	g->m->traceback = 2;
-	gchelperstart();
-
-	// parallel mark over gc roots
-	runtime·parfordo(runtime·work.markfor);
-
-	// help other threads scan secondary blocks
-	scanblock(nil, 0, nil);
-
-	nproc = runtime·work.nproc;  // runtime·work.nproc can change right after we increment runtime·work.ndone
-	if(runtime·xadd(&runtime·work.ndone, +1) == nproc-1)
-		runtime·notewakeup(&runtime·work.alldone);
-	g->m->traceback = 0;
-}
-
-static void
-cachestats(void)
-{
-	MCache *c;
-	P *p, **pp;
-
-	for(pp=runtime·allp; p=*pp; pp++) {
-		c = p->mcache;
-		if(c==nil)
-			continue;
-		runtime·purgecachedstats(c);
-	}
-}
-
-static void
-flushallmcaches(void)
-{
-	P *p, **pp;
-	MCache *c;
-
-	// Flush MCache's to MCentral.
-	for(pp=runtime·allp; p=*pp; pp++) {
-		c = p->mcache;
-		if(c==nil)
-			continue;
-		runtime·MCache_ReleaseAll(c);
-		runtime·stackcache_clear(c);
-	}
-}
-
-static void
-flushallmcaches_m(G *gp)
-{
-	flushallmcaches();
-	runtime·gogo(&gp->sched);
-}
-
-void
-runtime·updatememstats(GCStats *stats)
-{
-	M *mp;
-	MSpan *s;
-	int32 i;
-	uint64 smallfree;
-	uint64 *src, *dst;
-	void (*fn)(G*);
-
-	if(stats)
-		runtime·memclr((byte*)stats, sizeof(*stats));
-	for(mp=runtime·allm; mp; mp=mp->alllink) {
-		if(stats) {
-			src = (uint64*)&mp->gcstats;
-			dst = (uint64*)stats;
-			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
-				dst[i] += src[i];
-			runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
-		}
-	}
-	mstats.mcache_inuse = runtime·mheap.cachealloc.inuse;
-	mstats.mspan_inuse = runtime·mheap.spanalloc.inuse;
-	mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys +
-		mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys;
-	
-	// Calculate memory allocator stats.
-	// During program execution we only count number of frees and amount of freed memory.
-	// Current number of alive objects in the heap and amount of alive heap memory
-	// are calculated by scanning all spans.
-	// Total number of mallocs is calculated as number of frees plus number of alive objects.
-	// Similarly, total amount of allocated memory is calculated as amount of freed memory
-	// plus amount of alive heap memory.
-	mstats.alloc = 0;
-	mstats.total_alloc = 0;
-	mstats.nmalloc = 0;
-	mstats.nfree = 0;
-	for(i = 0; i < nelem(mstats.by_size); i++) {
-		mstats.by_size[i].nmalloc = 0;
-		mstats.by_size[i].nfree = 0;
-	}
-
-	// Flush MCache's to MCentral.
-	if(g == g->m->g0)
-		flushallmcaches();
-	else {
-		fn = flushallmcaches_m;
-		runtime·mcall(&fn);
-	}
-
-	// Aggregate local stats.
-	cachestats();
-
-	// Scan all spans and count number of alive objects.
-	runtime·lock(&runtime·mheap.lock);
-	for(i = 0; i < runtime·mheap.nspan; i++) {
-		s = runtime·mheap.allspans[i];
-		if(s->state != MSpanInUse)
-			continue;
-		if(s->sizeclass == 0) {
-			mstats.nmalloc++;
-			mstats.alloc += s->elemsize;
-		} else {
-			mstats.nmalloc += s->ref;
-			mstats.by_size[s->sizeclass].nmalloc += s->ref;
-			mstats.alloc += s->ref*s->elemsize;
-		}
-	}
-	runtime·unlock(&runtime·mheap.lock);
-
-	// Aggregate by size class.
-	smallfree = 0;
-	mstats.nfree = runtime·mheap.nlargefree;
-	for(i = 0; i < nelem(mstats.by_size); i++) {
-		mstats.nfree += runtime·mheap.nsmallfree[i];
-		mstats.by_size[i].nfree = runtime·mheap.nsmallfree[i];
-		mstats.by_size[i].nmalloc += runtime·mheap.nsmallfree[i];
-		smallfree += runtime·mheap.nsmallfree[i] * runtime·class_to_size[i];
-	}
-	mstats.nfree += mstats.tinyallocs;
-	mstats.nmalloc += mstats.nfree;
-
-	// Calculate derived stats.
-	mstats.total_alloc = mstats.alloc + runtime·mheap.largefree + smallfree;
-	mstats.heap_alloc = mstats.alloc;
-	mstats.heap_objects = mstats.nmalloc - mstats.nfree;
-}
-
-// Structure of arguments passed to function gc().
-// This allows the arguments to be passed via runtime·mcall.
-struct gc_args
-{
-	int64 start_time; // start time of GC in ns (just before stoptheworld)
-	bool  eagersweep;
-};
-
-static void gc(struct gc_args *args);
-
-int32
-runtime·readgogc(void)
-{
-	byte *p;
-
-	p = runtime·getenv("GOGC");
-	if(p == nil || p[0] == '\0')
-		return 100;
-	if(runtime·strcmp(p, (byte*)"off") == 0)
-		return -1;
-	return runtime·atoi(p);
-}
-
-void
-runtime·gcinit(void)
-{
-	if(sizeof(Workbuf) != WorkbufSize)
-		runtime·throw("runtime: size of Workbuf is suboptimal");
-
-	runtime·work.markfor = runtime·parforalloc(MaxGcproc);
-	runtime·gcpercent = runtime·readgogc();
-	runtime·gcdatamask = unrollglobgcprog(runtime·gcdata, runtime·edata - runtime·data);
-	runtime·gcbssmask = unrollglobgcprog(runtime·gcbss, runtime·ebss - runtime·bss);
-}
-
-void
-runtime·gc_m(void)
-{
-	struct gc_args a;
-	G *gp;
-
-	gp = g->m->curg;
-	runtime·casgstatus(gp, Grunning, Gwaiting);
-	gp->waitreason = runtime·gostringnocopy((byte*)"garbage collection");
-
-	a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32);
-	a.eagersweep = g->m->scalararg[2];
-	gc(&a);
-
-	if(nbadblock > 0) {
-		// Work out path from root to bad block.
-		for(;;) {
-			gc(&a);
-			if(nbadblock >= nelem(badblock))
-				runtime·throw("cannot find path to bad pointer");
-		}
-	}
-
-	runtime·casgstatus(gp, Gwaiting, Grunning);
-}
-
-static void
-gc(struct gc_args *args)
-{
-	int64 t0, t1, t2, t3, t4;
-	uint64 heap0, heap1, obj;
-	GCStats stats;
-
-	if(DebugPtrs)
-		runtime·printf("GC start\n");
-
-	if(runtime·debug.allocfreetrace)
-		runtime·tracegc();
-
-	g->m->traceback = 2;
-	t0 = args->start_time;
-	runtime·work.tstart = args->start_time; 
-
-	t1 = 0;
-	if(runtime·debug.gctrace)
-		t1 = runtime·nanotime();
-
-	// Sweep whatever is not swept by bgsweep.
-	while(runtime·sweepone() != -1)
-		runtime·sweep.npausesweep++;
-
-	// Cache runtime.mheap.allspans in work.spans to avoid conflicts with
-	// resizing/freeing allspans.
-	// New spans can be created while GC progresses, but they are not garbage for
-	// this round:
-	//  - new stack spans can be created even while the world is stopped.
-	//  - new malloc spans can be created during the concurrent sweep
-
-	// Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
-	runtime·lock(&runtime·mheap.lock);
-	// Free the old cached sweep array if necessary.
-	if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans)
-		runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys);
-	// Cache the current array for marking.
-	runtime·mheap.gcspans = runtime·mheap.allspans;
-	runtime·work.spans = runtime·mheap.allspans;
-	runtime·work.nspan = runtime·mheap.nspan;
-	runtime·unlock(&runtime·mheap.lock);
-
-	runtime·work.nwait = 0;
-	runtime·work.ndone = 0;
-	runtime·work.nproc = runtime·gcprocs();
-	runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + runtime·allglen, nil, false, markroot);
-	if(runtime·work.nproc > 1) {
-		runtime·noteclear(&runtime·work.alldone);
-		runtime·helpgc(runtime·work.nproc);
-	}
-
-	t2 = 0;
-	if(runtime·debug.gctrace)
-		t2 = runtime·nanotime();
-
-	gchelperstart();
-	runtime·parfordo(runtime·work.markfor);
-	scanblock(nil, 0, nil);
-
-	t3 = 0;
-	if(runtime·debug.gctrace)
-		t3 = runtime·nanotime();
-
-	if(runtime·work.nproc > 1)
-		runtime·notesleep(&runtime·work.alldone);
-
-	runtime·shrinkfinish();
-
-	cachestats();
-	// next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
-	// estimate what was live heap size after previous GC (for tracing only)
-	heap0 = mstats.next_gc*100/(runtime·gcpercent+100);
-	// conservatively set next_gc to high value assuming that everything is live
-	// concurrent/lazy sweep will reduce this number while discovering new garbage
-	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*runtime·gcpercent/100;
-
-	t4 = runtime·nanotime();
-	runtime·atomicstore64(&mstats.last_gc, runtime·unixnanotime());  // must be Unix time to make sense to user
-	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
-	mstats.pause_end[mstats.numgc%nelem(mstats.pause_end)] = t4;
-	mstats.pause_total_ns += t4 - t0;
-	mstats.numgc++;
-	if(mstats.debuggc)
-		runtime·printf("pause %D\n", t4-t0);
-
-	if(runtime·debug.gctrace) {
-		heap1 = mstats.heap_alloc;
-		runtime·updatememstats(&stats);
-		if(heap1 != mstats.heap_alloc) {
-			runtime·printf("runtime: mstats skew: heap=%D/%D\n", heap1, mstats.heap_alloc);
-			runtime·throw("mstats skew");
-		}
-		obj = mstats.nmalloc - mstats.nfree;
-
-		stats.nprocyield += runtime·work.markfor->nprocyield;
-		stats.nosyield += runtime·work.markfor->nosyield;
-		stats.nsleep += runtime·work.markfor->nsleep;
-
-		runtime·printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects,"
-				" %d goroutines,"
-				" %d/%d/%d sweeps,"
-				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
-			mstats.numgc, runtime·work.nproc, (t1-t0)/1000, (t2-t1)/1000, (t3-t2)/1000, (t4-t3)/1000,
-			heap0>>20, heap1>>20, obj,
-			mstats.nmalloc, mstats.nfree,
-			runtime·gcount(),
-			runtime·work.nspan, runtime·sweep.nbgsweep, runtime·sweep.npausesweep,
-			stats.nhandoff, stats.nhandoffcnt,
-			runtime·work.markfor->nsteal, runtime·work.markfor->nstealcnt,
-			stats.nprocyield, stats.nosyield, stats.nsleep);
-		runtime·sweep.nbgsweep = runtime·sweep.npausesweep = 0;
-	}
-
-	// See the comment in the beginning of this function as to why we need the following.
-	// Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
-	runtime·lock(&runtime·mheap.lock);
-	// Free the old cached mark array if necessary.
-	if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans)
-		runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys);
-	// Cache the current array for sweeping.
-	runtime·mheap.gcspans = runtime·mheap.allspans;
-	runtime·mheap.sweepgen += 2;
-	runtime·mheap.sweepdone = false;
-	runtime·work.spans = runtime·mheap.allspans;
-	runtime·work.nspan = runtime·mheap.nspan;
-	runtime·sweep.spanidx = 0;
-	runtime·unlock(&runtime·mheap.lock);
-
-	if(ConcurrentSweep && !args->eagersweep) {
-		runtime·lock(&runtime·gclock);
-		if(runtime·sweep.g == nil)
-			runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc);
-		else if(runtime·sweep.parked) {
-			runtime·sweep.parked = false;
-			runtime·ready(runtime·sweep.g);
-		}
-		runtime·unlock(&runtime·gclock);
-	} else {
-		// Sweep all spans eagerly.
-		while(runtime·sweepone() != -1)
-			runtime·sweep.npausesweep++;
-		// Do an additional mProf_GC, because all 'free' events are now real as well.
-		runtime·mProf_GC();
-	}
-
-	runtime·mProf_GC();
-	g->m->traceback = 0;
-
-	if(DebugPtrs)
-		runtime·printf("GC end\n");
-}
-
-extern uintptr runtime·sizeof_C_MStats;
-
-static void readmemstats_m(void);
-
-void
-runtime·readmemstats_m(void)
-{
-	MStats *stats;
-	
-	stats = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-
-	runtime·updatememstats(nil);
-	// Size of the trailing by_size array differs between Go and C,
-	// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
-	runtime·memmove(stats, &mstats, runtime·sizeof_C_MStats);
-
-	// Stack numbers are part of the heap numbers, separate those out for user consumption
-	stats->stacks_sys = stats->stacks_inuse;
-	stats->heap_inuse -= stats->stacks_inuse;
-	stats->heap_sys -= stats->stacks_inuse;
-}
-
-static void readgcstats_m(void);
-
-#pragma textflag NOSPLIT
-void
-runtime∕debug·readGCStats(Slice *pauses)
-{
-	void (*fn)(void);
-	
-	g->m->ptrarg[0] = pauses;
-	fn = readgcstats_m;
-	runtime·onM(&fn);
-}
-
-static void
-readgcstats_m(void)
-{
-	Slice *pauses;	
-	uint64 *p;
-	uint32 i, j, n;
-	
-	pauses = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-
-	// Calling code in runtime/debug should make the slice large enough.
-	if(pauses->cap < nelem(mstats.pause_ns)+3)
-		runtime·throw("runtime: short slice passed to readGCStats");
-
-	// Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns.
-	p = (uint64*)pauses->array;
-	runtime·lock(&runtime·mheap.lock);
-
-	n = mstats.numgc;
-	if(n > nelem(mstats.pause_ns))
-		n = nelem(mstats.pause_ns);
-
-	// The pause buffer is circular. The most recent pause is at
-	// pause_ns[(numgc-1)%nelem(pause_ns)], and then backward
-	// from there to go back farther in time. We deliver the times
-	// most recent first (in p[0]).
-	for(i=0; i<n; i++) {
-		j = (mstats.numgc-1-i)%nelem(mstats.pause_ns);
-		p[i] = mstats.pause_ns[j];
-		p[n+i] = mstats.pause_end[j];
-	}
-
-	p[n+n] = mstats.last_gc;
-	p[n+n+1] = mstats.numgc;
-	p[n+n+2] = mstats.pause_total_ns;	
-	runtime·unlock(&runtime·mheap.lock);
-	pauses->len = n+n+3;
-}
-
-void
-runtime·setgcpercent_m(void)
-{
-	int32 in;
-	int32 out;
-
-	in = (int32)(intptr)g->m->scalararg[0];
-
-	runtime·lock(&runtime·mheap.lock);
-	out = runtime·gcpercent;
-	if(in < 0)
-		in = -1;
-	runtime·gcpercent = in;
-	runtime·unlock(&runtime·mheap.lock);
-
-	g->m->scalararg[0] = (uintptr)(intptr)out;
-}
-
-static void
-gchelperstart(void)
-{
-	if(g->m->helpgc < 0 || g->m->helpgc >= MaxGcproc)
-		runtime·throw("gchelperstart: bad m->helpgc");
-	if(g != g->m->g0)
-		runtime·throw("gchelper not running on g0 stack");
-}
-
-G*
-runtime·wakefing(void)
-{
-	G *res;
-
-	res = nil;
-	runtime·lock(&runtime·finlock);
-	if(runtime·fingwait && runtime·fingwake) {
-		runtime·fingwait = false;
-		runtime·fingwake = false;
-		res = runtime·fing;
-	}
-	runtime·unlock(&runtime·finlock);
-	return res;
-}
-
-// Recursively unrolls GC program in prog.
-// mask is where to store the result.
-// ppos is a pointer to position in mask, in bits.
-// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
-static byte*
-unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
-{
-	uintptr pos, siz, i, off;
-	byte *arena_start, *prog1, v, *bitp, shift;
-
-	arena_start = runtime·mheap.arena_start;
-	pos = *ppos;
-	for(;;) {
-		switch(prog[0]) {
-		case insData:
-			prog++;
-			siz = prog[0];
-			prog++;
-			for(i = 0; i < siz; i++) {
-				v = prog[i/PointersPerByte];
-				v >>= (i%PointersPerByte)*BitsPerPointer;
-				v &= BitsMask;
-				if(inplace) {
-					// Store directly into GC bitmap.
-					off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
-					bitp = arena_start - off/wordsPerBitmapByte - 1;
-					shift = (off % wordsPerBitmapByte) * gcBits;
-					if(shift==0)
-						*bitp = 0;
-					*bitp |= v<<(shift+2);
-					pos += PtrSize;
-				} else if(sparse) {
-					// 4-bits per word
-					v <<= (pos%8)+2;
-					mask[pos/8] |= v;
-					pos += gcBits;
-				} else {
-					// 2-bits per word
-					v <<= pos%8;
-					mask[pos/8] |= v;
-					pos += BitsPerPointer;
-				}
-			}
-			prog += ROUND(siz*BitsPerPointer, 8)/8;
-			break;
-		case insArray:
-			prog++;
-			siz = 0;
-			for(i = 0; i < PtrSize; i++)
-				siz = (siz<<8) + prog[PtrSize-i-1];
-			prog += PtrSize;
-			prog1 = nil;
-			for(i = 0; i < siz; i++)
-				prog1 = unrollgcprog1(mask, prog, &pos, inplace, sparse);
-			if(prog1[0] != insArrayEnd)
-				runtime·throw("unrollgcprog: array does not end with insArrayEnd");
-			prog = prog1+1;
-			break;
-		case insArrayEnd:
-		case insEnd:
-			*ppos = pos;
-			return prog;
-		default:
-			runtime·throw("unrollgcprog: unknown instruction");
-		}
-	}
-}
-
-// Unrolls GC program prog for data/bss, returns dense GC mask.
-static BitVector
-unrollglobgcprog(byte *prog, uintptr size)
-{
-	byte *mask;
-	uintptr pos, masksize;
-
-	masksize = ROUND(ROUND(size, PtrSize)/PtrSize*BitsPerPointer, 8)/8;
-	mask = runtime·persistentalloc(masksize+1, 0, &mstats.gc_sys);
-	mask[masksize] = 0xa1;
-	pos = 0;
-	prog = unrollgcprog1(mask, prog, &pos, false, false);
-	if(pos != size/PtrSize*BitsPerPointer) {
-		runtime·printf("unrollglobgcprog: bad program size, got %D, expect %D\n",
-			(uint64)pos, (uint64)size/PtrSize*BitsPerPointer);
-		runtime·throw("unrollglobgcprog: bad program size");
-	}
-	if(prog[0] != insEnd)
-		runtime·throw("unrollglobgcprog: program does not end with insEnd");
-	if(mask[masksize] != 0xa1)
-		runtime·throw("unrollglobgcprog: overflow");
-	return (BitVector){masksize*8, mask};
-}
-
-void
-runtime·unrollgcproginplace_m(void)
-{
-	uintptr size, size0, pos, off;
-	byte *arena_start, *prog, *bitp, shift;
-	Type *typ;
-	void *v;
-
-	v = g->m->ptrarg[0];
-	typ = g->m->ptrarg[1];
-	size = g->m->scalararg[0];
-	size0 = g->m->scalararg[1];
-	g->m->ptrarg[0] = nil;
-	g->m->ptrarg[1] = nil;
-
-	pos = 0;
-	prog = (byte*)typ->gc[1];
-	while(pos != size0)
-		unrollgcprog1(v, prog, &pos, true, true);
-	// Mark first word as bitAllocated.
-	arena_start = runtime·mheap.arena_start;
-	off = (uintptr*)v - (uintptr*)arena_start;
-	bitp = arena_start - off/wordsPerBitmapByte - 1;
-	shift = (off % wordsPerBitmapByte) * gcBits;
-	*bitp |= bitBoundary<<shift;
-	// Mark word after last as BitsDead.
-	if(size0 < size) {
-		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-		bitp = arena_start - off/wordsPerBitmapByte - 1;
-		shift = (off % wordsPerBitmapByte) * gcBits;
-		*bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
-	}
-}
-
-// Unrolls GC program in typ->gc[1] into typ->gc[0]
-void
-runtime·unrollgcprog_m(void)
-{
-	static Mutex lock;
-	Type *typ;
-	byte *mask, *prog;
-	uintptr pos;
-	uint32 x;
-
-	typ = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-
-	runtime·lock(&lock);
-	mask = (byte*)typ->gc[0];
-	if(mask[0] == 0) {
-		pos = 8;  // skip the unroll flag
-		prog = (byte*)typ->gc[1];
-		prog = unrollgcprog1(mask, prog, &pos, false, true);
-		if(prog[0] != insEnd)
-			runtime·throw("unrollgcprog: program does not end with insEnd");
-		if(((typ->size/PtrSize)%2) != 0) {
-			// repeat the program twice
-			prog = (byte*)typ->gc[1];
-			unrollgcprog1(mask, prog, &pos, false, true);
-		}
-		// atomic way to say mask[0] = 1
-		x = ((uint32*)mask)[0];
-		runtime·atomicstore((uint32*)mask, x|1);
-	}
-	runtime·unlock(&lock);
-}
-
-// mark the span of memory at v as having n blocks of the given size.
-// if leftover is true, there is left over space at the end of the span.
-void
-runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
-{
-	uintptr i, off, step;
-	byte *b;
-
-	if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
-		runtime·throw("markspan: bad pointer");
-
-	// Find bits of the beginning of the span.
-	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
-	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
-	if((off%wordsPerBitmapByte) != 0)
-		runtime·throw("markspan: unaligned length");
-
-	// Okay to use non-atomic ops here, because we control
-	// the entire span, and each bitmap byte has bits for only
-	// one span, so no other goroutines are changing these bitmap words.
-
-	if(size == PtrSize) {
-		// Possible only on 64-bits (minimal size class is 8 bytes).
-		// Poor man's memset(0x11).
-		if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead))
-			runtime·throw("markspan: bad bits");
-		if((n%(wordsPerBitmapByte*PtrSize)) != 0)
-			runtime·throw("markspan: unaligned length");
-		b = b - n/wordsPerBitmapByte + 1;	// find first byte
-		if(((uintptr)b%PtrSize) != 0)
-			runtime·throw("markspan: unaligned pointer");
-		for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize)
-			*(uintptr*)b = (uintptr)0x1111111111111111ULL;  // bitBoundary+BitsDead
-		return;
-	}
-
-	if(leftover)
-		n++;	// mark a boundary just past end of last block too
-	step = size/(PtrSize*wordsPerBitmapByte);
-	for(i = 0; i != n; i++, b -= step)
-		*b = bitBoundary|(BitsDead<<2);
-}
-
-// unmark the span of memory at v of length n bytes.
-void
-runtime·unmarkspan(void *v, uintptr n)
-{
-	uintptr off;
-	byte *b;
-
-	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
-		runtime·throw("markspan: bad pointer");
-
-	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
-	if((off % (PtrSize*wordsPerBitmapByte)) != 0)
-		runtime·throw("markspan: unaligned pointer");
-	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
-	n /= PtrSize;
-	if(n%(PtrSize*wordsPerBitmapByte) != 0)
-		runtime·throw("unmarkspan: unaligned length");
-	// Okay to use non-atomic ops here, because we control
-	// the entire span, and each bitmap word has bits for only
-	// one span, so no other goroutines are changing these
-	// bitmap words.
-	n /= wordsPerBitmapByte;
-	runtime·memclr(b - n + 1, n);
-}
-
-void
-runtime·MHeap_MapBits(MHeap *h)
-{
-	// Caller has added extra mappings to the arena.
-	// Add extra mappings of bitmap words as needed.
-	// We allocate extra bitmap pieces in chunks of bitmapChunk.
-	enum {
-		bitmapChunk = 8192
-	};
-	uintptr n;
-
-	n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
-	n = ROUND(n, bitmapChunk);
-	n = ROUND(n, PhysPageSize);
-	if(h->bitmap_mapped >= n)
-		return;
-
-	runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats.gc_sys);
-	h->bitmap_mapped = n;
-}
-
-static bool
-getgcmaskcb(Stkframe *frame, void *ctxt)
-{
-	Stkframe *frame0;
-
-	frame0 = ctxt;
-	if(frame->sp <= frame0->sp && frame0->sp < frame->varp) {
-		*frame0 = *frame;
-		return false;
-	}
-	return true;
-}
-
-// Returns GC type info for object p for testing.
-void
-runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
-{
-	Stkframe frame;
-	uintptr i, n, off;
-	byte *base, bits, shift, *b;
-	bool (*cb)(Stkframe*, void*);
-
-	*mask = nil;
-	*len = 0;
-
-	// data
-	if(p >= runtime·data && p < runtime·edata) {
-		n = ((PtrType*)t)->elem->size;
-		*len = n/PtrSize;
-		*mask = runtime·mallocgc(*len, nil, FlagNoScan);
-		for(i = 0; i < n; i += PtrSize) {
-			off = (p+i-runtime·data)/PtrSize;
-			bits = (runtime·gcdatamask.bytedata[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
-			(*mask)[i/PtrSize] = bits;
-		}
-		return;
-	}
-	// bss
-	if(p >= runtime·bss && p < runtime·ebss) {
-		n = ((PtrType*)t)->elem->size;
-		*len = n/PtrSize;
-		*mask = runtime·mallocgc(*len, nil, FlagNoScan);
-		for(i = 0; i < n; i += PtrSize) {
-			off = (p+i-runtime·bss)/PtrSize;
-			bits = (runtime·gcbssmask.bytedata[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask;
-			(*mask)[i/PtrSize] = bits;
-		}
-		return;
-	}
-	// heap
-	if(runtime·mlookup(p, &base, &n, nil)) {
-		*len = n/PtrSize;
-		*mask = runtime·mallocgc(*len, nil, FlagNoScan);
-		for(i = 0; i < n; i += PtrSize) {
-			off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
-			b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
-			shift = (off % wordsPerBitmapByte) * gcBits;
-			bits = (*b >> (shift+2))&BitsMask;
-			(*mask)[i/PtrSize] = bits;
-		}
-		return;
-	}
-	// stack
-	frame.fn = nil;
-	frame.sp = (uintptr)p;
-	cb = getgcmaskcb;
-	runtime·gentraceback(g->m->curg->sched.pc, g->m->curg->sched.sp, 0, g->m->curg, 0, nil, 1000, &cb, &frame, 0);
-	if(frame.fn != nil) {
-		Func *f;
-		StackMap *stackmap;
-		BitVector bv;
-		uintptr size;
-		uintptr targetpc;
-		int32 pcdata;
-
-		f = frame.fn;
-		targetpc = frame.continpc;
-		if(targetpc == 0)
-			return;
-		if(targetpc != f->entry)
-			targetpc--;
-		pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
-		if(pcdata == -1)
-			return;
-		stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps);
-		if(stackmap == nil || stackmap->n <= 0)
-			return;
-		bv = runtime·stackmapdata(stackmap, pcdata);
-		size = bv.n/BitsPerPointer*PtrSize;
-		n = ((PtrType*)t)->elem->size;
-		*len = n/PtrSize;
-		*mask = runtime·mallocgc(*len, nil, FlagNoScan);
-		for(i = 0; i < n; i += PtrSize) {
-			off = (p+i-(byte*)frame.varp+size)/PtrSize;
-			bits = (bv.bytedata[off*BitsPerPointer/8] >> ((off*BitsPerPointer)%8))&BitsMask;
-			(*mask)[i/PtrSize] = bits;
-		}
-	}
-}
-
-void runtime·gc_unixnanotime(int64 *now);
-
-int64
-runtime·unixnanotime(void)
-{
-	int64 now;
-
-	runtime·gc_unixnanotime(&now);
-	return now;
-}
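
The sweep and mark code deleted above leans on one piece of arithmetic over and over: the heap bitmap grows downward from arena_start, each bitmap byte covers wordsPerBitmapByte = 2 heap words, and each word gets a gcBits = 4-bit nibble whose low two bits are the boundary/mark bits and whose upper two bits hold the BitsDead/BitsScalar/BitsPointer code. Below is a minimal, self-contained Go sketch of just that addressing arithmetic, written against the constants above; heapBitsFor is a hypothetical helper for illustration, not the runtime's API.

	package main

	import "fmt"

	const (
		ptrSize            = 8 // assuming a 64-bit target
		gcBits             = 4
		wordsPerBitmapByte = 8 / gcBits // 2 words per bitmap byte
	)

	// heapBitsFor mirrors the expressions used throughout the C code above:
	//	off   = (uintptr*)p - (uintptr*)arena_start
	//	bitp  = arena_start - off/wordsPerBitmapByte - 1
	//	shift = (off % wordsPerBitmapByte) * gcBits
	// It reports how many bytes below arena_start the bitmap byte for the
	// word at p lives, and the shift of that word's nibble within the byte.
	func heapBitsFor(arenaStart, p uintptr) (byteBelow uintptr, shift uint) {
		off := (p - arenaStart) / ptrSize // word index into the arena
		byteBelow = off/wordsPerBitmapByte + 1
		shift = uint(off%wordsPerBitmapByte) * gcBits
		return
	}

	func main() {
		arenaStart := uintptr(0x100000)
		for _, p := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16} {
			b, s := heapBitsFor(arenaStart, p)
			fmt.Printf("word %#x -> bitmap byte arena_start-%d, shift %d\n", p, b, s)
		}
	}
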
diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go
index 3a7204b..f7e01c8 100644
--- a/src/runtime/mgc0.go
+++ b/src/runtime/mgc0.go
@@ -60,10 +60,8 @@
 	}
 }
 
-func gosweepone() uintptr
-func gosweepdone() bool
-
 func bgsweep() {
+	sweep.g = getg()
 	getg().issystem = true
 	for {
 		for gosweepone() != ^uintptr(0) {
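
In the hunk above, bgsweep now records its own g in sweep.g, and the body-less declarations of gosweepone and gosweepdone are dropped, presumably because converted Go code elsewhere in this CL defines them directly. The ^uintptr(0) in the loop condition is the Go spelling of the C convention above, where runtime·sweepone returns -1 once every span has been swept. A small sketch of that sentinel convention; fakeSweepOne is a made-up stand-in, not the runtime function.

	package main

	import "fmt"

	// sweepDone mirrors the sentinel in the code above: the C runtime·sweepone
	// returns (uintptr)(-1) when there is nothing left to sweep, which on the
	// Go side is written as ^uintptr(0).
	const sweepDone = ^uintptr(0)

	// fakeSweepOne "sweeps" one entry from a fixed work list and returns the
	// number of pages reclaimed, or sweepDone when nothing is left.
	func fakeSweepOne(work *[]uintptr) uintptr {
		if len(*work) == 0 {
			return sweepDone
		}
		n := (*work)[0]
		*work = (*work)[1:]
		return n
	}

	func main() {
		work := []uintptr{1, 4, 0, 2}
		total := uintptr(0)
		for {
			n := fakeSweepOne(&work)
			if n == sweepDone { // same test as gosweepone() != ^uintptr(0) above
				break
			}
			total += n
		}
		fmt.Println("pages reclaimed:", total)
	}
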
diff --git a/src/runtime/mgc0.h b/src/runtime/mgc1.go
similarity index 73%
rename from src/runtime/mgc0.h
rename to src/runtime/mgc1.go
index 64f8189..d1aab45 100644
--- a/src/runtime/mgc0.h
+++ b/src/runtime/mgc1.go
@@ -4,11 +4,15 @@
 
 // Garbage collector (GC)
 
-enum {
-	// Four bits per word (see #defines below).
-	gcBits = 4,
-	wordsPerBitmapByte = 8/gcBits,
+package runtime
 
+const (
+	// Four bits per word (see #defines below).
+	gcBits             = 4
+	wordsPerBitmapByte = 8 / gcBits
+)
+
+const (
 	// GC type info programs.
 	// The programs allow to store type info required for GC in a compact form.
 	// Most importantly arrays take O(1) space instead of O(n).
@@ -26,38 +30,33 @@
 	// For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
 	// the program looks as:
 	//
-	// insData 3 (BitsMultiWord BitsSlice BitsScalar)
+	// insData 3 (BitsPointer BitsScalar BitsScalar)
 	//	insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd
 	//
 	// Total size of the program is 17 bytes (13 bytes on 32-bits).
 	// The corresponding GC mask would take 43 bytes (it would be repeated
 	// because the type has odd number of words).
-	insData = 1,
-	insArray,
-	insArrayEnd,
-	insEnd,
+	insData = 1 + iota
+	insArray
+	insArrayEnd
+	insEnd
+)
 
+const (
 	// Pointer map
-	BitsPerPointer	= 2,
-	BitsMask	= (1<<BitsPerPointer)-1,
-	PointersPerByte	= 8/BitsPerPointer,
+	_BitsPerPointer  = 2
+	_BitsMask        = (1 << _BitsPerPointer) - 1
+	_PointersPerByte = 8 / _BitsPerPointer
 
 	// If you change these, also change scanblock.
 	// scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)".
-	BitsDead	= 0,
-	BitsScalar	= 1,
-	BitsPointer	= 2,
-	BitsMultiWord	= 3,
-	// BitsMultiWord will be set for the first word of a multi-word item.
-	// When it is set, one of the following will be set for the second word.
-	// NOT USED ANYMORE: BitsString	= 0,
-	// NOT USED ANYMORE: BitsSlice	= 1,
-	BitsIface	= 2,
-	BitsEface	= 3,
+	_BitsDead    = 0
+	_BitsScalar  = 1
+	_BitsPointer = 2
 
 	// 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
-	MaxGCMask	= 64,
-};
+	_MaxGCMask = 64
+)
 
 // Bits in per-word bitmap.
 // #defines because we shift the values beyond 32 bits.
@@ -70,9 +69,9 @@
 // there.  On a 64-bit system the off'th word in the arena is tracked by
 // the off/16+1'th word before mheap.arena_start.  (On a 32-bit system,
 // the only difference is that the divisor is 8.)
-enum {
-	bitBoundary = 1, // boundary of an object
-	bitMarked = 2, // marked object
-	bitMask = bitBoundary | bitMarked,
-	bitPtrMask = BitsMask<<2,
-};
+const (
+	bitBoundary = 1 // boundary of an object
+	bitMarked   = 2 // marked object
+	bitMask     = bitBoundary | bitMarked
+	bitPtrMask  = _BitsMask << 2
+)
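
With the constants now spelled out in Go, the per-word bitmap nibble reads as: bitBoundary and bitMarked in the low two bits, and the type code (_BitsDead/_BitsScalar/_BitsPointer) shifted up by two, which is exactly what bitPtrMask = _BitsMask << 2 encodes. A hedged sketch of packing and unpacking such a nibble, using illustrative helper names rather than anything in the runtime:

	package main

	import "fmt"

	const (
		gcBits             = 4
		wordsPerBitmapByte = 8 / gcBits

		bitBoundary = 1
		bitMarked   = 2
		bitMask     = bitBoundary | bitMarked

		_BitsPerPointer = 2
		_BitsMask       = 1<<_BitsPerPointer - 1

		_BitsDead    = 0
		_BitsScalar  = 1
		_BitsPointer = 2

		bitPtrMask = _BitsMask << 2
	)

	// makeNibble packs allocation bits and a type code the way the sweep code
	// reads them back: the type bits sit above the two allocation bits.
	func makeNibble(allocBits, typeBits byte) byte {
		return (allocBits & bitMask) | (typeBits&_BitsMask)<<2
	}

	// splitNibble is the inverse, matching expressions in the deleted C such
	// as (*bitp>>shift) & bitMask and (*b >> (shift+2)) & BitsMask.
	func splitNibble(n byte) (allocBits, typeBits byte) {
		return n & bitMask, (n >> 2) & _BitsMask
	}

	func main() {
		n := makeNibble(bitBoundary|bitMarked, _BitsPointer)
		a, t := splitNibble(n)
		fmt.Printf("nibble=%#x alloc=%#x type=%#x\n", n, a, t)
		// Two such nibbles share one bitmap byte (wordsPerBitmapByte == 2).
		fmt.Println("words per bitmap byte:", wordsPerBitmapByte)
	}
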
diff --git a/src/runtime/mheap.c b/src/runtime/mheap.c
deleted file mode 100644
index bb203d5..0000000
--- a/src/runtime/mheap.c
+++ /dev/null
@@ -1,889 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Page heap.
-//
-// See malloc.h for overview.
-//
-// When a MSpan is in the heap free list, state == MSpanFree
-// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
-//
-// When a MSpan is allocated, state == MSpanInUse or MSpanStack
-// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-
-static MSpan *MHeap_AllocSpanLocked(MHeap*, uintptr);
-static void MHeap_FreeSpanLocked(MHeap*, MSpan*, bool, bool);
-static bool MHeap_Grow(MHeap*, uintptr);
-static MSpan *MHeap_AllocLarge(MHeap*, uintptr);
-static MSpan *BestFit(MSpan*, uintptr, MSpan*);
-
-static void
-RecordSpan(void *vh, byte *p)
-{
-	MHeap *h;
-	MSpan *s;
-	MSpan **all;
-	uint32 cap;
-
-	h = vh;
-	s = (MSpan*)p;
-	if(h->nspan >= h->nspancap) {
-		cap = 64*1024/sizeof(all[0]);
-		if(cap < h->nspancap*3/2)
-			cap = h->nspancap*3/2;
-		all = (MSpan**)runtime·sysAlloc(cap*sizeof(all[0]), &mstats.other_sys);
-		if(all == nil)
-			runtime·throw("runtime: cannot allocate memory");
-		if(h->allspans) {
-			runtime·memmove(all, h->allspans, h->nspancap*sizeof(all[0]));
-			// Don't free the old array if it's referenced by sweep.
-			// See the comment in mgc0.c.
-			if(h->allspans != runtime·mheap.gcspans)
-				runtime·SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys);
-		}
-		h->allspans = all;
-		h->nspancap = cap;
-	}
-	h->allspans[h->nspan++] = s;
-}
-
-// Initialize the heap; fetch memory using alloc.
-void
-runtime·MHeap_Init(MHeap *h)
-{
-	uint32 i;
-
-	runtime·FixAlloc_Init(&h->spanalloc, sizeof(MSpan), RecordSpan, h, &mstats.mspan_sys);
-	runtime·FixAlloc_Init(&h->cachealloc, sizeof(MCache), nil, nil, &mstats.mcache_sys);
-	runtime·FixAlloc_Init(&h->specialfinalizeralloc, sizeof(SpecialFinalizer), nil, nil, &mstats.other_sys);
-	runtime·FixAlloc_Init(&h->specialprofilealloc, sizeof(SpecialProfile), nil, nil, &mstats.other_sys);
-	// h->mapcache needs no init
-	for(i=0; i<nelem(h->free); i++) {
-		runtime·MSpanList_Init(&h->free[i]);
-		runtime·MSpanList_Init(&h->busy[i]);
-	}
-	runtime·MSpanList_Init(&h->freelarge);
-	runtime·MSpanList_Init(&h->busylarge);
-	for(i=0; i<nelem(h->central); i++)
-		runtime·MCentral_Init(&h->central[i].mcentral, i);
-}
-
-void
-runtime·MHeap_MapSpans(MHeap *h)
-{
-	uintptr n;
-
-	// Map spans array, PageSize at a time.
-	n = (uintptr)h->arena_used;
-	n -= (uintptr)h->arena_start;
-	n = n / PageSize * sizeof(h->spans[0]);
-	n = ROUND(n, PhysPageSize);
-	if(h->spans_mapped >= n)
-		return;
-	runtime·SysMap((byte*)h->spans + h->spans_mapped, n - h->spans_mapped, h->arena_reserved, &mstats.other_sys);
-	h->spans_mapped = n;
-}
-
-// Sweeps spans in list until it reclaims at least npages into the heap.
-// Returns the actual number of pages reclaimed.
-static uintptr
-MHeap_ReclaimList(MHeap *h, MSpan *list, uintptr npages)
-{
-	MSpan *s;
-	uintptr n;
-	uint32 sg;
-
-	n = 0;
-	sg = runtime·mheap.sweepgen;
-retry:
-	for(s = list->next; s != list; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
-			runtime·MSpanList_Remove(s);
-			// swept spans are at the end of the list
-			runtime·MSpanList_InsertBack(list, s);
-			runtime·unlock(&h->lock);
-			n += runtime·MSpan_Sweep(s, false);
-			runtime·lock(&h->lock);
-			if(n >= npages)
-				return n;
-			// the span could have been moved elsewhere
-			goto retry;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by the background sweeper; skip it
-			continue;
-		}
-		// already swept empty span,
-		// all subsequent ones must also be either swept or in process of sweeping
-		break;
-	}
-	return n;
-}
-
-// Sweeps and reclaims at least npage pages into heap.
-// Called before allocating npage pages.
-static void
-MHeap_Reclaim(MHeap *h, uintptr npage)
-{
-	uintptr reclaimed, n;
-
-	// First try to sweep busy spans with large objects of size >= npage,
-	// this has good chances of reclaiming the necessary space.
-	for(n=npage; n < nelem(h->busy); n++) {
-		if(MHeap_ReclaimList(h, &h->busy[n], npage))
-			return;  // Bingo!
-	}
-
-	// Then -- even larger objects.
-	if(MHeap_ReclaimList(h, &h->busylarge, npage))
-		return;  // Bingo!
-
-	// Now try smaller objects.
-	// One such object is not enough, so we need to reclaim several of them.
-	reclaimed = 0;
-	for(n=0; n < npage && n < nelem(h->busy); n++) {
-		reclaimed += MHeap_ReclaimList(h, &h->busy[n], npage-reclaimed);
-		if(reclaimed >= npage)
-			return;
-	}
-
-	// Now sweep everything that is not yet swept.
-	runtime·unlock(&h->lock);
-	for(;;) {
-		n = runtime·sweepone();
-		if(n == -1)  // all spans are swept
-			break;
-		reclaimed += n;
-		if(reclaimed >= npage)
-			break;
-	}
-	runtime·lock(&h->lock);
-}
-
-// Allocate a new span of npage pages from the heap for GC'd memory
-// and record its size class in the HeapMap and HeapMapCache.
-static MSpan*
-mheap_alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large)
-{
-	MSpan *s;
-
-	if(g != g->m->g0)
-		runtime·throw("mheap_alloc not on M stack");
-	runtime·lock(&h->lock);
-
-	// To prevent excessive heap growth, before allocating n pages
-	// we need to sweep and reclaim at least n pages.
-	if(!h->sweepdone)
-		MHeap_Reclaim(h, npage);
-
-	// transfer stats from cache to global
-	mstats.heap_alloc += g->m->mcache->local_cachealloc;
-	g->m->mcache->local_cachealloc = 0;
-	mstats.tinyallocs += g->m->mcache->local_tinyallocs;
-	g->m->mcache->local_tinyallocs = 0;
-
-	s = MHeap_AllocSpanLocked(h, npage);
-	if(s != nil) {
-		// Record span info, because gc needs to be
-		// able to map interior pointer to containing span.
-		runtime·atomicstore(&s->sweepgen, h->sweepgen);
-		s->state = MSpanInUse;
-		s->freelist = nil;
-		s->ref = 0;
-		s->sizeclass = sizeclass;
-		s->elemsize = (sizeclass==0 ? s->npages<<PageShift : runtime·class_to_size[sizeclass]);
-
-		// update stats, sweep lists
-		if(large) {
-			mstats.heap_objects++;
-			mstats.heap_alloc += npage<<PageShift;
-			// Swept spans are at the end of lists.
-			if(s->npages < nelem(h->free))
-				runtime·MSpanList_InsertBack(&h->busy[s->npages], s);
-			else
-				runtime·MSpanList_InsertBack(&h->busylarge, s);
-		}
-	}
-	runtime·unlock(&h->lock);
-	return s;
-}
-
-static void
-mheap_alloc_m(G *gp)
-{
-	MHeap *h;
-	MSpan *s;
-
-	h = g->m->ptrarg[0];
-	g->m->ptrarg[0] = nil;
-	s = mheap_alloc(h, g->m->scalararg[0], g->m->scalararg[1], g->m->scalararg[2]);
-	g->m->ptrarg[0] = s;
-
-	runtime·gogo(&gp->sched);
-}
-
-MSpan*
-runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero)
-{
-	MSpan *s;
-	void (*fn)(G*);
-
-	// Don't do any operations that lock the heap on the G stack.
-	// It might trigger stack growth, and the stack growth code needs
-	// to be able to allocate heap.
-	if(g == g->m->g0) {
-		s = mheap_alloc(h, npage, sizeclass, large);
-	} else {
-		g->m->ptrarg[0] = h;
-		g->m->scalararg[0] = npage;
-		g->m->scalararg[1] = sizeclass;
-		g->m->scalararg[2] = large;
-		fn = mheap_alloc_m;
-		runtime·mcall(&fn);
-		s = g->m->ptrarg[0];
-		g->m->ptrarg[0] = nil;
-	}
-	if(s != nil) {
-		if(needzero && s->needzero)
-			runtime·memclr((byte*)(s->start<<PageShift), s->npages<<PageShift);
-		s->needzero = 0;
-	}
-	return s;
-}
-
-MSpan*
-runtime·MHeap_AllocStack(MHeap *h, uintptr npage)
-{
-	MSpan *s;
-
-	if(g != g->m->g0)
-		runtime·throw("mheap_allocstack not on M stack");
-	runtime·lock(&h->lock);
-	s = MHeap_AllocSpanLocked(h, npage);
-	if(s != nil) {
-		s->state = MSpanStack;
-		s->freelist = nil;
-		s->ref = 0;
-		mstats.stacks_inuse += s->npages<<PageShift;
-	}
-	runtime·unlock(&h->lock);
-	return s;
-}
-
-// Allocates a span of the given size.  h must be locked.
-// The returned span has been removed from the
-// free list, but its state is still MSpanFree.
-static MSpan*
-MHeap_AllocSpanLocked(MHeap *h, uintptr npage)
-{
-	uintptr n;
-	MSpan *s, *t;
-	pageID p;
-
-	// Try in fixed-size lists up to max.
-	for(n=npage; n < nelem(h->free); n++) {
-		if(!runtime·MSpanList_IsEmpty(&h->free[n])) {
-			s = h->free[n].next;
-			goto HaveSpan;
-		}
-	}
-
-	// Best fit in list of large spans.
-	if((s = MHeap_AllocLarge(h, npage)) == nil) {
-		if(!MHeap_Grow(h, npage))
-			return nil;
-		if((s = MHeap_AllocLarge(h, npage)) == nil)
-			return nil;
-	}
-
-HaveSpan:
-	// Mark span in use.
-	if(s->state != MSpanFree)
-		runtime·throw("MHeap_AllocLocked - MSpan not free");
-	if(s->npages < npage)
-		runtime·throw("MHeap_AllocLocked - bad npages");
-	runtime·MSpanList_Remove(s);
-	if(s->next != nil || s->prev != nil)
-		runtime·throw("still in list");
-	if(s->npreleased > 0) {
-		runtime·SysUsed((void*)(s->start<<PageShift), s->npages<<PageShift);
-		mstats.heap_released -= s->npreleased<<PageShift;
-		s->npreleased = 0;
-	}
-
-	if(s->npages > npage) {
-		// Trim extra and put it back in the heap.
-		t = runtime·FixAlloc_Alloc(&h->spanalloc);
-		runtime·MSpan_Init(t, s->start + npage, s->npages - npage);
-		s->npages = npage;
-		p = t->start;
-		p -= ((uintptr)h->arena_start>>PageShift);
-		if(p > 0)
-			h->spans[p-1] = s;
-		h->spans[p] = t;
-		h->spans[p+t->npages-1] = t;
-		t->needzero = s->needzero;
-		s->state = MSpanStack; // prevent coalescing with s
-		t->state = MSpanStack;
-		MHeap_FreeSpanLocked(h, t, false, false);
-		t->unusedsince = s->unusedsince; // preserve age (TODO: wrong: t is possibly merged and/or deallocated at this point)
-		s->state = MSpanFree;
-	}
-	s->unusedsince = 0;
-
-	p = s->start;
-	p -= ((uintptr)h->arena_start>>PageShift);
-	for(n=0; n<npage; n++)
-		h->spans[p+n] = s;
-
-	mstats.heap_inuse += npage<<PageShift;
-	mstats.heap_idle -= npage<<PageShift;
-
-	//runtime·printf("spanalloc %p\n", s->start << PageShift);
-	if(s->next != nil || s->prev != nil)
-		runtime·throw("still in list");
-	return s;
-}
-
-// Allocate a span of exactly npage pages from the list of large spans.
-static MSpan*
-MHeap_AllocLarge(MHeap *h, uintptr npage)
-{
-	return BestFit(&h->freelarge, npage, nil);
-}
-
-// Search list for smallest span with >= npage pages.
-// If there are multiple smallest spans, take the one
-// with the earliest starting address.
-static MSpan*
-BestFit(MSpan *list, uintptr npage, MSpan *best)
-{
-	MSpan *s;
-
-	for(s=list->next; s != list; s=s->next) {
-		if(s->npages < npage)
-			continue;
-		if(best == nil
-		|| s->npages < best->npages
-		|| (s->npages == best->npages && s->start < best->start))
-			best = s;
-	}
-	return best;
-}
-
-// Try to add at least npage pages of memory to the heap,
-// returning whether it worked.
-static bool
-MHeap_Grow(MHeap *h, uintptr npage)
-{
-	uintptr ask;
-	void *v;
-	MSpan *s;
-	pageID p;
-
-	// Ask for a big chunk, to reduce the number of mappings
-	// the operating system needs to track; also amortizes
-	// the overhead of an operating system mapping.
-	// Allocate a multiple of 64kB.
-	npage = ROUND(npage, (64<<10)/PageSize);
-	ask = npage<<PageShift;
-	if(ask < HeapAllocChunk)
-		ask = HeapAllocChunk;
-
-	v = runtime·MHeap_SysAlloc(h, ask);
-	if(v == nil) {
-		if(ask > (npage<<PageShift)) {
-			ask = npage<<PageShift;
-			v = runtime·MHeap_SysAlloc(h, ask);
-		}
-		if(v == nil) {
-			runtime·printf("runtime: out of memory: cannot allocate %D-byte block (%D in use)\n", (uint64)ask, mstats.heap_sys);
-			return false;
-		}
-	}
-
-	// Create a fake "in use" span and free it, so that the
-	// right coalescing happens.
-	s = runtime·FixAlloc_Alloc(&h->spanalloc);
-	runtime·MSpan_Init(s, (uintptr)v>>PageShift, ask>>PageShift);
-	p = s->start;
-	p -= ((uintptr)h->arena_start>>PageShift);
-	h->spans[p] = s;
-	h->spans[p + s->npages - 1] = s;
-	runtime·atomicstore(&s->sweepgen, h->sweepgen);
-	s->state = MSpanInUse;
-	MHeap_FreeSpanLocked(h, s, false, true);
-	return true;
-}
-
-// Look up the span at the given address.
-// Address is guaranteed to be in map
-// and is guaranteed to be start or end of span.
-MSpan*
-runtime·MHeap_Lookup(MHeap *h, void *v)
-{
-	uintptr p;
-	
-	p = (uintptr)v;
-	p -= (uintptr)h->arena_start;
-	return h->spans[p >> PageShift];
-}
-
-// Look up the span at the given address.
-// Address is *not* guaranteed to be in map
-// and may be anywhere in the span.
-// Map entries for the middle of a span are only
-// valid for allocated spans.  Free spans may have
-// other garbage in their middles, so we have to
-// check for that.
-MSpan*
-runtime·MHeap_LookupMaybe(MHeap *h, void *v)
-{
-	MSpan *s;
-	pageID p, q;
-
-	if((byte*)v < h->arena_start || (byte*)v >= h->arena_used)
-		return nil;
-	p = (uintptr)v>>PageShift;
-	q = p;
-	q -= (uintptr)h->arena_start >> PageShift;
-	s = h->spans[q];
-	if(s == nil || p < s->start || v >= s->limit || s->state != MSpanInUse)
-		return nil;
-	return s;
-}
-
-// Free the span back into the heap.
-static void
-mheap_free(MHeap *h, MSpan *s, int32 acct)
-{
-	if(g != g->m->g0)
-		runtime·throw("mheap_free not on M stack");
-	runtime·lock(&h->lock);
-	mstats.heap_alloc += g->m->mcache->local_cachealloc;
-	g->m->mcache->local_cachealloc = 0;
-	mstats.tinyallocs += g->m->mcache->local_tinyallocs;
-	g->m->mcache->local_tinyallocs = 0;
-	if(acct) {
-		mstats.heap_alloc -= s->npages<<PageShift;
-		mstats.heap_objects--;
-	}
-	MHeap_FreeSpanLocked(h, s, true, true);
-	runtime·unlock(&h->lock);
-}
-
-static void
-mheap_free_m(G *gp)
-{
-	MHeap *h;
-	MSpan *s;
-	
-	h = g->m->ptrarg[0];
-	s = g->m->ptrarg[1];
-	g->m->ptrarg[0] = nil;
-	g->m->ptrarg[1] = nil;
-	mheap_free(h, s, g->m->scalararg[0]);
-	runtime·gogo(&gp->sched);
-}
-
-void
-runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct)
-{
-	void (*fn)(G*);
-
-	if(g == g->m->g0) {
-		mheap_free(h, s, acct);
-	} else {
-		g->m->ptrarg[0] = h;
-		g->m->ptrarg[1] = s;
-		g->m->scalararg[0] = acct;
-		fn = mheap_free_m;
-		runtime·mcall(&fn);
-	}
-}
-
-void
-runtime·MHeap_FreeStack(MHeap *h, MSpan *s)
-{
-	if(g != g->m->g0)
-		runtime·throw("mheap_freestack not on M stack");
-	s->needzero = 1;
-	runtime·lock(&h->lock);
-	mstats.stacks_inuse -= s->npages<<PageShift;
-	MHeap_FreeSpanLocked(h, s, true, true);
-	runtime·unlock(&h->lock);
-}
-
-static void
-MHeap_FreeSpanLocked(MHeap *h, MSpan *s, bool acctinuse, bool acctidle)
-{
-	MSpan *t;
-	pageID p;
-
-	switch(s->state) {
-	case MSpanStack:
-		if(s->ref != 0)
-			runtime·throw("MHeap_FreeSpanLocked - invalid stack free");
-		break;
-	case MSpanInUse:
-		if(s->ref != 0 || s->sweepgen != h->sweepgen) {
-			runtime·printf("MHeap_FreeSpanLocked - span %p ptr %p ref %d sweepgen %d/%d\n",
-				       s, s->start<<PageShift, s->ref, s->sweepgen, h->sweepgen);
-			runtime·throw("MHeap_FreeSpanLocked - invalid free");
-		}
-		break;
-	default:
-		runtime·throw("MHeap_FreeSpanLocked - invalid span state");
-		break;
-	}
-	if(acctinuse)
-		mstats.heap_inuse -= s->npages<<PageShift;
-	if(acctidle)
-		mstats.heap_idle += s->npages<<PageShift;
-	s->state = MSpanFree;
-	runtime·MSpanList_Remove(s);
-	// Stamp newly unused spans. The scavenger will use that
-	// info to potentially give back some pages to the OS.
-	s->unusedsince = runtime·nanotime();
-	s->npreleased = 0;
-
-	// Coalesce with earlier, later spans.
-	p = s->start;
-	p -= (uintptr)h->arena_start >> PageShift;
-	if(p > 0 && (t = h->spans[p-1]) != nil && t->state != MSpanInUse && t->state != MSpanStack) {
-		s->start = t->start;
-		s->npages += t->npages;
-		s->npreleased = t->npreleased; // absorb released pages
-		s->needzero |= t->needzero;
-		p -= t->npages;
-		h->spans[p] = s;
-		runtime·MSpanList_Remove(t);
-		t->state = MSpanDead;
-		runtime·FixAlloc_Free(&h->spanalloc, t);
-	}
-	if((p+s->npages)*sizeof(h->spans[0]) < h->spans_mapped && (t = h->spans[p+s->npages]) != nil && t->state != MSpanInUse && t->state != MSpanStack) {
-		s->npages += t->npages;
-		s->npreleased += t->npreleased;
-		s->needzero |= t->needzero;
-		h->spans[p + s->npages - 1] = s;
-		runtime·MSpanList_Remove(t);
-		t->state = MSpanDead;
-		runtime·FixAlloc_Free(&h->spanalloc, t);
-	}
-
-	// Insert s into appropriate list.
-	if(s->npages < nelem(h->free))
-		runtime·MSpanList_Insert(&h->free[s->npages], s);
-	else
-		runtime·MSpanList_Insert(&h->freelarge, s);
-}
-
-static uintptr
-scavengelist(MSpan *list, uint64 now, uint64 limit)
-{
-	uintptr released, sumreleased;
-	MSpan *s;
-
-	if(runtime·MSpanList_IsEmpty(list))
-		return 0;
-
-	sumreleased = 0;
-	for(s=list->next; s != list; s=s->next) {
-		if((now - s->unusedsince) > limit && s->npreleased != s->npages) {
-			released = (s->npages - s->npreleased) << PageShift;
-			mstats.heap_released += released;
-			sumreleased += released;
-			s->npreleased = s->npages;
-			runtime·SysUnused((void*)(s->start << PageShift), s->npages << PageShift);
-		}
-	}
-	return sumreleased;
-}
-
-void
-runtime·MHeap_Scavenge(int32 k, uint64 now, uint64 limit)
-{
-	uint32 i;
-	uintptr sumreleased;
-	MHeap *h;
-	
-	h = &runtime·mheap;
-	runtime·lock(&h->lock);
-	sumreleased = 0;
-	for(i=0; i < nelem(h->free); i++)
-		sumreleased += scavengelist(&h->free[i], now, limit);
-	sumreleased += scavengelist(&h->freelarge, now, limit);
-	runtime·unlock(&h->lock);
-
-	if(runtime·debug.gctrace > 0) {
-		if(sumreleased > 0)
-			runtime·printf("scvg%d: %D MB released\n", k, (uint64)sumreleased>>20);
-		// TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
-		// But we can't call ReadMemStats on g0 holding locks.
-		runtime·printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n",
-			k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20,
-			mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20);
-	}
-}
-
-void
-runtime·scavenge_m(void)
-{
-	runtime·MHeap_Scavenge(-1, ~(uintptr)0, 0);
-}
-
-// Initialize a new span with the given start and npages.
-void
-runtime·MSpan_Init(MSpan *span, pageID start, uintptr npages)
-{
-	span->next = nil;
-	span->prev = nil;
-	span->start = start;
-	span->npages = npages;
-	span->freelist = nil;
-	span->ref = 0;
-	span->sizeclass = 0;
-	span->incache = false;
-	span->elemsize = 0;
-	span->state = MSpanDead;
-	span->unusedsince = 0;
-	span->npreleased = 0;
-	span->specialLock.key = 0;
-	span->specials = nil;
-	span->needzero = 0;
-}
-
-// Initialize an empty doubly-linked list.
-void
-runtime·MSpanList_Init(MSpan *list)
-{
-	list->state = MSpanListHead;
-	list->next = list;
-	list->prev = list;
-}
-
-void
-runtime·MSpanList_Remove(MSpan *span)
-{
-	if(span->prev == nil && span->next == nil)
-		return;
-	span->prev->next = span->next;
-	span->next->prev = span->prev;
-	span->prev = nil;
-	span->next = nil;
-}
-
-bool
-runtime·MSpanList_IsEmpty(MSpan *list)
-{
-	return list->next == list;
-}
-
-void
-runtime·MSpanList_Insert(MSpan *list, MSpan *span)
-{
-	if(span->next != nil || span->prev != nil) {
-		runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev);
-		runtime·throw("MSpanList_Insert");
-	}
-	span->next = list->next;
-	span->prev = list;
-	span->next->prev = span;
-	span->prev->next = span;
-}
-
-void
-runtime·MSpanList_InsertBack(MSpan *list, MSpan *span)
-{
-	if(span->next != nil || span->prev != nil) {
-		runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev);
-		runtime·throw("MSpanList_Insert");
-	}
-	span->next = list;
-	span->prev = list->prev;
-	span->next->prev = span;
-	span->prev->next = span;
-}
-
-// Adds the special record s to the list of special records for
-// the object p.  All fields of s should be filled in except for
-// offset & next, which this routine will fill in.
-// Returns true if the special was successfully added, false otherwise.
-// (The add will fail only if a record with the same p and s->kind
-//  already exists.)
-static bool
-addspecial(void *p, Special *s)
-{
-	MSpan *span;
-	Special **t, *x;
-	uintptr offset;
-	byte kind;
-
-	span = runtime·MHeap_LookupMaybe(&runtime·mheap, p);
-	if(span == nil)
-		runtime·throw("addspecial on invalid pointer");
-
-	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
-	g->m->locks++;
-	runtime·MSpan_EnsureSwept(span);
-
-	offset = (uintptr)p - (span->start << PageShift);
-	kind = s->kind;
-
-	runtime·lock(&span->specialLock);
-
-	// Find splice point, check for existing record.
-	t = &span->specials;
-	while((x = *t) != nil) {
-		if(offset == x->offset && kind == x->kind) {
-			runtime·unlock(&span->specialLock);
-			g->m->locks--;
-			return false; // already exists
-		}
-		if(offset < x->offset || (offset == x->offset && kind < x->kind))
-			break;
-		t = &x->next;
-	}
-	// Splice in record, fill in offset.
-	s->offset = offset;
-	s->next = x;
-	*t = s;
-	runtime·unlock(&span->specialLock);
-	g->m->locks--;
-	return true;
-}
-
-// Removes the Special record of the given kind for the object p.
-// Returns the record if the record existed, nil otherwise.
-// The caller must FixAlloc_Free the result.
-static Special*
-removespecial(void *p, byte kind)
-{
-	MSpan *span;
-	Special *s, **t;
-	uintptr offset;
-
-	span = runtime·MHeap_LookupMaybe(&runtime·mheap, p);
-	if(span == nil)
-		runtime·throw("removespecial on invalid pointer");
-
-	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
-	g->m->locks++;
-	runtime·MSpan_EnsureSwept(span);
-
-	offset = (uintptr)p - (span->start << PageShift);
-
-	runtime·lock(&span->specialLock);
-	t = &span->specials;
-	while((s = *t) != nil) {
-		// This function is used for finalizers only, so we don't check for
-		// "interior" specials (p must be exactly equal to s->offset).
-		if(offset == s->offset && kind == s->kind) {
-			*t = s->next;
-			runtime·unlock(&span->specialLock);
-			g->m->locks--;
-			return s;
-		}
-		t = &s->next;
-	}
-	runtime·unlock(&span->specialLock);
-	g->m->locks--;
-	return nil;
-}
-
-// Adds a finalizer to the object p.  Returns true if it succeeded.
-bool
-runtime·addfinalizer(void *p, FuncVal *f, uintptr nret, Type *fint, PtrType *ot)
-{
-	SpecialFinalizer *s;
-
-	runtime·lock(&runtime·mheap.speciallock);
-	s = runtime·FixAlloc_Alloc(&runtime·mheap.specialfinalizeralloc);
-	runtime·unlock(&runtime·mheap.speciallock);
-	s->special.kind = KindSpecialFinalizer;
-	s->fn = f;
-	s->nret = nret;
-	s->fint = fint;
-	s->ot = ot;
-	if(addspecial(p, &s->special))
-		return true;
-
-	// There was an old finalizer
-	runtime·lock(&runtime·mheap.speciallock);
-	runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, s);
-	runtime·unlock(&runtime·mheap.speciallock);
-	return false;
-}
-
-// Removes the finalizer (if any) from the object p.
-void
-runtime·removefinalizer(void *p)
-{
-	SpecialFinalizer *s;
-
-	s = (SpecialFinalizer*)removespecial(p, KindSpecialFinalizer);
-	if(s == nil)
-		return; // there wasn't a finalizer to remove
-	runtime·lock(&runtime·mheap.speciallock);
-	runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, s);
-	runtime·unlock(&runtime·mheap.speciallock);
-}
-
-// Set the heap profile bucket associated with addr to b.
-void
-runtime·setprofilebucket_m(void)
-{	
-	void *p;
-	Bucket *b;
-	SpecialProfile *s;
-	
-	p = g->m->ptrarg[0];
-	b = g->m->ptrarg[1];
-	g->m->ptrarg[0] = nil;
-	g->m->ptrarg[1] = nil;
-
-	runtime·lock(&runtime·mheap.speciallock);
-	s = runtime·FixAlloc_Alloc(&runtime·mheap.specialprofilealloc);
-	runtime·unlock(&runtime·mheap.speciallock);
-	s->special.kind = KindSpecialProfile;
-	s->b = b;
-	if(!addspecial(p, &s->special))
-		runtime·throw("setprofilebucket: profile already set");
-}
-
-// Do whatever cleanup needs to be done to deallocate s.  It has
-// already been unlinked from the MSpan specials list.
-// Returns true if we should keep working on deallocating p.
-bool
-runtime·freespecial(Special *s, void *p, uintptr size, bool freed)
-{
-	SpecialFinalizer *sf;
-	SpecialProfile *sp;
-
-	switch(s->kind) {
-	case KindSpecialFinalizer:
-		sf = (SpecialFinalizer*)s;
-		runtime·queuefinalizer(p, sf->fn, sf->nret, sf->fint, sf->ot);
-		runtime·lock(&runtime·mheap.speciallock);
-		runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, sf);
-		runtime·unlock(&runtime·mheap.speciallock);
-		return false; // don't free p until finalizer is done
-	case KindSpecialProfile:
-		sp = (SpecialProfile*)s;
-		runtime·mProf_Free(sp->b, size, freed);
-		runtime·lock(&runtime·mheap.speciallock);
-		runtime·FixAlloc_Free(&runtime·mheap.specialprofilealloc, sp);
-		runtime·unlock(&runtime·mheap.speciallock);
-		return true;
-	default:
-		runtime·throw("bad special kind");
-		return true;
-	}
-}
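
Both the deleted C addspecial and the Go version below keep a span's specials list sorted by offset and splice new records in with a pointer-to-pointer walk, so the list head needs no special case. A minimal standalone sketch of that pattern, using a hypothetical node type rather than the runtime's special struct:

package main

import "fmt"

// node is a hypothetical stand-in for the runtime's special record.
type node struct {
	offset uintptr
	next   *node
}

// insertSorted splices n into a list kept sorted by offset, using a
// pointer-to-pointer walk so the head needs no special case.
// It reports false if an entry with the same offset already exists,
// mirroring how addspecial rejects duplicate records.
func insertSorted(head **node, n *node) bool {
	t := head
	for x := *t; x != nil; x = *t {
		if x.offset == n.offset {
			return false // already exists
		}
		if n.offset < x.offset {
			break
		}
		t = &x.next
	}
	n.next = *t
	*t = n
	return true
}

func main() {
	var head *node
	for _, off := range []uintptr{32, 8, 16, 8} {
		fmt.Println(off, insertSorted(&head, &node{offset: off}))
	}
	for x := head; x != nil; x = x.next {
		fmt.Print(x.offset, " ")
	}
	fmt.Println()
}
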
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
new file mode 100644
index 0000000..b451b63
--- /dev/null
+++ b/src/runtime/mheap.go
@@ -0,0 +1,785 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page heap.
+//
+// See malloc.h for overview.
+//
+// When a MSpan is in the heap free list, state == MSpanFree
+// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
+//
+// When a MSpan is allocated, state == MSpanInUse or MSpanStack
+// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
+
+package runtime
+
+import "unsafe"
+
+var h_allspans []*mspan // TODO: make this h.allspans once mheap can be defined in Go
+var h_spans []*mspan    // TODO: make this h.spans once mheap can be defined in Go
+
+func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
+	h := (*mheap)(vh)
+	s := (*mspan)(p)
+	if len(h_allspans) >= cap(h_allspans) {
+		n := 64 * 1024 / ptrSize
+		if n < cap(h_allspans)*3/2 {
+			n = cap(h_allspans) * 3 / 2
+		}
+		var new []*mspan
+		sp := (*slice)(unsafe.Pointer(&new))
+		sp.array = (*byte)(sysAlloc(uintptr(n)*ptrSize, &memstats.other_sys))
+		if sp.array == nil {
+			gothrow("runtime: cannot allocate memory")
+		}
+		sp.len = uint(len(h_allspans))
+		sp.cap = uint(n)
+		if len(h_allspans) > 0 {
+			copy(new, h_allspans)
+			// Don't free the old array if it's referenced by sweep.
+			// See the comment in mgc0.c.
+			if h.allspans != mheap_.gcspans {
+				sysFree(unsafe.Pointer(h.allspans), uintptr(cap(h_allspans))*ptrSize, &memstats.other_sys)
+			}
+		}
+		h_allspans = new
+		h.allspans = (**mspan)(unsafe.Pointer(sp.array))
+	}
+	h_allspans = append(h_allspans, s)
+	h.nspan = uint32(len(h_allspans))
+}
+
+// Initialize the heap.
+func mHeap_Init(h *mheap, spans_size uintptr) {
+	fixAlloc_Init(&h.spanalloc, unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+	fixAlloc_Init(&h.cachealloc, unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
+	fixAlloc_Init(&h.specialfinalizeralloc, unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
+	fixAlloc_Init(&h.specialprofilealloc, unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+
+	// h->mapcache needs no init
+	for i := range h.free {
+		mSpanList_Init(&h.free[i])
+		mSpanList_Init(&h.busy[i])
+	}
+
+	mSpanList_Init(&h.freelarge)
+	mSpanList_Init(&h.busylarge)
+	for i := range h.central {
+		mCentral_Init(&h.central[i].mcentral, int32(i))
+	}
+
+	sp := (*slice)(unsafe.Pointer(&h_spans))
+	sp.array = (*byte)(unsafe.Pointer(h.spans))
+	sp.len = uint(spans_size / ptrSize)
+	sp.cap = uint(spans_size / ptrSize)
+}
+
+func mHeap_MapSpans(h *mheap) {
+	// Map spans array, PageSize at a time.
+	n := uintptr(unsafe.Pointer(h.arena_used))
+	n -= uintptr(unsafe.Pointer(h.arena_start))
+	n = n / _PageSize * ptrSize
+	n = round(n, _PhysPageSize)
+	if h.spans_mapped >= n {
+		return
+	}
+	sysMap(add(unsafe.Pointer(h.spans), h.spans_mapped), n-h.spans_mapped, h.arena_reserved, &memstats.other_sys)
+	h.spans_mapped = n
+}
+
+// Sweeps spans in list until it reclaims at least npages pages into the heap.
+// Returns the actual number of pages reclaimed.
+func mHeap_ReclaimList(h *mheap, list *mspan, npages uintptr) uintptr {
+	n := uintptr(0)
+	sg := mheap_.sweepgen
+retry:
+	for s := list.next; s != list; s = s.next {
+		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+			mSpanList_Remove(s)
+			// swept spans are at the end of the list
+			mSpanList_InsertBack(list, s)
+			unlock(&h.lock)
+			if mSpan_Sweep(s, false) {
+				// TODO(rsc,dvyukov): This is probably wrong.
+				// It is undercounting the number of pages reclaimed.
+				// See golang.org/issue/9048.
+				// Note that if we want to add the true count of s's pages,
+				// we must record that before calling mSpan_Sweep,
+				// because if mSpan_Sweep returns true the span has
+				// freed back to the heap, so s is no longer safe to use.
+				n++
+			}
+			lock(&h.lock)
+			if n >= npages {
+				return n
+			}
+			// the span could have been moved elsewhere
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by the background sweeper, skip it
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in the process of sweeping
+		break
+	}
+	return n
+}
+
+// Sweeps and reclaims at least npage pages into heap.
+// Called before allocating npage pages.
+func mHeap_Reclaim(h *mheap, npage uintptr) {
+	// First try to sweep busy spans with large objects of size >= npage,
+	// this has good chances of reclaiming the necessary space.
+	for i := int(npage); i < len(h.busy); i++ {
+		if mHeap_ReclaimList(h, &h.busy[i], npage) != 0 {
+			return // Bingo!
+		}
+	}
+
+	// Then -- even larger objects.
+	if mHeap_ReclaimList(h, &h.busylarge, npage) != 0 {
+		return // Bingo!
+	}
+
+	// Now try smaller objects.
+	// One such object is not enough, so we need to reclaim several of them.
+	reclaimed := uintptr(0)
+	for i := 0; i < int(npage) && i < len(h.busy); i++ {
+		reclaimed += mHeap_ReclaimList(h, &h.busy[i], npage-reclaimed)
+		if reclaimed >= npage {
+			return
+		}
+	}
+
+	// Now sweep everything that is not yet swept.
+	unlock(&h.lock)
+	for {
+		n := sweepone()
+		if n == ^uintptr(0) { // all spans are swept
+			break
+		}
+		reclaimed += n
+		if reclaimed >= npage {
+			break
+		}
+	}
+	lock(&h.lock)
+}
+
+// Allocate a new span of npage pages from the heap for GC'd memory
+// and record its size class in the HeapMap and HeapMapCache.
+func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		gothrow("_mheap_alloc not on M stack")
+	}
+	lock(&h.lock)
+
+	// To prevent excessive heap growth, before allocating n pages
+	// we need to sweep and reclaim at least n pages.
+	if h.sweepdone == 0 {
+		mHeap_Reclaim(h, npage)
+	}
+
+	// transfer stats from cache to global
+	memstats.heap_alloc += uint64(_g_.m.mcache.local_cachealloc)
+	_g_.m.mcache.local_cachealloc = 0
+	memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
+	_g_.m.mcache.local_tinyallocs = 0
+
+	s := mHeap_AllocSpanLocked(h, npage)
+	if s != nil {
+		// Record span info, because gc needs to be
+		// able to map interior pointer to containing span.
+		atomicstore(&s.sweepgen, h.sweepgen)
+		s.state = _MSpanInUse
+		s.freelist = nil
+		s.ref = 0
+		s.sizeclass = uint8(sizeclass)
+		if sizeclass == 0 {
+			s.elemsize = s.npages << _PageShift
+		} else {
+			s.elemsize = uintptr(class_to_size[sizeclass])
+		}
+
+		// update stats, sweep lists
+		if large {
+			memstats.heap_objects++
+			memstats.heap_alloc += uint64(npage << _PageShift)
+			// Swept spans are at the end of lists.
+			if s.npages < uintptr(len(h.free)) {
+				mSpanList_InsertBack(&h.busy[s.npages], s)
+			} else {
+				mSpanList_InsertBack(&h.busylarge, s)
+			}
+		}
+	}
+	unlock(&h.lock)
+	return s
+}
+
+func mHeap_Alloc(h *mheap, npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+	// Don't do any operations that lock the heap on the G stack.
+	// It might trigger stack growth, and the stack growth code needs
+	// to be able to allocate heap.
+	var s *mspan
+	onM(func() {
+		s = mHeap_Alloc_m(h, npage, sizeclass, large)
+	})
+
+	if s != nil {
+		if needzero && s.needzero != 0 {
+			memclr(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift)
+		}
+		s.needzero = 0
+	}
+	return s
+}
+
+func mHeap_AllocStack(h *mheap, npage uintptr) *mspan {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		gothrow("mheap_allocstack not on M stack")
+	}
+	lock(&h.lock)
+	s := mHeap_AllocSpanLocked(h, npage)
+	if s != nil {
+		s.state = _MSpanStack
+		s.freelist = nil
+		s.ref = 0
+		memstats.stacks_inuse += uint64(s.npages << _PageShift)
+	}
+	unlock(&h.lock)
+	return s
+}
+
+// Allocates a span of the given size.  h must be locked.
+// The returned span has been removed from the
+// free list, but its state is still MSpanFree.
+func mHeap_AllocSpanLocked(h *mheap, npage uintptr) *mspan {
+	var s *mspan
+
+	// Try in fixed-size lists up to max.
+	for i := int(npage); i < len(h.free); i++ {
+		if !mSpanList_IsEmpty(&h.free[i]) {
+			s = h.free[i].next
+			goto HaveSpan
+		}
+	}
+
+	// Best fit in list of large spans.
+	s = mHeap_AllocLarge(h, npage)
+	if s == nil {
+		if !mHeap_Grow(h, npage) {
+			return nil
+		}
+		s = mHeap_AllocLarge(h, npage)
+		if s == nil {
+			return nil
+		}
+	}
+
+HaveSpan:
+	// Mark span in use.
+	if s.state != _MSpanFree {
+		gothrow("MHeap_AllocLocked - MSpan not free")
+	}
+	if s.npages < npage {
+		gothrow("MHeap_AllocLocked - bad npages")
+	}
+	mSpanList_Remove(s)
+	if s.next != nil || s.prev != nil {
+		gothrow("still in list")
+	}
+	if s.npreleased > 0 {
+		sysUsed((unsafe.Pointer)(s.start<<_PageShift), s.npages<<_PageShift)
+		memstats.heap_released -= uint64(s.npreleased << _PageShift)
+		s.npreleased = 0
+	}
+
+	if s.npages > npage {
+		// Trim extra and put it back in the heap.
+		t := (*mspan)(fixAlloc_Alloc(&h.spanalloc))
+		mSpan_Init(t, s.start+pageID(npage), s.npages-npage)
+		s.npages = npage
+		p := uintptr(t.start)
+		p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+		if p > 0 {
+			h_spans[p-1] = s
+		}
+		h_spans[p] = t
+		h_spans[p+t.npages-1] = t
+		t.needzero = s.needzero
+		s.state = _MSpanStack // prevent coalescing with s
+		t.state = _MSpanStack
+		mHeap_FreeSpanLocked(h, t, false, false)
+		t.unusedsince = s.unusedsince // preserve age (TODO: wrong: t is possibly merged and/or deallocated at this point)
+		s.state = _MSpanFree
+	}
+	s.unusedsince = 0
+
+	p := uintptr(s.start)
+	p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+	for n := uintptr(0); n < npage; n++ {
+		h_spans[p+n] = s
+	}
+
+	memstats.heap_inuse += uint64(npage << _PageShift)
+	memstats.heap_idle -= uint64(npage << _PageShift)
+
+	//println("spanalloc", hex(s.start<<_PageShift))
+	if s.next != nil || s.prev != nil {
+		gothrow("still in list")
+	}
+	return s
+}
+
+// Allocate a span of exactly npage pages from the list of large spans.
+func mHeap_AllocLarge(h *mheap, npage uintptr) *mspan {
+	return bestFit(&h.freelarge, npage, nil)
+}
+
+// Search list for smallest span with >= npage pages.
+// If there are multiple smallest spans, take the one
+// with the earliest starting address.
+func bestFit(list *mspan, npage uintptr, best *mspan) *mspan {
+	for s := list.next; s != list; s = s.next {
+		if s.npages < npage {
+			continue
+		}
+		if best == nil || s.npages < best.npages || (s.npages == best.npages && s.start < best.start) {
+			best = s
+		}
+	}
+	return best
+}
+
+// Try to add at least npage pages of memory to the heap,
+// returning whether it worked.
+func mHeap_Grow(h *mheap, npage uintptr) bool {
+	// Ask for a big chunk, to reduce the number of mappings
+	// the operating system needs to track; also amortizes
+	// the overhead of an operating system mapping.
+	// Allocate a multiple of 64kB.
+	npage = round(npage, (64<<10)/_PageSize)
+	ask := npage << _PageShift
+	if ask < _HeapAllocChunk {
+		ask = _HeapAllocChunk
+	}
+
+	v := mHeap_SysAlloc(h, ask)
+	if v == nil {
+		if ask > npage<<_PageShift {
+			ask = npage << _PageShift
+			v = mHeap_SysAlloc(h, ask)
+		}
+		if v == nil {
+			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
+			return false
+		}
+	}
+
+	// Create a fake "in use" span and free it, so that the
+	// right coalescing happens.
+	s := (*mspan)(fixAlloc_Alloc(&h.spanalloc))
+	mSpan_Init(s, pageID(uintptr(v)>>_PageShift), ask>>_PageShift)
+	p := uintptr(s.start)
+	p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+	h_spans[p] = s
+	h_spans[p+s.npages-1] = s
+	atomicstore(&s.sweepgen, h.sweepgen)
+	s.state = _MSpanInUse
+	mHeap_FreeSpanLocked(h, s, false, true)
+	return true
+}
+
+// Look up the span at the given address.
+// Address is guaranteed to be in map
+// and is guaranteed to be start or end of span.
+func mHeap_Lookup(h *mheap, v unsafe.Pointer) *mspan {
+	p := uintptr(v)
+	p -= uintptr(unsafe.Pointer(h.arena_start))
+	return h_spans[p>>_PageShift]
+}
+
+// Look up the span at the given address.
+// Address is *not* guaranteed to be in map
+// and may be anywhere in the span.
+// Map entries for the middle of a span are only
+// valid for allocated spans.  Free spans may have
+// other garbage in their middles, so we have to
+// check for that.
+func mHeap_LookupMaybe(h *mheap, v unsafe.Pointer) *mspan {
+	if uintptr(v) < uintptr(unsafe.Pointer(h.arena_start)) || uintptr(v) >= uintptr(unsafe.Pointer(h.arena_used)) {
+		return nil
+	}
+	p := uintptr(v) >> _PageShift
+	q := p
+	q -= uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift
+	s := h_spans[q]
+	if s == nil || p < uintptr(s.start) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse {
+		return nil
+	}
+	return s
+}
+
+// Free the span back into the heap.
+func mHeap_Free(h *mheap, s *mspan, acct int32) {
+	onM(func() {
+		mp := getg().m
+		lock(&h.lock)
+		memstats.heap_alloc += uint64(mp.mcache.local_cachealloc)
+		mp.mcache.local_cachealloc = 0
+		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
+		mp.mcache.local_tinyallocs = 0
+		if acct != 0 {
+			memstats.heap_alloc -= uint64(s.npages << _PageShift)
+			memstats.heap_objects--
+		}
+		mHeap_FreeSpanLocked(h, s, true, true)
+		unlock(&h.lock)
+	})
+}
+
+func mHeap_FreeStack(h *mheap, s *mspan) {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		gothrow("mheap_freestack not on M stack")
+	}
+	s.needzero = 1
+	lock(&h.lock)
+	memstats.stacks_inuse -= uint64(s.npages << _PageShift)
+	mHeap_FreeSpanLocked(h, s, true, true)
+	unlock(&h.lock)
+}
+
+func mHeap_FreeSpanLocked(h *mheap, s *mspan, acctinuse, acctidle bool) {
+	switch s.state {
+	case _MSpanStack:
+		if s.ref != 0 {
+			gothrow("MHeap_FreeSpanLocked - invalid stack free")
+		}
+	case _MSpanInUse:
+		if s.ref != 0 || s.sweepgen != h.sweepgen {
+			print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " ref ", s.ref, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
+			gothrow("MHeap_FreeSpanLocked - invalid free")
+		}
+	default:
+		gothrow("MHeap_FreeSpanLocked - invalid span state")
+	}
+
+	if acctinuse {
+		memstats.heap_inuse -= uint64(s.npages << _PageShift)
+	}
+	if acctidle {
+		memstats.heap_idle += uint64(s.npages << _PageShift)
+	}
+	s.state = _MSpanFree
+	mSpanList_Remove(s)
+
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	s.unusedsince = nanotime()
+	s.npreleased = 0
+
+	// Coalesce with earlier, later spans.
+	p := uintptr(s.start)
+	p -= uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift
+	if p > 0 {
+		t := h_spans[p-1]
+		if t != nil && t.state != _MSpanInUse && t.state != _MSpanStack {
+			s.start = t.start
+			s.npages += t.npages
+			s.npreleased = t.npreleased // absorb released pages
+			s.needzero |= t.needzero
+			p -= t.npages
+			h_spans[p] = s
+			mSpanList_Remove(t)
+			t.state = _MSpanDead
+			fixAlloc_Free(&h.spanalloc, (unsafe.Pointer)(t))
+		}
+	}
+	if (p+s.npages)*ptrSize < h.spans_mapped {
+		t := h_spans[p+s.npages]
+		if t != nil && t.state != _MSpanInUse && t.state != _MSpanStack {
+			s.npages += t.npages
+			s.npreleased += t.npreleased
+			s.needzero |= t.needzero
+			h_spans[p+s.npages-1] = s
+			mSpanList_Remove(t)
+			t.state = _MSpanDead
+			fixAlloc_Free(&h.spanalloc, (unsafe.Pointer)(t))
+		}
+	}
+
+	// Insert s into appropriate list.
+	if s.npages < uintptr(len(h.free)) {
+		mSpanList_Insert(&h.free[s.npages], s)
+	} else {
+		mSpanList_Insert(&h.freelarge, s)
+	}
+}
+
+func scavengelist(list *mspan, now, limit uint64) uintptr {
+	if mSpanList_IsEmpty(list) {
+		return 0
+	}
+
+	var sumreleased uintptr
+	for s := list.next; s != list; s = s.next {
+		if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
+			released := (s.npages - s.npreleased) << _PageShift
+			memstats.heap_released += uint64(released)
+			sumreleased += released
+			s.npreleased = s.npages
+			sysUnused((unsafe.Pointer)(s.start<<_PageShift), s.npages<<_PageShift)
+		}
+	}
+	return sumreleased
+}
+
+func mHeap_Scavenge(k int32, now, limit uint64) {
+	h := &mheap_
+	lock(&h.lock)
+	var sumreleased uintptr
+	for i := 0; i < len(h.free); i++ {
+		sumreleased += scavengelist(&h.free[i], now, limit)
+	}
+	sumreleased += scavengelist(&h.freelarge, now, limit)
+	unlock(&h.lock)
+
+	if debug.gctrace > 0 {
+		if sumreleased > 0 {
+			print("scvg", k, ": ", sumreleased>>20, " MB released\n")
+		}
+		// TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
+		// But we can't call ReadMemStats on g0 holding locks.
+		print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
+	}
+}
+
+func scavenge_m() {
+	mHeap_Scavenge(-1, ^uint64(0), 0)
+}
+
+// Initialize a new span with the given start and npages.
+func mSpan_Init(span *mspan, start pageID, npages uintptr) {
+	span.next = nil
+	span.prev = nil
+	span.start = start
+	span.npages = npages
+	span.freelist = nil
+	span.ref = 0
+	span.sizeclass = 0
+	span.incache = false
+	span.elemsize = 0
+	span.state = _MSpanDead
+	span.unusedsince = 0
+	span.npreleased = 0
+	span.speciallock.key = 0
+	span.specials = nil
+	span.needzero = 0
+}
+
+// Initialize an empty doubly-linked list.
+func mSpanList_Init(list *mspan) {
+	list.state = _MSpanListHead
+	list.next = list
+	list.prev = list
+}
+
+func mSpanList_Remove(span *mspan) {
+	if span.prev == nil && span.next == nil {
+		return
+	}
+	span.prev.next = span.next
+	span.next.prev = span.prev
+	span.prev = nil
+	span.next = nil
+}
+
+func mSpanList_IsEmpty(list *mspan) bool {
+	return list.next == list
+}
+
+func mSpanList_Insert(list *mspan, span *mspan) {
+	if span.next != nil || span.prev != nil {
+		println("failed MSpanList_Insert", span, span.next, span.prev)
+		gothrow("MSpanList_Insert")
+	}
+	span.next = list.next
+	span.prev = list
+	span.next.prev = span
+	span.prev.next = span
+}
+
+func mSpanList_InsertBack(list *mspan, span *mspan) {
+	if span.next != nil || span.prev != nil {
+		println("failed MSpanList_InsertBack", span, span.next, span.prev)
+		gothrow("MSpanList_InsertBack")
+	}
+	span.next = list
+	span.prev = list.prev
+	span.next.prev = span
+	span.prev.next = span
+}
+
+// Adds the special record s to the list of special records for
+// the object p.  All fields of s should be filled in except for
+// offset & next, which this routine will fill in.
+// Returns true if the special was successfully added, false otherwise.
+// (The add will fail only if a record with the same p and s.kind
+//  already exists.)
+func addspecial(p unsafe.Pointer, s *special) bool {
+	span := mHeap_LookupMaybe(&mheap_, p)
+	if span == nil {
+		gothrow("addspecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// The GC accesses the specials list without locks, and this is just much safer.
+	mp := acquirem()
+	mSpan_EnsureSwept(span)
+
+	offset := uintptr(p) - uintptr(span.start<<_PageShift)
+	kind := s.kind
+
+	lock(&span.speciallock)
+
+	// Find splice point, check for existing record.
+	t := &span.specials
+	for {
+		x := *t
+		if x == nil {
+			break
+		}
+		if offset == uintptr(x.offset) && kind == x.kind {
+			unlock(&span.speciallock)
+			releasem(mp)
+			return false // already exists
+		}
+		if offset < uintptr(x.offset) || (offset == uintptr(x.offset) && kind < x.kind) {
+			break
+		}
+		t = &x.next
+	}
+
+	// Splice in record, fill in offset.
+	s.offset = uint16(offset)
+	s.next = *t
+	*t = s
+	unlock(&span.speciallock)
+	releasem(mp)
+
+	return true
+}
+
+// Removes the Special record of the given kind for the object p.
+// Returns the record if the record existed, nil otherwise.
+// The caller must FixAlloc_Free the result.
+func removespecial(p unsafe.Pointer, kind uint8) *special {
+	span := mHeap_LookupMaybe(&mheap_, p)
+	if span == nil {
+		gothrow("removespecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// The GC accesses the specials list without locks, and this is just much safer.
+	mp := acquirem()
+	mSpan_EnsureSwept(span)
+
+	offset := uintptr(p) - uintptr(span.start<<_PageShift)
+
+	lock(&span.speciallock)
+	t := &span.specials
+	for {
+		s := *t
+		if s == nil {
+			break
+		}
+		// This function is used for finalizers only, so we don't check for
+		// "interior" specials (p must be exactly equal to s.offset).
+		if offset == uintptr(s.offset) && kind == s.kind {
+			*t = s.next
+			unlock(&span.speciallock)
+			releasem(mp)
+			return s
+		}
+		t = &s.next
+	}
+	unlock(&span.speciallock)
+	releasem(mp)
+	return nil
+}
+
+// Adds a finalizer to the object p.  Returns true if it succeeded.
+func addfinalizer(p unsafe.Pointer, f *funcval, nret uintptr, fint *_type, ot *ptrtype) bool {
+	lock(&mheap_.speciallock)
+	s := (*specialfinalizer)(fixAlloc_Alloc(&mheap_.specialfinalizeralloc))
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialFinalizer
+	s.fn = f
+	s.nret = nret
+	s.fint = fint
+	s.ot = ot
+	if addspecial(p, &s.special) {
+		return true
+	}
+
+	// There was an old finalizer
+	lock(&mheap_.speciallock)
+	fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(s))
+	unlock(&mheap_.speciallock)
+	return false
+}
+
+// Removes the finalizer (if any) from the object p.
+func removefinalizer(p unsafe.Pointer) {
+	s := (*specialfinalizer)(unsafe.Pointer(removespecial(p, _KindSpecialFinalizer)))
+	if s == nil {
+		return // there wasn't a finalizer to remove
+	}
+	lock(&mheap_.speciallock)
+	fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(s))
+	unlock(&mheap_.speciallock)
+}
+
+// Set the heap profile bucket associated with addr to b.
+func setprofilebucket(p unsafe.Pointer, b *bucket) {
+	lock(&mheap_.speciallock)
+	s := (*specialprofile)(fixAlloc_Alloc(&mheap_.specialprofilealloc))
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialProfile
+	s.b = b
+	if !addspecial(p, &s.special) {
+		gothrow("setprofilebucket: profile already set")
+	}
+}
+
+// Do whatever cleanup needs to be done to deallocate s.  It has
+// already been unlinked from the MSpan specials list.
+// Returns true if we should keep working on deallocating p.
+func freespecial(s *special, p unsafe.Pointer, size uintptr, freed bool) bool {
+	switch s.kind {
+	case _KindSpecialFinalizer:
+		sf := (*specialfinalizer)(unsafe.Pointer(s))
+		queuefinalizer(p, sf.fn, sf.nret, sf.fint, sf.ot)
+		lock(&mheap_.speciallock)
+		fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(sf))
+		unlock(&mheap_.speciallock)
+		return false // don't free p until finalizer is done
+	case _KindSpecialProfile:
+		sp := (*specialprofile)(unsafe.Pointer(s))
+		mProf_Free(sp.b, size, freed)
+		lock(&mheap_.speciallock)
+		fixAlloc_Free(&mheap_.specialprofilealloc, (unsafe.Pointer)(sp))
+		unlock(&mheap_.speciallock)
+		return true
+	default:
+		gothrow("bad special kind")
+		panic("not reached")
+	}
+}
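
The mSpanList_* helpers above implement a circular doubly-linked list whose head doubles as a sentinel, which is why an empty list is one whose next pointer points back at itself, and why freshly swept spans can be kept at the end of the busy lists with InsertBack. A small self-contained sketch of the same shape, with a hypothetical elem type in place of mspan:

package main

import "fmt"

// elem is a hypothetical stand-in for mspan's list links; the runtime
// embeds next/prev directly in mspan and uses the list head as a sentinel.
type elem struct {
	next, prev *elem
	id         int
}

func listInit(l *elem) { l.next = l; l.prev = l }

func listIsEmpty(l *elem) bool { return l.next == l }

// listInsert adds e at the front, as mSpanList_Insert does.
func listInsert(l, e *elem) {
	e.next = l.next
	e.prev = l
	e.next.prev = e
	e.prev.next = e
}

// listInsertBack adds e at the back, as mSpanList_InsertBack does.
func listInsertBack(l, e *elem) {
	e.next = l
	e.prev = l.prev
	e.next.prev = e
	e.prev.next = e
}

func listRemove(e *elem) {
	if e.next == nil && e.prev == nil {
		return
	}
	e.prev.next = e.next
	e.next.prev = e.prev
	e.next = nil
	e.prev = nil
}

func main() {
	var head elem
	listInit(&head)
	listInsert(&head, &elem{id: 1})
	listInsertBack(&head, &elem{id: 2})
	for e := head.next; e != &head; e = e.next {
		fmt.Println(e.id)
	}
	listRemove(head.next)
	fmt.Println(listIsEmpty(&head))
}
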
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index d409c6c..6ff3374 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -190,8 +190,6 @@
 	return b
 }
 
-func sysAlloc(n uintptr, stat *uint64) unsafe.Pointer
-
 func eqslice(x, y []uintptr) bool {
 	if len(x) != len(y) {
 		return false
@@ -246,16 +244,9 @@
 	// This reduces potential contention and chances of deadlocks.
 	// Since the object must be alive during call to mProf_Malloc,
 	// it's fine to do this non-atomically.
-	setprofilebucket(p, b)
-}
-
-func setprofilebucket_m() // mheap.c
-
-func setprofilebucket(p unsafe.Pointer, b *bucket) {
-	g := getg()
-	g.m.ptrarg[0] = p
-	g.m.ptrarg[1] = unsafe.Pointer(b)
-	onM(setprofilebucket_m)
+	onM(func() {
+		setprofilebucket(p, b)
+	})
 }
 
 // Called when freeing a profiled block.
@@ -519,8 +510,6 @@
 	return
 }
 
-var allgs []*g // proc.c
-
 // GoroutineProfile returns n, the number of records in the active goroutine stack profile.
 // If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
 // If len(p) < n, GoroutineProfile does not change p and returns n, false.
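
The mprof.go hunk above replaces the old ptrarg-based handoff (stash pointers in g.m.ptrarg, then call a fixed no-argument entry point on the M stack) with a closure passed to onM that simply captures p and b. A rough standalone sketch of the two styles; runOnSystemStack and the variable names are made up for illustration and are not runtime APIs:

package main

import "fmt"

// runOnSystemStack is a hypothetical stand-in for the runtime's onM:
// it only accepts a no-argument function, so callers must smuggle
// arguments in somehow.
func runOnSystemStack(fn func()) { fn() }

// Old style (as in the deleted C path): stash arguments in shared
// slots, then call a fixed no-argument entry point that reads them back.
var ptrarg [2]interface{}

func setBucketM() {
	p, b := ptrarg[0], ptrarg[1]
	ptrarg[0], ptrarg[1] = nil, nil
	fmt.Println("old style:", p, b)
}

// New style (as in the Go version): let a closure capture the
// arguments directly, which is both shorter and type-safe.
func setBucket(p string, b int) {
	fmt.Println("new style:", p, b)
}

func main() {
	ptrarg[0], ptrarg[1] = "addr", 42
	runOnSystemStack(setBucketM)

	p, b := "addr", 42
	runOnSystemStack(func() { setBucket(p, b) })
}
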
diff --git a/src/runtime/msize.c b/src/runtime/msize.c
deleted file mode 100644
index 7cb65da..0000000
--- a/src/runtime/msize.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Malloc small size classes.
-//
-// See malloc.h for overview.
-//
-// The size classes are chosen so that rounding an allocation
-// request up to the next size class wastes at most 12.5% (1.125x).
-//
-// Each size class has its own page count that gets allocated
-// and chopped up when new objects of the size class are needed.
-// That page count is chosen so that chopping up the run of
-// pages into objects of the given size wastes at most 12.5% (1.125x)
-// of the memory.  It is not necessary that the cutoff here be
-// the same as above.
-//
-// The two sources of waste multiply, so the worst possible case
-// for the above constraints would be that allocations of some
-// size might have a 26.6% (1.266x) overhead.
-// In practice, only one of the wastes comes into play for a
-// given size (sizes < 512 waste mainly on the round-up,
-// sizes > 512 waste mainly on the page chopping).
-//
-// TODO(rsc): Compute max waste for any given size.
-
-#include "runtime.h"
-#include "arch_GOARCH.h"
-#include "malloc.h"
-#include "textflag.h"
-
-#pragma dataflag NOPTR
-int32 runtime·class_to_size[NumSizeClasses];
-#pragma dataflag NOPTR
-int32 runtime·class_to_allocnpages[NumSizeClasses];
-
-// The SizeToClass lookup is implemented using two arrays,
-// one mapping sizes <= 1024 to their class and one mapping
-// sizes >= 1024 and <= MaxSmallSize to their class.
-// All objects are 8-aligned, so the first array is indexed by
-// the size divided by 8 (rounded up).  Objects >= 1024 bytes
-// are 128-aligned, so the second array is indexed by the
-// size divided by 128 (rounded up).  The arrays are filled in
-// by InitSizes.
-
-#pragma dataflag NOPTR
-int8 runtime·size_to_class8[1024/8 + 1];
-#pragma dataflag NOPTR
-int8 runtime·size_to_class128[(MaxSmallSize-1024)/128 + 1];
-
-void runtime·testdefersizes(void);
-
-int32
-runtime·SizeToClass(int32 size)
-{
-	if(size > MaxSmallSize)
-		runtime·throw("SizeToClass - invalid size");
-	if(size > 1024-8)
-		return runtime·size_to_class128[(size-1024+127) >> 7];
-	return runtime·size_to_class8[(size+7)>>3];
-}
-
-void
-runtime·InitSizes(void)
-{
-	int32 align, sizeclass, size, nextsize, n;
-	uint32 i;
-	uintptr allocsize, npages;
-
-	// Initialize the runtime·class_to_size table (and choose class sizes in the process).
-	runtime·class_to_size[0] = 0;
-	sizeclass = 1;	// 0 means no class
-	align = 8;
-	for(size = align; size <= MaxSmallSize; size += align) {
-		if((size&(size-1)) == 0) {	// bump alignment once in a while
-			if(size >= 2048)
-				align = 256;
-			else if(size >= 128)
-				align = size / 8;
-			else if(size >= 16)
-				align = 16;	// required for x86 SSE instructions, if we want to use them
-		}
-		if((align&(align-1)) != 0)
-			runtime·throw("InitSizes - bug");
-
-		// Make the allocnpages big enough that
-		// the leftover is less than 1/8 of the total,
-		// so wasted space is at most 12.5%.
-		allocsize = PageSize;
-		while(allocsize%size > allocsize/8)
-			allocsize += PageSize;
-		npages = allocsize >> PageShift;
-
-		// If the previous sizeclass chose the same
-		// allocation size and fit the same number of
-		// objects into the page, we might as well
-		// use just this size instead of having two
-		// different sizes.
-		if(sizeclass > 1 &&
-			npages == runtime·class_to_allocnpages[sizeclass-1] &&
-			allocsize/size == allocsize/runtime·class_to_size[sizeclass-1]) {
-			runtime·class_to_size[sizeclass-1] = size;
-			continue;
-		}
-
-		runtime·class_to_allocnpages[sizeclass] = npages;
-		runtime·class_to_size[sizeclass] = size;
-		sizeclass++;
-	}
-	if(sizeclass != NumSizeClasses) {
-		runtime·printf("sizeclass=%d NumSizeClasses=%d\n", sizeclass, NumSizeClasses);
-		runtime·throw("InitSizes - bad NumSizeClasses");
-	}
-
-	// Initialize the size_to_class tables.
-	nextsize = 0;
-	for (sizeclass = 1; sizeclass < NumSizeClasses; sizeclass++) {
-		for(; nextsize < 1024 && nextsize <= runtime·class_to_size[sizeclass]; nextsize+=8)
-			runtime·size_to_class8[nextsize/8] = sizeclass;
-		if(nextsize >= 1024)
-			for(; nextsize <= runtime·class_to_size[sizeclass]; nextsize += 128)
-				runtime·size_to_class128[(nextsize-1024)/128] = sizeclass;
-	}
-
-	// Double-check SizeToClass.
-	if(0) {
-		for(n=0; n < MaxSmallSize; n++) {
-			sizeclass = runtime·SizeToClass(n);
-			if(sizeclass < 1 || sizeclass >= NumSizeClasses || runtime·class_to_size[sizeclass] < n) {
-				runtime·printf("size=%d sizeclass=%d runtime·class_to_size=%d\n", n, sizeclass, runtime·class_to_size[sizeclass]);
-				runtime·printf("incorrect SizeToClass");
-				goto dump;
-			}
-			if(sizeclass > 1 && runtime·class_to_size[sizeclass-1] >= n) {
-				runtime·printf("size=%d sizeclass=%d runtime·class_to_size=%d\n", n, sizeclass, runtime·class_to_size[sizeclass]);
-				runtime·printf("SizeToClass too big");
-				goto dump;
-			}
-		}
-	}
-
-	runtime·testdefersizes();
-
-	// Copy out for statistics table.
-	for(i=0; i<nelem(runtime·class_to_size); i++)
-		mstats.by_size[i].size = runtime·class_to_size[i];
-	return;
-
-dump:
-	if(1){
-		runtime·printf("NumSizeClasses=%d\n", NumSizeClasses);
-		runtime·printf("runtime·class_to_size:");
-		for(sizeclass=0; sizeclass<NumSizeClasses; sizeclass++)
-			runtime·printf(" %d", runtime·class_to_size[sizeclass]);
-		runtime·printf("\n\n");
-		runtime·printf("size_to_class8:");
-		for(i=0; i<nelem(runtime·size_to_class8); i++)
-			runtime·printf(" %d=>%d(%d)\n", i*8, runtime·size_to_class8[i],
-				runtime·class_to_size[runtime·size_to_class8[i]]);
-		runtime·printf("\n");
-		runtime·printf("size_to_class128:");
-		for(i=0; i<nelem(runtime·size_to_class128); i++)
-			runtime·printf(" %d=>%d(%d)\n", i*128, runtime·size_to_class128[i],
-				runtime·class_to_size[runtime·size_to_class128[i]]);
-		runtime·printf("\n");
-	}
-	runtime·throw("InitSizes failed");
-}
-
-// Returns size of the memory block that mallocgc will allocate if you ask for the size.
-uintptr
-runtime·roundupsize(uintptr size)
-{
-	if(size < MaxSmallSize) {
-		if(size <= 1024-8)
-			return runtime·class_to_size[runtime·size_to_class8[(size+7)>>3]];
-		else
-			return runtime·class_to_size[runtime·size_to_class128[(size-1024+127) >> 7]];
-	}
-	if(size + PageSize < size)
-		return size;
-	return ROUND(size, PageSize);
-}
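
As the comments in msize describe, SizeToClass is a pair of table lookups: sizes up to 1024 are resolved through an array indexed by size/8 (rounded up) and larger small sizes through an array indexed by (size-1024)/128 (rounded up). A toy version with made-up class sizes, only to show how the tables are filled and consulted; these are not the real runtime tables:

package main

import "fmt"

// Made-up class sizes for illustration: multiples of 8 below 1024 and
// multiples of 128 at or above 1024, as the real table guarantees.
var classes = []int{0, 8, 16, 32, 48, 64, 128, 256, 512, 1024, 2048, 4096, 8192}

const maxSmallSize = 8192

var (
	sizeToClass8   [1024/8 + 1]int8
	sizeToClass128 [(maxSmallSize-1024)/128 + 1]int8
)

// initTables fills both lookup tables, walking the candidate sizes in
// steps of 8 below 1024 and steps of 128 above, like InitSizes.
func initTables() {
	next := 0
	for class := 1; class < len(classes); class++ {
		for ; next < 1024 && next <= classes[class]; next += 8 {
			sizeToClass8[next/8] = int8(class)
		}
		if next >= 1024 {
			for ; next <= classes[class]; next += 128 {
				sizeToClass128[(next-1024)/128] = int8(class)
			}
		}
	}
}

func sizeToClass(size int) int {
	if size > 1024-8 {
		return int(sizeToClass128[(size-1024+127)>>7])
	}
	return int(sizeToClass8[(size+7)>>3])
}

func main() {
	initTables()
	for _, size := range []int{1, 8, 9, 33, 1000, 1025, 5000} {
		c := sizeToClass(size)
		fmt.Printf("size %4d -> class %2d (rounded to %d bytes)\n", size, c, classes[c])
	}
}
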
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
new file mode 100644
index 0000000..aa2b43e
--- /dev/null
+++ b/src/runtime/msize.go
@@ -0,0 +1,174 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Malloc small size classes.
+//
+// See malloc.h for overview.
+//
+// The size classes are chosen so that rounding an allocation
+// request up to the next size class wastes at most 12.5% (1.125x).
+//
+// Each size class has its own page count that gets allocated
+// and chopped up when new objects of the size class are needed.
+// That page count is chosen so that chopping up the run of
+// pages into objects of the given size wastes at most 12.5% (1.125x)
+// of the memory.  It is not necessary that the cutoff here be
+// the same as above.
+//
+// The two sources of waste multiply, so the worst possible case
+// for the above constraints would be that allocations of some
+// size might have a 26.6% (1.266x) overhead.
+// In practice, only one of the wastes comes into play for a
+// given size (sizes < 512 waste mainly on the round-up,
+// sizes > 512 waste mainly on the page chopping).
+//
+// TODO(rsc): Compute max waste for any given size.
+
+package runtime
+
+//var class_to_size [_NumSizeClasses]int32
+//var class_to_allocnpages [_NumSizeClasses]int32
+
+// The SizeToClass lookup is implemented using two arrays,
+// one mapping sizes <= 1024 to their class and one mapping
+// sizes >= 1024 and <= MaxSmallSize to their class.
+// All objects are 8-aligned, so the first array is indexed by
+// the size divided by 8 (rounded up).  Objects >= 1024 bytes
+// are 128-aligned, so the second array is indexed by the
+// size divided by 128 (rounded up).  The arrays are filled in
+// by InitSizes.
+//var size_to_class8 [1024/8 + 1]int8
+//var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
+
+func sizeToClass(size int32) int32 {
+	if size > _MaxSmallSize {
+		gothrow("SizeToClass - invalid size")
+	}
+	if size > 1024-8 {
+		return int32(size_to_class128[(size-1024+127)>>7])
+	}
+	return int32(size_to_class8[(size+7)>>3])
+}
+
+func initSizes() {
+	// Initialize the runtime·class_to_size table (and choose class sizes in the process).
+	class_to_size[0] = 0
+	sizeclass := 1 // 0 means no class
+	align := 8
+	for size := align; size <= _MaxSmallSize; size += align {
+		if size&(size-1) == 0 { // bump alignment once in a while
+			if size >= 2048 {
+				align = 256
+			} else if size >= 128 {
+				align = size / 8
+			} else if size >= 16 {
+				align = 16 // required for x86 SSE instructions, if we want to use them
+			}
+		}
+		if align&(align-1) != 0 {
+			gothrow("InitSizes - bug")
+		}
+
+		// Make the allocnpages big enough that
+		// the leftover is less than 1/8 of the total,
+		// so wasted space is at most 12.5%.
+		allocsize := _PageSize
+		for allocsize%size > allocsize/8 {
+			allocsize += _PageSize
+		}
+		npages := allocsize >> _PageShift
+
+		// If the previous sizeclass chose the same
+		// allocation size and fit the same number of
+		// objects into the page, we might as well
+		// use just this size instead of having two
+		// different sizes.
+		if sizeclass > 1 && npages == int(class_to_allocnpages[sizeclass-1]) && allocsize/size == allocsize/int(class_to_size[sizeclass-1]) {
+			class_to_size[sizeclass-1] = int32(size)
+			continue
+		}
+
+		class_to_allocnpages[sizeclass] = int32(npages)
+		class_to_size[sizeclass] = int32(size)
+		sizeclass++
+	}
+	if sizeclass != _NumSizeClasses {
+		print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
+		gothrow("InitSizes - bad NumSizeClasses")
+	}
+
+	// Initialize the size_to_class tables.
+	nextsize := 0
+	for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ {
+		for ; nextsize < 1024 && nextsize <= int(class_to_size[sizeclass]); nextsize += 8 {
+			size_to_class8[nextsize/8] = int8(sizeclass)
+		}
+		if nextsize >= 1024 {
+			for ; nextsize <= int(class_to_size[sizeclass]); nextsize += 128 {
+				size_to_class128[(nextsize-1024)/128] = int8(sizeclass)
+			}
+		}
+	}
+
+	// Double-check SizeToClass.
+	if false {
+		for n := int32(0); n < _MaxSmallSize; n++ {
+			sizeclass := sizeToClass(n)
+			if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n {
+				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+				print("incorrect SizeToClass\n")
+				goto dump
+			}
+			if sizeclass > 1 && class_to_size[sizeclass-1] >= n {
+				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+				print("SizeToClass too big\n")
+				goto dump
+			}
+		}
+	}
+
+	testdefersizes()
+
+	// Copy out for statistics table.
+	for i := 0; i < len(class_to_size); i++ {
+		memstats.by_size[i].size = uint32(class_to_size[i])
+	}
+	return
+
+dump:
+	if true {
+		print("NumSizeClasses=", _NumSizeClasses, "\n")
+		print("runtime·class_to_size:")
+		for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ {
+			print(" ", class_to_size[sizeclass], "")
+		}
+		print("\n\n")
+		print("size_to_class8:")
+		for i := 0; i < len(size_to_class8); i++ {
+			print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n")
+		}
+		print("\n")
+		print("size_to_class128:")
+		for i := 0; i < len(size_to_class128); i++ {
+			print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n")
+		}
+		print("\n")
+	}
+	gothrow("InitSizes failed")
+}
+
+// Returns size of the memory block that mallocgc will allocate if you ask for the size.
+func roundupsize(size uintptr) uintptr {
+	if size < _MaxSmallSize {
+		if size <= 1024-8 {
+			return uintptr(class_to_size[size_to_class8[(size+7)>>3]])
+		} else {
+			return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]])
+		}
+	}
+	if size+_PageSize < size {
+		return size
+	}
+	return round(size, _PageSize)
+}
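
roundupsize ends with an overflow guard (if size+_PageSize < size) before rounding the request up to a page multiple. A small sketch of that guard together with a power-of-two round-up of the same shape as the runtime's round helper; the names here are illustrative:

package main

import "fmt"

// roundUp rounds n up to a multiple of a, where a is a power of two.
func roundUp(n, a uintptr) uintptr {
	return (n + a - 1) &^ (a - 1)
}

// roundUpChecked mirrors the overflow guard in roundupsize: if adding
// the alignment would wrap around, hand the size back unchanged and
// let the caller's out-of-memory path reject it.
func roundUpChecked(n, a uintptr) uintptr {
	if n+a < n {
		return n
	}
	return roundUp(n, a)
}

func main() {
	const pageSize = 8192
	fmt.Println(roundUpChecked(1, pageSize))           // 8192
	fmt.Println(roundUpChecked(8192, pageSize))        // 8192
	fmt.Println(roundUpChecked(8193, pageSize))        // 16384
	fmt.Println(roundUpChecked(^uintptr(0), pageSize)) // unchanged: would overflow
}
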
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 171087d..93cea5c 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -22,11 +22,11 @@
 	// but since the cap is only being supplied implicitly, saying len is clearer.
 	// See issue 4085.
 	len := int(len64)
-	if len64 < 0 || int64(len) != len64 || t.elem.size > 0 && uintptr(len) > maxmem/uintptr(t.elem.size) {
+	if len64 < 0 || int64(len) != len64 || t.elem.size > 0 && uintptr(len) > _MaxMem/uintptr(t.elem.size) {
 		panic(errorString("makeslice: len out of range"))
 	}
 	cap := int(cap64)
-	if cap < len || int64(cap) != cap64 || t.elem.size > 0 && uintptr(cap) > maxmem/uintptr(t.elem.size) {
+	if cap < len || int64(cap) != cap64 || t.elem.size > 0 && uintptr(cap) > _MaxMem/uintptr(t.elem.size) {
 		panic(errorString("makeslice: cap out of range"))
 	}
 	p := newarray(t.elem, uintptr(cap))
@@ -42,7 +42,7 @@
 	cap64 := int64(old.cap) + n
 	cap := int(cap64)
 
-	if int64(cap) != cap64 || cap < old.cap || t.elem.size > 0 && uintptr(cap) > maxmem/uintptr(t.elem.size) {
+	if int64(cap) != cap64 || cap < old.cap || t.elem.size > 0 && uintptr(cap) > _MaxMem/uintptr(t.elem.size) {
 		panic(errorString("growslice: cap out of range"))
 	}
 
@@ -72,7 +72,7 @@
 		}
 	}
 
-	if uintptr(newcap) >= maxmem/uintptr(et.size) {
+	if uintptr(newcap) >= _MaxMem/uintptr(et.size) {
 		panic(errorString("growslice: cap out of range"))
 	}
 	lenmem := uintptr(old.len) * uintptr(et.size)
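
The slice.go changes keep the existing overflow discipline: rather than multiplying len or cap by the element size (which could wrap), the code compares against _MaxMem divided by the element size. A standalone sketch of that check; maxMem and fitsInMem are illustrative names, not runtime identifiers:

package main

import "fmt"

// maxMem is a hypothetical cap standing in for the runtime's _MaxMem;
// the point is the shape of the check, not the value.
const maxMem = 1 << 38

// fitsInMem reports whether n elements of elemSize bytes stay under
// the cap. Dividing the cap by elemSize, as makeslice and growslice
// do, avoids ever computing the product n*elemSize, which could wrap.
func fitsInMem(n, elemSize uintptr) bool {
	if elemSize == 0 {
		return true
	}
	return n <= maxMem/elemSize
}

func main() {
	fmt.Println(fitsInMem(10, 8))           // true
	fmt.Println(fitsInMem(maxMem/8+1, 8))   // false
	fmt.Println(fitsInMem(^uintptr(0), 16)) // false, even though n*16 would wrap
}
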
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 0809f89..0845c94 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -225,7 +225,7 @@
 
 // rawruneslice allocates a new rune slice. The rune slice is not zeroed.
 func rawruneslice(size int) (b []rune) {
-	if uintptr(size) > maxmem/4 {
+	if uintptr(size) > _MaxMem/4 {
 		gothrow("out of memory")
 	}
 	mem := goroundupsize(uintptr(size) * 4)
@@ -255,9 +255,6 @@
 	return s
 }
 
-//go:noescape
-func findnull(*byte) int
-
 func gostring(p *byte) string {
 	l := findnull(p)
 	if l == 0 {
@@ -296,3 +293,12 @@
 func hasprefix(s, t string) bool {
 	return len(s) >= len(t) && s[:len(t)] == t
 }
+
+func goatoi(s string) int {
+	n := 0
+	for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+		n = n*10 + int(s[0]) - '0'
+		s = s[1:]
+	}
+	return n
+}
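
goatoi parses only a leading run of ASCII digits and ignores sign, overflow, and anything after the first non-digit. A quick standalone check of that behavior, with the helper reimplemented locally since goatoi is unexported:

package main

import "fmt"

// atoiPrefix mirrors the behavior of the new goatoi helper: it parses
// a leading run of ASCII digits and stops at the first non-digit,
// returning 0 for strings with no leading digits.
func atoiPrefix(s string) int {
	n := 0
	for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
		n = n*10 + int(s[0]) - '0'
		s = s[1:]
	}
	return n
}

func main() {
	for _, s := range []string{"42", "17MB", "", "abc", "007"} {
		fmt.Printf("%q -> %d\n", s, atoiPrefix(s))
	}
}
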