runtime: faster allocator, garbage collector

GC is still single-threaded.
Support for multiple GC threads will come in another CL.

Garbage collection pauses are typically
about half as long as they were before this CL.

R=brainman, iant, r
CC=golang-dev
https://golang.org/cl/3975046
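
The heart of the change: the per-object reference-count words are replaced
by a heap bitmap with four bits per word of arena memory, kept just below
arena_start.  As a minimal sketch of the lookup arithmetic (it mirrors the
comment in mgc0.c below; an illustration, not a drop-in function):

	uintptr off, *b, shift, bits;

	off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;  // word offset into the arena
	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;  // bitmap grows downward
	shift = off % wordsPerBitmapWord;
	bits = *b >> shift;
	// then test bits & bitAllocated, bits & bitMarked, etc.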
diff --git a/src/pkg/runtime/debug.go b/src/pkg/runtime/debug.go
index d09db1b..5117e1a 100644
--- a/src/pkg/runtime/debug.go
+++ b/src/pkg/runtime/debug.go
@@ -69,7 +69,8 @@
 
 	// Per-size allocation statistics.
 	// Not locked during update; approximate.
-	BySize [67]struct {
+	// 61 is NumSizeClasses in the C code.
+	BySize [61]struct {
 		Size    uint32
 		Mallocs uint64
 		Frees   uint64
diff --git a/src/pkg/runtime/iface.c b/src/pkg/runtime/iface.c
index aa36df68..3dec45e 100644
--- a/src/pkg/runtime/iface.c
+++ b/src/pkg/runtime/iface.c
@@ -702,7 +702,7 @@
 	t = (Type*)((Eface*)typ.data-1);
 
 	if(t->kind&KindNoPointers)
-		ret = runtime·mallocgc(t->size, RefNoPointers, 1, 1);
+		ret = runtime·mallocgc(t->size, FlagNoPointers, 1, 1);
 	else
 		ret = runtime·mal(t->size);
 	FLUSH(&ret);
@@ -722,7 +722,7 @@
 	
 	size = n*t->size;
 	if(t->kind&KindNoPointers)
-		ret = runtime·mallocgc(size, RefNoPointers, 1, 1);
+		ret = runtime·mallocgc(size, FlagNoPointers, 1, 1);
 	else
 		ret = runtime·mal(size);
 	FLUSH(&ret);
diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index cc28b94..8899b01 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -36,14 +36,13 @@
 // Small objects are allocated from the per-thread cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
 void*
-runtime·mallocgc(uintptr size, uint32 refflag, int32 dogc, int32 zeroed)
+runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed)
 {
 	int32 sizeclass, rate;
 	MCache *c;
 	uintptr npages;
 	MSpan *s;
 	void *v;
-	uint32 *ref;
 
 	if(runtime·gcwaiting && g != m->g0 && m->locks == 0)
 		runtime·gosched();
@@ -65,12 +64,6 @@
 		mstats.alloc += size;
 		mstats.total_alloc += size;
 		mstats.by_size[sizeclass].nmalloc++;
-
-		if(!runtime·mlookup(v, nil, nil, nil, &ref)) {
-			runtime·printf("malloc %D; runtime·mlookup failed\n", (uint64)size);
-			runtime·throw("malloc runtime·mlookup");
-		}
-		*ref = RefNone | refflag;
 	} else {
 		// TODO(rsc): Report tracebacks for very large allocations.
 
@@ -87,13 +80,14 @@
 		v = (void*)(s->start << PageShift);
 
 		// setup for mark sweep
-		s->gcref0 = RefNone | refflag;
-		ref = &s->gcref0;
+		runtime·markspan(v, 0, 0, true);
 	}
+	if(!(flag & FlagNoGC))
+		runtime·markallocated(v, size, (flag&FlagNoPointers) != 0);
 
 	m->mallocing = 0;
 
-	if(!(refflag & RefNoProfiling) && (rate = runtime·MemProfileRate) > 0) {
+	if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) {
 		if(size >= rate)
 			goto profile;
 		if(m->mcache->next_sample > size)
@@ -104,7 +98,7 @@
 				rate = 0x3fffffff;
 			m->mcache->next_sample = fastrand1() % (2*rate);
 		profile:
-			*ref |= RefProfiled;
+			runtime·setblockspecial(v);
 			runtime·MProf_Malloc(v, size);
 		}
 	}
@@ -124,33 +118,35 @@
 void
 runtime·free(void *v)
 {
-	int32 sizeclass, size;
+	int32 sizeclass;
 	MSpan *s;
 	MCache *c;
-	uint32 prof, *ref;
+	uint32 prof;
+	uintptr size;
 
 	if(v == nil)
 		return;
+	
+	// If you change this also change mgc0.c:/^sweepspan,
+	// which has a copy of the guts of free.
 
 	if(m->mallocing)
 		runtime·throw("malloc/free - deadlock");
 	m->mallocing = 1;
 
-	if(!runtime·mlookup(v, nil, nil, &s, &ref)) {
+	if(!runtime·mlookup(v, nil, nil, &s)) {
 		runtime·printf("free %p: not an allocated block\n", v);
 		runtime·throw("free runtime·mlookup");
 	}
-	prof = *ref & RefProfiled;
-	*ref = RefFree;
+	prof = runtime·blockspecial(v);
 
 	// Find size class for v.
 	sizeclass = s->sizeclass;
 	if(sizeclass == 0) {
 		// Large object.
-		if(prof)
-			runtime·MProf_Free(v, s->npages<<PageShift);
-		mstats.alloc -= s->npages<<PageShift;
-		runtime·memclr(v, s->npages<<PageShift);
+		size = s->npages<<PageShift;
+		*(uintptr*)(s->start<<PageShift) = 1;	// mark as "needs to be zeroed"
+		runtime·unmarkspan(v, 1<<PageShift);
 		runtime·MHeap_Free(&runtime·mheap, s, 1);
 	} else {
 		// Small object.
@@ -158,19 +154,20 @@
 		size = runtime·class_to_size[sizeclass];
 		if(size > sizeof(uintptr))
 			((uintptr*)v)[1] = 1;	// mark as "needs to be zeroed"
-		if(prof)
-			runtime·MProf_Free(v, size);
-		mstats.alloc -= size;
 		mstats.by_size[sizeclass].nfree++;
 		runtime·MCache_Free(c, v, sizeclass, size);
 	}
+	runtime·markfreed(v, size);
+	mstats.alloc -= size;
+	if(prof)
+		runtime·MProf_Free(v, size);
 	m->mallocing = 0;
 }
 
 int32
-runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp, uint32 **ref)
+runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
 {
-	uintptr n, nobj, i;
+	uintptr n, i;
 	byte *p;
 	MSpan *s;
 
@@ -179,12 +176,11 @@
 	if(sp)
 		*sp = s;
 	if(s == nil) {
+		runtime·checkfreed(v, 1);
 		if(base)
 			*base = nil;
 		if(size)
 			*size = 0;
-		if(ref)
-			*ref = 0;
 		return 0;
 	}
 
@@ -195,14 +191,11 @@
 			*base = p;
 		if(size)
 			*size = s->npages<<PageShift;
-		if(ref)
-			*ref = &s->gcref0;
 		return 1;
 	}
 
-	if((byte*)v >= (byte*)s->gcref) {
-		// pointers into the gc ref counts
-		// do not count as pointers.
+	if((byte*)v >= (byte*)s->limit) {
+		// pointers past the last block do not count as pointers.
 		return 0;
 	}
 
@@ -213,21 +206,6 @@
 	if(size)
 		*size = n;
 
-	// good for error checking, but expensive
-	if(0) {
-		nobj = (s->npages << PageShift) / (n + RefcountOverhead);
-		if((byte*)s->gcref < p || (byte*)(s->gcref+nobj) > p+(s->npages<<PageShift)) {
-			runtime·printf("odd span state=%d span=%p base=%p sizeclass=%d n=%D size=%D npages=%D\n",
-				s->state, s, p, s->sizeclass, (uint64)nobj, (uint64)n, (uint64)s->npages);
-			runtime·printf("s->base sizeclass %d v=%p base=%p gcref=%p blocksize=%D nobj=%D size=%D end=%p end=%p\n",
-				s->sizeclass, v, p, s->gcref, (uint64)s->npages<<PageShift,
-				(uint64)nobj, (uint64)n, s->gcref + nobj, p+(s->npages<<PageShift));
-			runtime·throw("bad gcref");
-		}
-	}
-	if(ref)
-		*ref = &s->gcref[i];
-
 	return 1;
 }
 
@@ -246,14 +224,20 @@
 
 int32 runtime·sizeof_C_MStats = sizeof(MStats);
 
+#define MaxArena32 (2U<<30)
+
 void
 runtime·mallocinit(void)
 {
 	byte *p;
-	uintptr arena_size;
+	uintptr arena_size, bitmap_size;
+	extern byte end[];
 
 	runtime·InitSizes();
 
+	// Set up the allocation arena, a contiguous area of memory where
+	// allocated data will be found.  The arena begins with a bitmap large
+	// enough to hold 4 bits per allocated word.
 	if(sizeof(void*) == 8) {
 		// On a 64-bit machine, allocate from a single contiguous reservation.
 		// 16 GB should be big enough for now.
@@ -273,19 +257,44 @@
 		// odds of the conservative garbage collector not collecting memory
 		// because some non-pointer block of memory had a bit pattern
 		// that matched a memory address.
+		//
+		// Actually we reserve 17 GB (because the bitmap ends up being 1 GB)
+		// but it hardly matters: 0xfc is not valid UTF-8 either, and we have to
+		// allocate 15 GB before we get that far.
 		arena_size = 16LL<<30;
-		p = runtime·SysReserve((void*)(0x00f8ULL<<32), arena_size);
+		bitmap_size = arena_size / (sizeof(void*)*8/4);
+		p = runtime·SysReserve((void*)(0x00f8ULL<<32), bitmap_size + arena_size);
 		if(p == nil)
 			runtime·throw("runtime: cannot reserve arena virtual address space");
-		runtime·mheap.arena_start = p;
-		runtime·mheap.arena_used = p;
-		runtime·mheap.arena_end = p + arena_size;
 	} else {
-		// On a 32-bit machine, we'll take what we can get for each allocation
-		// and maintain arena_start and arena_end as min, max we've seen.
-		runtime·mheap.arena_start = (byte*)0xffffffff;
-		runtime·mheap.arena_end = 0;
+		// On a 32-bit machine, we can't typically get away
+		// with a giant virtual address space reservation.
+		// Instead we map the memory information bitmap
+		// immediately after the data segment, large enough
+		// to handle another 2GB of mappings (256 MB),
+		// along with a reservation for another 512 MB of memory.
+		// When that gets used up, we'll start asking the kernel
+		// for any memory anywhere and hope it's in the 2GB
+		// following the bitmap (presumably the executable begins
+		// near the bottom of memory, so we'll have to use up
+		// most of memory before the kernel resorts to giving out
+		// memory before the beginning of the text segment).
+		//
+		// Alternatively we could reserve 512 MB bitmap, enough
+		// for 4GB of mappings, and then accept any memory the
+		// kernel threw at us, but normally that's a waste of 512 MB
+		// of address space, which is probably too much in a 32-bit world.
+		bitmap_size = MaxArena32 / (sizeof(void*)*8/4);
+		arena_size = 512<<20;
+
+		p = (void*)(((uintptr)end + 64*1024 - 1) & ~(64*1024-1));
+		if(runtime·SysReserve(p, bitmap_size + arena_size) != p)
+			runtime·throw("runtime: cannot reserve memory bitmap virtual address space");
 	}
+	runtime·mheap.bitmap = p;
+	runtime·mheap.arena_start = p + bitmap_size;
+	runtime·mheap.arena_used = runtime·mheap.arena_start;
+	runtime·mheap.arena_end = runtime·mheap.arena_start + arena_size;
 
 	// Initialize the rest of the allocator.	
 	runtime·MHeap_Init(&runtime·mheap, runtime·SysAlloc);
@@ -299,26 +308,41 @@
 runtime·MHeap_SysAlloc(MHeap *h, uintptr n)
 {
 	byte *p;
-	
-	if(sizeof(void*) == 8) {
+
+	if(n <= h->arena_end - h->arena_used) {
 		// Keep taking from our reservation.
-		if(h->arena_end - h->arena_used < n)
-			return nil;
 		p = h->arena_used;
 		runtime·SysMap(p, n);
 		h->arena_used += n;
+		runtime·MHeap_MapBits(h);
 		return p;
-	} else {
-		// Take what we can get from the OS.
-		p = runtime·SysAlloc(n);
-		if(p == nil)
-			return nil;
-		if(p+n > h->arena_used)
-			h->arena_used = p+n;
-		if(p > h->arena_end)
-			h->arena_end = p;
-		return p;		
 	}
+	
+	// On 64-bit, our reservation is all we have.
+	if(sizeof(void*) == 8)
+		return nil;
+
+	// On 32-bit, once the reservation is gone we can
+	// try to get memory at a location chosen by the OS
+	// and hope that it is in the range we allocated bitmap for.
+	p = runtime·SysAlloc(n);
+	if(p == nil)
+		return nil;
+
+	if(p < h->arena_start || p+n - h->arena_start >= MaxArena32) {
+		runtime·printf("runtime: memory allocated by OS not in usable range");
+		runtime·SysFree(p, n);
+		return nil;
+	}
+
+	if(p+n > h->arena_used) {
+		h->arena_used = p+n;
+		if(h->arena_used > h->arena_end)
+			h->arena_end = h->arena_used;
+		runtime·MHeap_MapBits(h);
+	}
+	
+	return p;
 }
 
 // Runtime stubs.
@@ -353,7 +377,6 @@
 runtime·stackalloc(uint32 n)
 {
 	void *v;
-	uint32 *ref;
 
 	if(m->mallocing || m->gcing || n == FixedStack) {
 		runtime·lock(&stacks);
@@ -369,11 +392,7 @@
 		runtime·unlock(&stacks);
 		return v;
 	}
-	v = runtime·mallocgc(n, RefNoProfiling, 0, 0);
-	if(!runtime·mlookup(v, nil, nil, nil, &ref))
-		runtime·throw("stackalloc runtime·mlookup");
-	*ref = RefStack;
-	return v;
+	return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
 }
 
 void
@@ -399,7 +418,7 @@
 }
 
 func Lookup(p *byte) (base *byte, size uintptr) {
-	runtime·mlookup(p, &base, &size, nil, nil);
+	runtime·mlookup(p, &base, &size, nil);
 }
 
 func GC() {
@@ -422,7 +441,7 @@
 		runtime·printf("runtime.SetFinalizer: first argument is %S, not pointer\n", *obj.type->string);
 		goto throw;
 	}
-	if(!runtime·mlookup(obj.data, &base, &size, nil, nil) || obj.data != base) {
+	if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) {
 		runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
 		goto throw;
 	}
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index e2472e8..4e27945 100644
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@@ -97,8 +97,14 @@
 
 enum
 {
+	// Computed constant.  The definition of MaxSmallSize and the
+	// algorithm in msize.c produce some number of different allocation
+	// size classes.  NumSizeClasses is that number.  It's needed here
+	// because there are static arrays of this length; when msize runs its
+	// size choosing algorithm it double-checks that NumSizeClasses agrees.
+	NumSizeClasses = 61,
+
 	// Tunable constants.
-	NumSizeClasses = 67,		// Number of size classes (must match msize.c)
 	MaxSmallSize = 32<<10,
 
 	FixAllocChunk = 128<<10,	// Chunk size for FixAlloc
@@ -290,10 +296,7 @@
 	uint32	ref;		// number of allocated objects in this span
 	uint32	sizeclass;	// size class
 	uint32	state;		// MSpanInUse etc
-	union {
-		uint32	*gcref;	// sizeclass > 0
-		uint32	gcref0;	// sizeclass == 0
-	};
+	byte	*limit;	// end of data in span
 };
 
 void	runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages);
@@ -336,6 +339,7 @@
 
 	// range of addresses we might see in the heap
 	byte *bitmap;
+	uintptr bitmap_mapped;
 	byte *arena_start;
 	byte *arena_used;
 	byte *arena_end;
@@ -359,26 +363,29 @@
 void	runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct);
 MSpan*	runtime·MHeap_Lookup(MHeap *h, void *v);
 MSpan*	runtime·MHeap_LookupMaybe(MHeap *h, void *v);
-void	runtime·MGetSizeClassInfo(int32 sizeclass, int32 *size, int32 *npages, int32 *nobj);
+void	runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *size, int32 *npages, int32 *nobj);
 void*	runtime·MHeap_SysAlloc(MHeap *h, uintptr n);
+void	runtime·MHeap_MapBits(MHeap *h);
 
 void*	runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed);
-int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s, uint32 **ref);
+int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void	runtime·gc(int32 force);
+void	runtime·markallocated(void *v, uintptr n, bool noptr);
+void	runtime·checkallocated(void *v, uintptr n);
+void	runtime·markfreed(void *v, uintptr n);
+void	runtime·checkfreed(void *v, uintptr n);
+int32	runtime·checking;
+void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
+void	runtime·unmarkspan(void *v, uintptr size);
+bool	runtime·blockspecial(void*);
+void	runtime·setblockspecial(void*);
 
 enum
 {
-	RefcountOverhead = 4,	// one uint32 per object
-
-	RefFree = 0,	// must be zero
-	RefStack,		// stack segment - don't free and don't scan for pointers
-	RefNone,		// no references
-	RefSome,		// some references
-	RefNoPointers = 0x80000000U,	// flag - no pointers here
-	RefHasFinalizer = 0x40000000U,	// flag - has finalizer
-	RefProfiled = 0x20000000U,	// flag - is in profiling table
-	RefNoProfiling = 0x10000000U,	// flag - must not profile
-	RefFlags = 0xFFFF0000U,
+	// flags to malloc
+	FlagNoPointers = 1<<0,	// no pointers here
+	FlagNoProfiling = 1<<1,	// must not profile
+	FlagNoGC = 1<<2,	// must not free or scan for pointers
 };
 
 void	runtime·MProf_Malloc(void*, uintptr);
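
With the Ref* word per object gone, a block's treatment is decided entirely
by the flag bits passed to mallocgc.  Two representative calls from this CL
(stackalloc and stringtoslicebyte):

	v = runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);   // stack segment: never profiled, never freed or scanned
	b.array = runtime·mallocgc(s.len, FlagNoPointers, 1, 1);   // byte data: collected, but not scanned for pointers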
diff --git a/src/pkg/runtime/mcentral.c b/src/pkg/runtime/mcentral.c
index f1ad119..29b03b5 100644
--- a/src/pkg/runtime/mcentral.c
+++ b/src/pkg/runtime/mcentral.c
@@ -113,8 +113,7 @@
 MCentral_Free(MCentral *c, void *v)
 {
 	MSpan *s;
-	PageID page;
-	MLink *p, *next;
+	MLink *p;
 	int32 size;
 
 	// Find span for v.
@@ -138,16 +137,8 @@
 	if(--s->ref == 0) {
 		size = runtime·class_to_size[c->sizeclass];
 		runtime·MSpanList_Remove(s);
-		// The second word of each freed block indicates
-		// whether it needs to be zeroed.  The first word
-		// is the link pointer and must always be cleared.
-		for(p=s->freelist; p; p=next) {
-			next = p->next;
-			if(size > sizeof(uintptr) && ((uintptr*)p)[1] != 0)
-				runtime·memclr((byte*)p, size);
-			else
-				p->next = nil;
-		}
+		runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
+		*(uintptr*)(s->start<<PageShift) = 1;  // needs zeroing
 		s->freelist = nil;
 		c->nfree -= (s->npages << PageShift) / size;
 		runtime·unlock(c);
@@ -157,7 +148,7 @@
 }
 
 void
-runtime·MGetSizeClassInfo(int32 sizeclass, int32 *sizep, int32 *npagesp, int32 *nobj)
+runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *sizep, int32 *npagesp, int32 *nobj)
 {
 	int32 size;
 	int32 npages;
@@ -166,7 +157,7 @@
 	size = runtime·class_to_size[sizeclass];
 	*npagesp = npages;
 	*sizep = size;
-	*nobj = (npages << PageShift) / (size + RefcountOverhead);
+	*nobj = (npages << PageShift) / size;
 }
 
 // Fetch a new span from the heap and
@@ -174,7 +165,8 @@
 static bool
 MCentral_Grow(MCentral *c)
 {
-	int32 i, n, npages, size;
+	int32 i, n, npages;
+	uintptr size;
 	MLink **tailp, *v;
 	byte *p;
 	MSpan *s;
@@ -191,7 +183,7 @@
 	// Carve span into sequence of blocks.
 	tailp = &s->freelist;
 	p = (byte*)(s->start << PageShift);
-	s->gcref = (uint32*)(p + size*n);
+	s->limit = p + size*n;
 	for(i=0; i<n; i++) {
 		v = (MLink*)p;
 		*tailp = v;
@@ -199,6 +191,7 @@
 		p += size;
 	}
 	*tailp = nil;
+	runtime·markspan((byte*)(s->start<<PageShift), size, n, size*n < (s->npages<<PageShift));
 
 	runtime·lock(c);
 	c->nfree += n;
diff --git a/src/pkg/runtime/mfinal.c b/src/pkg/runtime/mfinal.c
index f73561b..03ee777 100644
--- a/src/pkg/runtime/mfinal.c
+++ b/src/pkg/runtime/mfinal.c
@@ -5,6 +5,7 @@
 #include "runtime.h"
 #include "malloc.h"
 
+// TODO(rsc): Why not just use mheap.Lock?
 static Lock finlock;
 
 // Finalizer hash table.  Direct hash, linear scan, at most 3/4 full.
@@ -101,24 +102,21 @@
 	}
 
 	runtime·lock(&finlock);
-	if(!runtime·mlookup(p, &base, nil, nil, &ref) || p != base) {
+	if(!runtime·mlookup(p, &base, nil, nil) || p != base) {
 		runtime·unlock(&finlock);
 		runtime·throw("addfinalizer on invalid pointer");
 	}
 	if(f == nil) {
-		if(*ref & RefHasFinalizer) {
-			lookfintab(&fintab, p, 1);
-			*ref &= ~RefHasFinalizer;
-		}
+		lookfintab(&fintab, p, 1);
 		runtime·unlock(&finlock);
 		return;
 	}
 
-	if(*ref & RefHasFinalizer) {
+	if(lookfintab(&fintab, p, 0)) {
 		runtime·unlock(&finlock);
 		runtime·throw("double finalizer");
 	}
-	*ref |= RefHasFinalizer;
+	runtime·setblockspecial(p);
 
 	if(fintab.nkey >= fintab.max/2+fintab.max/4) {
 		// keep table at most 3/4 full:
@@ -134,7 +132,7 @@
 			newtab.max *= 3;
 		}
 
-		newtab.key = runtime·mallocgc(newtab.max*sizeof newtab.key[0], RefNoPointers, 0, 1);
+		newtab.key = runtime·mallocgc(newtab.max*sizeof newtab.key[0], FlagNoPointers, 0, 1);
 		newtab.val = runtime·mallocgc(newtab.max*sizeof newtab.val[0], 0, 0, 1);
 
 		for(i=0; i<fintab.max; i++) {
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index af1c721..232c6cd 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -2,28 +2,66 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Garbage collector -- step 0.
-//
-// Stop the world, mark and sweep garbage collector.
-// NOT INTENDED FOR PRODUCTION USE.
-//
-// A mark and sweep collector provides a way to exercise
-// and test the memory allocator and the stack walking machinery
-// without also needing to get reference counting
-// exactly right.
+// Garbage collector.
 
 #include "runtime.h"
 #include "malloc.h"
 
 enum {
-	Debug = 0
+	Debug = 0,
+	UseCas = 1,
+	PtrSize = sizeof(void*),
+	
+	// Four bits per word (see #defines below).
+	wordsPerBitmapWord = sizeof(void*)*8/4,
+	bitShift = sizeof(void*)*8/4,
 };
 
-typedef struct BlockList BlockList;
-struct BlockList
+// Bits in per-word bitmap.
+// #defines because enum might not be able to hold the values.
+//
+// Each word in the bitmap describes wordsPerBitmapWord words
+// of heap memory.  There are 4 bitmap bits dedicated to each heap word,
+// so on a 64-bit system there is one bitmap word per 16 heap words.
+// The bits in the word are packed together by type first, then by
+// heap location, so each 64-bit bitmap word consists of, from top to bottom,
+// the 16 bitSpecial bits for the corresponding heap words, then the 16 bitMarked bits,
+// then the 16 bitNoPointers/bitBlockBoundary bits, then the 16 bitAllocated bits.
+// This layout makes it easier to iterate over the bits of a given type.
+//
+// The bitmap starts at mheap.arena_start and extends *backward* from
+// there.  On a 64-bit system the off'th word in the arena is tracked by
+// the off/16+1'th word before mheap.arena_start.  (On a 32-bit system,
+// the only difference is that the divisor is 8.)
+//
+// To pull out the bits corresponding to a given pointer p, we use:
+//
+//	off = p - (uintptr*)mheap.arena_start;  // word offset
+//	b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
+//	shift = off % wordsPerBitmapWord
+//	bits = *b >> shift;
+//	/* then test bits & bitAllocated, bits & bitMarked, etc. */
+//
+#define bitAllocated		((uintptr)1<<(bitShift*0))
+#define bitNoPointers		((uintptr)1<<(bitShift*1))	/* when bitAllocated is set */
+#define bitMarked		((uintptr)1<<(bitShift*2))	/* when bitAllocated is set */
+#define bitSpecial		((uintptr)1<<(bitShift*3))	/* when bitAllocated is set - has finalizer or being profiled */
+#define bitBlockBoundary	((uintptr)1<<(bitShift*1))	/* when bitAllocated is NOT set */
+
+#define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
+
+static uint64 nlookup;
+static uint64 nsizelookup;
+static uint64 naddrlookup;
+static uint64 nhandoff;
+static int32 gctrace;
+
+typedef struct Workbuf Workbuf;
+struct Workbuf
 {
-	byte *obj;
-	uintptr size;
+	Workbuf *next;
+	uintptr nw;
+	byte *w[2048-2];
 };
 
 extern byte data[];
@@ -33,72 +71,258 @@
 static G *fing;
 static Finalizer *finq;
 static int32 fingwait;
-static BlockList *bl, *ebl;
+static uint32 nfullwait;
 
 static void runfinq(void);
+static bool bitlookup(void*, uintptr**, uintptr*, int32*);
+static Workbuf* getempty(Workbuf*);
+static Workbuf* getfull(Workbuf*);
 
-enum {
-	PtrSize = sizeof(void*)
-};
-
+// scanblock scans a block of n bytes starting at pointer b for references
+// to other objects, scanning any it finds recursively until there are no
+// unscanned objects left.  Instead of using an explicit recursion, it keeps
+// a work list in the Workbuf* structures and loops in the main function
+// body.  Keeping an explicit work list is easier on the stack allocator and
+// more efficient.
 static void
 scanblock(byte *b, int64 n)
 {
-	int32 off;
-	void *obj;
-	uintptr size;
-	uint32 *refp, ref;
+	byte *obj, *arena_start, *p;
 	void **vp;
-	int64 i;
-	BlockList *w;
+	uintptr size, *bitp, bits, shift, i, j, x, xbits, off;
+	MSpan *s;
+	PageID k;
+	void **bw, **w, **ew;
+	Workbuf *wbuf;
 
-	w = bl;
-	w->obj = b;
-	w->size = n;
-	w++;
+	// Memory arena parameters.
+	arena_start = runtime·mheap.arena_start;
+	
+	wbuf = nil;  // current work buffer
+	ew = nil;  // end of work buffer
+	bw = nil;  // beginning of work buffer
+	w = nil;  // current pointer into work buffer
 
-	while(w > bl) {
-		w--;
-		b = w->obj;
-		n = w->size;
+	// Align b to a word boundary.
+	off = (uintptr)b & (PtrSize-1);
+	if(off != 0) {
+		b += PtrSize - off;
+		n -= PtrSize - off;
+	}
 
+	for(;;) {
+		// Each iteration scans the block b of length n, queueing pointers in
+		// the work buffer.
 		if(Debug > 1)
 			runtime·printf("scanblock %p %D\n", b, n);
-		off = (uint32)(uintptr)b & (PtrSize-1);
-		if(off) {
-			b += PtrSize - off;
-			n -= PtrSize - off;
-		}
-	
+
 		vp = (void**)b;
 		n /= PtrSize;
 		for(i=0; i<n; i++) {
-			obj = vp[i];
-			if(obj == nil)
+			obj = (byte*)vp[i];
+			
+			// Words outside the arena cannot be pointers.
+			if((byte*)obj < arena_start || (byte*)obj >= runtime·mheap.arena_used)
 				continue;
-			if(runtime·mheap.arena_start <= (byte*)obj && (byte*)obj < runtime·mheap.arena_end) {
-				if(runtime·mlookup(obj, &obj, &size, nil, &refp)) {
-					ref = *refp;
-					switch(ref & ~RefFlags) {
-					case RefNone:
-						if(Debug > 1)
-							runtime·printf("found at %p: ", &vp[i]);
-						*refp = RefSome | (ref & RefFlags);
-						if(!(ref & RefNoPointers)) {
-							if(w >= ebl)
-								runtime·throw("scanblock: garbage collection stack overflow");
-							w->obj = obj;
-							w->size = size;
-							w++;
-						}
-						break;
-					}
+			
+			// obj may be a pointer to a live object.
+			// Try to find the beginning of the object.
+			
+			// Round down to word boundary.
+			obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+
+			// Find bits for this word.
+			off = (uintptr*)obj - (uintptr*)arena_start;
+			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+			shift = off % wordsPerBitmapWord;
+			xbits = *bitp;
+			bits = xbits >> shift;
+
+			// Pointing at the beginning of a block?
+			if((bits & (bitAllocated|bitBlockBoundary)) != 0)
+				goto found;
+
+			// Pointing just past the beginning?
+			// Scan backward a little to find a block boundary.
+			for(j=shift; j-->0; ) {
+				if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
+					obj = (byte*)obj - (shift-j)*PtrSize;
+					shift = j;
+					bits = xbits>>shift;
+					goto found;
 				}
 			}
+
+			// Otherwise consult span table to find beginning.
+			// (Manually inlined copy of MHeap_LookupMaybe.)
+			nlookup++;
+			naddrlookup++;
+			k = (uintptr)obj>>PageShift;
+			x = k;
+			if(sizeof(void*) == 8)
+				x -= (uintptr)arena_start>>PageShift;
+			s = runtime·mheap.map[x];
+			if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
+				continue;
+			p =  (byte*)((uintptr)s->start<<PageShift);
+			if(s->sizeclass == 0) {
+				obj = p;
+			} else {
+				if((byte*)obj >= (byte*)s->limit)
+					continue;
+				size = runtime·class_to_size[s->sizeclass];
+				int32 i = ((byte*)obj - p)/size;
+				obj = p+i*size;
+			}
+
+			// Now that we know the object header, reload bits.
+			off = (uintptr*)obj - (uintptr*)arena_start;
+			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+			shift = off % wordsPerBitmapWord;
+			xbits = *bitp;
+			bits = xbits >> shift;
+
+		found:
+			// Now we have bits, bitp, and shift correct for
+			// obj pointing at the base of the object.
+			// If not allocated or already marked, done.
+			if((bits & bitAllocated) == 0 || (bits & bitMarked) != 0)
+				continue;
+			*bitp |= bitMarked<<shift;
+
+			// If object has no pointers, don't need to scan further.
+			if((bits & bitNoPointers) != 0)
+				continue;
+
+			// If buffer is full, get a new one.
+			if(w >= ew) {
+				wbuf = getempty(wbuf);
+				bw = wbuf->w;
+				w = bw;
+				ew = bw + nelem(wbuf->w);
+			}
+			*w++ = obj;
 		}
+		
+		// Done scanning [b, b+n).  Prepare for the next iteration of
+		// the loop by setting b and n to the parameters for the next block.
+
+		// Fetch b from the work buffers.
+		if(w <= bw) {
+			// Emptied our buffer: refill.
+			wbuf = getfull(wbuf);
+			if(wbuf == nil)
+				break;
+			bw = wbuf->w;
+			ew = wbuf->w + nelem(wbuf->w);
+			w = bw+wbuf->nw;
+		}
+		b = *--w;
+	
+		// Figure out n = size of b.  Start by loading bits for b.
+		off = (uintptr*)b - (uintptr*)arena_start;
+		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+		shift = off % wordsPerBitmapWord;
+		xbits = *bitp;
+		bits = xbits >> shift;
+		
+		// Might be small; look for nearby block boundary.
+		// A block boundary is marked by either bitBlockBoundary
+		// or bitAllocated being set (see notes near their definition).
+		enum {
+			boundary = bitBlockBoundary|bitAllocated
+		};
+		// Look for a block boundary both after and before b
+		// in the same bitmap word.
+		//
+		// A block boundary j words after b is indicated by
+		//	bits>>j & boundary
+		// assuming shift+j < bitShift.  (If shift+j >= bitShift then
+		// we'll be bleeding other bit types like bitMarked into our test.)
+		// Instead of inserting the conditional shift+j < bitShift into the loop,
+		// we can let j range from 1 to bitShift as long as we first
+		// apply a mask to keep only the bits corresponding
+		// to shift+j < bitShift aka j < bitShift-shift.
+		bits &= (boundary<<(bitShift-shift)) - boundary;
+		
+		// A block boundary j words before b is indicated by
+		//	xbits>>(shift-j) & boundary
+		// (assuming shift >= j).  There is no cleverness here to
+		// avoid the test, because when j gets too large the shift
+		// turns negative, which is undefined in C.
+
+		for(j=1; j<bitShift; j++) {
+			if(((bits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) {
+				n = j*PtrSize;
+				goto scan;
+			}
+		}
+		
+		// Fall back to asking span about size class.
+		// (Manually inlined copy of MHeap_Lookup.)
+		nlookup++;
+		nsizelookup++;
+		x = (uintptr)b>>PageShift;
+		if(sizeof(void*) == 8)
+			x -= (uintptr)arena_start>>PageShift;
+		s = runtime·mheap.map[x];
+		if(s->sizeclass == 0)
+			n = s->npages<<PageShift;
+		else
+			n = runtime·class_to_size[s->sizeclass];
+	scan:;
 	}
 }
 
+static struct {
+	Workbuf	*full;
+	Workbuf	*empty;
+	byte	*chunk;
+	uintptr	nchunk;
+} work;
+
+// Get an empty work buffer off the work.empty list,
+// allocating new buffers as needed.
+static Workbuf*
+getempty(Workbuf *b)
+{
+	if(b != nil) {
+		b->nw = nelem(b->w);
+		b->next = work.full;
+		work.full = b;
+	}
+	b = work.empty;
+	if(b != nil) {
+		work.empty = b->next;
+		return b;
+	}
+	
+	if(work.nchunk < sizeof *b) {
+		work.nchunk = 1<<20;
+		work.chunk = runtime·SysAlloc(work.nchunk);
+	}
+	b = (Workbuf*)work.chunk;
+	work.chunk += sizeof *b;
+	work.nchunk -= sizeof *b;
+	return b;
+}
+
+// Get a full work buffer off the work.full list, or return nil.
+static Workbuf*
+getfull(Workbuf *b)
+{
+	if(b != nil) {
+		b->nw = 0;
+		b->next = work.empty;
+		work.empty = b;
+	}
+	b = work.full;
+	if(b != nil)
+		work.full = b->next;
+	return b;
+}
+
+// Scanstack calls scanblock on each of gp's stack segments.
 static void
 scanstack(G *gp)
 {
@@ -119,46 +343,26 @@
 	}
 }
 
+// Markfin calls scanblock on the blocks that have finalizers:
+// the things pointed at cannot be freed until the finalizers have run.
 static void
 markfin(void *v)
 {
 	uintptr size;
-	uint32 *refp;
 
 	size = 0;
-	refp = nil;
-	if(!runtime·mlookup(v, &v, &size, nil, &refp) || !(*refp & RefHasFinalizer))
+	if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v))
 		runtime·throw("mark - finalizer inconsistency");
-	
+
 	// do not mark the finalizer block itself.  just mark the things it points at.
 	scanblock(v, size);
 }
 
+// Mark scans all the roots: data+bss, each goroutine stack, and the finalizer table.
 static void
 mark(void)
 {
 	G *gp;
-	uintptr blsize, nobj;
-
-	// Figure out how big an object stack we need.
-	// Get a new one if we need more than we have
-	// or we need significantly less than we have.
-	nobj = mstats.heap_objects;
-	if(nobj > ebl - bl || nobj < (ebl-bl)/4) {
-		if(bl != nil)
-			runtime·SysFree(bl, (byte*)ebl - (byte*)bl);
-		
-		// While we're allocated a new object stack,
-		// add 20% headroom and also round up to
-		// the nearest page boundary, since mmap
-		// will anyway.
-		nobj = nobj * 12/10;
-		blsize = nobj * sizeof *bl;
-		blsize = (blsize + 4095) & ~4095;
-		nobj = blsize / sizeof *bl;
-		bl = runtime·SysAlloc(blsize);
-		ebl = bl + nobj;
-	}
 
 	// mark data+bss.
 	// skip runtime·mheap itself, which has no interesting pointers
@@ -192,95 +396,83 @@
 	runtime·walkfintab(markfin);
 }
 
-// free RefNone, free & queue finalizers for RefNone|RefHasFinalizer, reset RefSome
-static void
-sweepspan(MSpan *s)
-{
-	int32 n, npages, size;
-	byte *p;
-	uint32 ref, *gcrefp, *gcrefep;
-	MCache *c;
-	Finalizer *f;
-
-	p = (byte*)(s->start << PageShift);
-	if(s->sizeclass == 0) {
-		// Large block.
-		ref = s->gcref0;
-		switch(ref & ~(RefFlags^RefHasFinalizer)) {
-		case RefNone:
-			// Free large object.
-			mstats.alloc -= s->npages<<PageShift;
-			mstats.nfree++;
-			runtime·memclr(p, s->npages<<PageShift);
-			if(ref & RefProfiled)
-				runtime·MProf_Free(p, s->npages<<PageShift);
-			s->gcref0 = RefFree;
-			runtime·MHeap_Free(&runtime·mheap, s, 1);
-			break;
-		case RefNone|RefHasFinalizer:
-			f = runtime·getfinalizer(p, 1);
-			if(f == nil)
-				runtime·throw("finalizer inconsistency");
-			f->arg = p;
-			f->next = finq;
-			finq = f;
-			ref &= ~RefHasFinalizer;
-			// fall through
-		case RefSome:
-		case RefSome|RefHasFinalizer:
-			s->gcref0 = RefNone | (ref&RefFlags);
-			break;
-		}
-		return;
-	}
-
-	// Chunk full of small blocks.
-	runtime·MGetSizeClassInfo(s->sizeclass, &size, &npages, &n);
-	gcrefp = s->gcref;
-	gcrefep = s->gcref + n;
-	for(; gcrefp < gcrefep; gcrefp++, p += size) {
-		ref = *gcrefp;
-		if(ref < RefNone)	// RefFree or RefStack
-			continue;
-		switch(ref & ~(RefFlags^RefHasFinalizer)) {
-		case RefNone:
-			// Free small object.
-			if(ref & RefProfiled)
-				runtime·MProf_Free(p, size);
-			*gcrefp = RefFree;
-			c = m->mcache;
-			if(size > sizeof(uintptr))
-				((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
-			mstats.alloc -= size;
-			mstats.nfree++;
-			mstats.by_size[s->sizeclass].nfree++;
-			runtime·MCache_Free(c, p, s->sizeclass, size);
-			break;
-		case RefNone|RefHasFinalizer:
-			f = runtime·getfinalizer(p, 1);
-			if(f == nil)
-				runtime·throw("finalizer inconsistency");
-			f->arg = p;
-			f->next = finq;
-			finq = f;
-			ref &= ~RefHasFinalizer;
-			// fall through
-		case RefSome:
-		case RefSome|RefHasFinalizer:
-			*gcrefp = RefNone | (ref&RefFlags);
-			break;
-		}
-	}
-}
-
+// Sweep frees or calls finalizers for blocks not marked in the mark phase.
+// It clears the mark bits in preparation for the next GC round.
 static void
 sweep(void)
 {
 	MSpan *s;
+	int32 cl, n, npages;
+	uintptr size;
+	byte *p;
+	MCache *c;
+	Finalizer *f;
 
-	for(s = runtime·mheap.allspans; s != nil; s = s->allnext)
-		if(s->state == MSpanInUse)
-			sweepspan(s);
+	for(s = runtime·mheap.allspans; s != nil; s = s->allnext) {
+		if(s->state != MSpanInUse)
+			continue;
+
+		p = (byte*)(s->start << PageShift);
+		cl = s->sizeclass;
+		if(cl == 0) {
+			size = s->npages<<PageShift;
+			n = 1;
+		} else {
+			// Chunk full of small blocks.
+			size = runtime·class_to_size[cl];
+			npages = runtime·class_to_allocnpages[cl];
+			n = (npages << PageShift) / size;
+		}
+	
+		// sweep through n objects of given size starting at p.
+		for(; n > 0; n--, p += size) {
+			uintptr off, *bitp, shift, bits;
+
+			off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;
+			bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+			shift = off % wordsPerBitmapWord;
+			bits = *bitp>>shift;
+
+			if((bits & bitAllocated) == 0)
+				continue;
+
+			if((bits & bitMarked) != 0) {
+				*bitp &= ~(bitMarked<<shift);
+				continue;
+			}
+
+			if((bits & bitSpecial) != 0) {
+				// Special means it has a finalizer or is being profiled.
+				f = runtime·getfinalizer(p, 1);
+				if(f != nil) {
+					f->arg = p;
+					f->next = finq;
+					finq = f;
+					continue;
+				}
+				runtime·MProf_Free(p, size);
+			}
+
+			// Mark freed; restore block boundary bit.
+			*bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
+
+			if(s->sizeclass == 0) {
+				// Free large span.
+				runtime·unmarkspan(p, 1<<PageShift);
+				*(uintptr*)p = 1;	// needs zeroing
+				runtime·MHeap_Free(&runtime·mheap, s, 1);
+			} else {
+				// Free small object.
+				c = m->mcache;
+				if(size > sizeof(uintptr))
+					((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
+				mstats.by_size[s->sizeclass].nfree++;
+				runtime·MCache_Free(c, p, s->sizeclass, size);
+			}
+			mstats.alloc -= size;
+			mstats.nfree++;
+		}
+	}
 }
 
 // Semaphore, not Lock, so that the goroutine
@@ -326,7 +518,8 @@
 void
 runtime·gc(int32 force)
 {
-	int64 t0, t1;
+	int64 t0, t1, t2, t3;
+	uint64 heap0, heap1, obj0, obj1;
 	byte *p;
 	Finalizer *fp;
 
@@ -349,23 +542,41 @@
 			gcpercent = -1;
 		else
 			gcpercent = runtime·atoi(p);
+		
+		p = runtime·getenv("GOGCTRACE");
+		if(p != nil)
+			gctrace = runtime·atoi(p);
 	}
 	if(gcpercent < 0)
 		return;
 
 	runtime·semacquire(&gcsema);
+	if(!force && mstats.heap_alloc < mstats.next_gc) {
+		runtime·semrelease(&gcsema);
+		return;
+	}
+
 	t0 = runtime·nanotime();
+	nlookup = 0;
+	nsizelookup = 0;
+	naddrlookup = 0;
+
 	m->gcing = 1;
 	runtime·stoptheworld();
 	if(runtime·mheap.Lock.key != 0)
 		runtime·throw("runtime·mheap locked during gc");
-	if(force || mstats.heap_alloc >= mstats.next_gc) {
-		cachestats();
-		mark();
-		sweep();
-		stealcache();
-		mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
-	}
+
+	cachestats();
+	heap0 = mstats.heap_alloc;
+	obj0 = mstats.nmalloc - mstats.nfree;
+
+	mark();
+	t1 = runtime·nanotime();
+	sweep();
+	t2 = runtime·nanotime();
+	stealcache();
+
+	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
 	m->gcing = 0;
 
 	m->locks++;	// disable gc during the mallocs in newproc
@@ -381,18 +592,34 @@
 	}
 	m->locks--;
 
-	t1 = runtime·nanotime();
+	cachestats();
+	heap1 = mstats.heap_alloc;
+	obj1 = mstats.nmalloc - mstats.nfree;
+
+	t3 = runtime·nanotime();
+	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0;
+	mstats.pause_total_ns += t3 - t0;
 	mstats.numgc++;
-	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t1 - t0;
-	mstats.pause_total_ns += t1 - t0;
 	if(mstats.debuggc)
-		runtime·printf("pause %D\n", t1-t0);
+		runtime·printf("pause %D\n", t3-t0);
+	
+	if(gctrace) {
+		runtime·printf("gc%d: %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr)\n",
+			mstats.numgc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
+			heap0>>20, heap1>>20, obj0, obj1,
+			mstats.nmalloc, mstats.nfree,
+			nlookup, nsizelookup, naddrlookup);
+	}
+
 	runtime·semrelease(&gcsema);
 	runtime·starttheworld();
 	
 	// give the queued finalizers, if any, a chance to run
 	if(fp != nil)
 		runtime·gosched();
+	
+	if(gctrace > 1 && !force)
+		runtime·gc(1);
 }
 
 static void
@@ -430,3 +657,157 @@
 		runtime·gc(1);	// trigger another gc to clean up the finalized objects, if possible
 	}
 }
+
+// mark the block at v of size n as allocated.
+// If noptr is true, mark it as having no pointers.
+void
+runtime·markallocated(void *v, uintptr n, bool noptr)
+{
+	uintptr *b, bits, off, shift;
+
+	if(0)
+		runtime·printf("markallocated %p+%p\n", v, n);
+
+	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
+		runtime·throw("markallocated: bad pointer");
+
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	shift = off % wordsPerBitmapWord;
+
+	bits = (*b & ~(bitMask<<shift)) | (bitAllocated<<shift);
+	if(noptr)
+		bits |= bitNoPointers<<shift;
+	*b = bits;
+}
+
+// mark the block at v of size n as freed.
+void
+runtime·markfreed(void *v, uintptr n)
+{
+	uintptr *b, off, shift;
+
+	if(0)
+		runtime·printf("markallocated %p+%p\n", v, n);
+
+	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
+		runtime·throw("markallocated: bad pointer");
+
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	shift = off % wordsPerBitmapWord;
+
+	*b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
+}
+
+// check that the block at v of size n is marked freed.
+void
+runtime·checkfreed(void *v, uintptr n)
+{
+	uintptr *b, bits, off, shift;
+
+	if(!runtime·checking)
+		return;
+
+	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
+		return;	// not allocated, so okay
+
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	shift = off % wordsPerBitmapWord;
+
+	bits = *b>>shift;
+	if((bits & bitAllocated) != 0) {
+		runtime·printf("checkfreed %p+%p: off=%p have=%p\n",
+			v, n, off, bits & bitMask);
+		runtime·throw("checkfreed: not freed");
+	}
+}
+
+// mark the span of memory at v as having n blocks of the given size.
+// if leftover is true, there is left over space at the end of the span.
+void
+runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
+{
+	uintptr *b, off, shift;
+	byte *p;
+
+	if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
+		runtime·throw("markspan: bad pointer");
+
+	p = v;
+	if(leftover)	// mark a boundary just past end of last block too
+		n++;
+	for(; n-- > 0; p += size) {
+		off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;  // word offset
+		b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+		shift = off % wordsPerBitmapWord;
+		*b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
+	}
+}
+
+// unmark the span of memory at v of length n bytes.
+void
+runtime·unmarkspan(void *v, uintptr n)
+{
+	uintptr *p, *b, off;
+
+	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
+		runtime·throw("markspan: bad pointer");
+
+	p = v;
+	off = p - (uintptr*)runtime·mheap.arena_start;  // word offset
+	if(off % wordsPerBitmapWord != 0)
+		runtime·throw("markspan: unaligned pointer");
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	n /= PtrSize;
+	if(n%wordsPerBitmapWord != 0)
+		runtime·throw("unmarkspan: unaligned length");
+	n /= wordsPerBitmapWord;
+	while(n-- > 0)
+		*b-- = 0;
+}
+
+bool
+runtime·blockspecial(void *v)
+{
+	uintptr *b, off, shift;
+
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	shift = off % wordsPerBitmapWord;
+
+	return (*b & (bitSpecial<<shift)) != 0;
+}
+
+void
+runtime·setblockspecial(void *v)
+{
+	uintptr *b, off, shift;
+
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;
+	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	shift = off % wordsPerBitmapWord;
+
+	*b |= bitSpecial<<shift;
+}
+
+void
+runtime·MHeap_MapBits(MHeap *h)
+{
+	// Caller has added extra mappings to the arena.
+	// Add extra mappings of bitmap words as needed.
+	// We allocate extra bitmap pieces in chunks of bitmapChunk.
+	enum {
+		bitmapChunk = 8192
+	};
+	uintptr n;
+	
+	n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
+	n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
+	if(h->bitmap_mapped >= n)
+		return;
+
+	runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped);
+	h->bitmap_mapped = n;
+}
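
Taken together, the bitmap operations above move each block through a small
cycle of states; a sketch of the transitions (illustrative, with b and
shift computed as in scanblock):

	*b = (*b & ~(bitMask<<shift)) | (bitAllocated<<shift);      // markallocated: freshly allocated
	*b |= bitMarked<<shift;                                     // scanblock: found reachable
	*b &= ~(bitMarked<<shift);                                  // sweep: survivor, cleared for next GC
	*b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);  // sweep/markfreed: dead, only boundary bit kept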
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index 0c9ac0a..8061b7c 100644
--- a/src/pkg/runtime/mheap.c
+++ b/src/pkg/runtime/mheap.c
@@ -180,7 +180,9 @@
 	// Allocate a multiple of 64kB (16 pages).
 	npage = (npage+15)&~15;
 	ask = npage<<PageShift;
-	if(ask < HeapAllocChunk)
+	if(ask > h->arena_end - h->arena_used)
+		return false;
+	if(ask < HeapAllocChunk && HeapAllocChunk <= h->arena_end - h->arena_used)
 		ask = HeapAllocChunk;
 
 	v = runtime·MHeap_SysAlloc(h, ask);
@@ -194,11 +196,6 @@
 	}
 	mstats.heap_sys += ask;
 
-	if((byte*)v < h->arena_start || h->arena_start == nil)
-		h->arena_start = v;
-	if((byte*)v+ask > h->arena_end)
-		h->arena_end = (byte*)v+ask;
-
 	// Create a fake "in use" span and free it, so that the
 	// right coalescing happens.
 	s = runtime·FixAlloc_Alloc(&h->spanalloc);
@@ -370,10 +367,14 @@
 void
 runtime·MSpanList_Insert(MSpan *list, MSpan *span)
 {
-	if(span->next != nil || span->prev != nil)
+	if(span->next != nil || span->prev != nil) {
+		runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev);
 		runtime·throw("MSpanList_Insert");
+	}
 	span->next = list->next;
 	span->prev = list;
 	span->next->prev = span;
 	span->prev->next = span;
 }
+
+
diff --git a/src/pkg/runtime/mprof.goc b/src/pkg/runtime/mprof.goc
index f4581e9..aae3d18 100644
--- a/src/pkg/runtime/mprof.goc
+++ b/src/pkg/runtime/mprof.goc
@@ -65,7 +65,7 @@
 		   runtime·mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0)
 			return b;
 
-	b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], RefNoProfiling, 0, 1);
+	b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1);
 	bucketmem += sizeof *b + nstk*sizeof stk[0];
 	runtime·memmove(b->stk, stk, nstk*sizeof stk[0]);
 	b->hash = h;
@@ -132,7 +132,7 @@
 		if(ah->addr == (addr>>20))
 			goto found;
 
-	ah = runtime·mallocgc(sizeof *ah, RefNoProfiling, 0, 1);
+	ah = runtime·mallocgc(sizeof *ah, FlagNoProfiling, 0, 1);
 	addrmem += sizeof *ah;
 	ah->next = addrhash[h];
 	ah->addr = addr>>20;
@@ -140,7 +140,7 @@
 
 found:
 	if((e = addrfree) == nil) {
-		e = runtime·mallocgc(64*sizeof *e, RefNoProfiling, 0, 0);
+		e = runtime·mallocgc(64*sizeof *e, FlagNoProfiling, 0, 0);
 		addrmem += 64*sizeof *e;
 		for(i=0; i+1<64; i++)
 			e[i].next = &e[i+1];
diff --git a/src/pkg/runtime/msize.c b/src/pkg/runtime/msize.c
index ec85eb3..770ef38 100644
--- a/src/pkg/runtime/msize.c
+++ b/src/pkg/runtime/msize.c
@@ -57,7 +57,7 @@
 void
 runtime·InitSizes(void)
 {
-	int32 align, sizeclass, size, osize, nextsize, n;
+	int32 align, sizeclass, size, nextsize, n;
 	uint32 i;
 	uintptr allocsize, npages;
 
@@ -81,8 +81,7 @@
 		// the leftover is less than 1/8 of the total,
 		// so wasted space is at most 12.5%.
 		allocsize = PageSize;
-		osize = size + RefcountOverhead;
-		while(allocsize%osize > (allocsize/8))
+		while(allocsize%size > allocsize/8)
 			allocsize += PageSize;
 		npages = allocsize >> PageShift;
 
@@ -93,7 +92,7 @@
 		// different sizes.
 		if(sizeclass > 1
 		&& npages == runtime·class_to_allocnpages[sizeclass-1]
-		&& allocsize/osize == allocsize/(runtime·class_to_size[sizeclass-1]+RefcountOverhead)) {
+		&& allocsize/size == allocsize/runtime·class_to_size[sizeclass-1]) {
 			runtime·class_to_size[sizeclass-1] = size;
 			continue;
 		}
diff --git a/src/pkg/runtime/slice.c b/src/pkg/runtime/slice.c
index 0510754..1fee923 100644
--- a/src/pkg/runtime/slice.c
+++ b/src/pkg/runtime/slice.c
@@ -41,7 +41,7 @@
 	ret->cap = cap;
 
 	if((t->elem->kind&KindNoPointers))
-		ret->array = runtime·mallocgc(size, RefNoPointers, 1, 1);
+		ret->array = runtime·mallocgc(size, FlagNoPointers, 1, 1);
 	else
 		ret->array = runtime·mal(size);
 }
diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc
index 916559e..b72aa93 100644
--- a/src/pkg/runtime/string.goc
+++ b/src/pkg/runtime/string.goc
@@ -225,7 +225,7 @@
 }
 
 func stringtoslicebyte(s String) (b Slice) {
-	b.array = runtime·mallocgc(s.len, RefNoPointers, 1, 1);
+	b.array = runtime·mallocgc(s.len, FlagNoPointers, 1, 1);
 	b.len = s.len;
 	b.cap = s.len;
 	runtime·mcpy(b.array, s.str, s.len);
@@ -268,7 +268,7 @@
 		n++;
 	}
 
-	b.array = runtime·mallocgc(n*sizeof(r[0]), RefNoPointers, 1, 1);
+	b.array = runtime·mallocgc(n*sizeof(r[0]), FlagNoPointers, 1, 1);
 	b.len = n;
 	b.cap = n;
 	p = s.str;
diff --git a/src/pkg/runtime/windows/mem.c b/src/pkg/runtime/windows/mem.c
index 19d11ce..c523195 100644
--- a/src/pkg/runtime/windows/mem.c
+++ b/src/pkg/runtime/windows/mem.c
@@ -48,7 +48,7 @@
 void*
 runtime·SysReserve(void *v, uintptr n)
 {
-	return runtime·stdcall(runtime·VirtualAlloc, 4, v, n, MEM_RESERVE, 0);
+	return runtime·stdcall(runtime·VirtualAlloc, 4, v, n, MEM_RESERVE, PAGE_EXECUTE_READWRITE);
 }
 
 void