malloc bug fixes.

use malloc by default; the old allocator is kept as oldmal.
free stacks; while inside malloc, fall back to a fixed-size FixAlloc
list to avoid deadlock.

R=r
DELTA=424  (333 added, 29 deleted, 62 changed)
OCL=21553
CL=21584
diff --git a/src/runtime/malloc.c b/src/runtime/malloc.c
index 6246fa9..744e122 100644
--- a/src/runtime/malloc.c
+++ b/src/runtime/malloc.c
@@ -25,6 +25,10 @@
 	MSpan *s;
 	void *v;
 
+	if(m->mallocing)
+		throw("malloc - deadlock");
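+	// Record that we are inside malloc; stackalloc and stackfree
+	// (below) check this flag and use a separate fixed-size
+	// allocator to avoid deadlocking on the malloc locks.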
+	m->mallocing = 1;
+
 	if(size == 0)
 		size = 1;
 
@@ -35,22 +39,24 @@
 		c = m->mcache;
 		v = MCache_Alloc(c, sizeclass, size);
 		if(v == nil)
-			return nil;
+			throw("out of memory");
 		mstats.alloc += size;
-		return v;
+	} else {
+		// TODO(rsc): Report tracebacks for very large allocations.
+
+		// Allocate directly from heap.
+		npages = size >> PageShift;
+		if((size & PageMask) != 0)
+			npages++;
+		s = MHeap_Alloc(&mheap, npages, 0);
+		if(s == nil)
+			throw("out of memory");
+		mstats.alloc += npages<<PageShift;
+		v = (void*)(s->start << PageShift);
 	}
 
-	// TODO(rsc): Report tracebacks for very large allocations.
-
-	// Allocate directly from heap.
-	npages = size >> PageShift;
-	if((size & PageMask) != 0)
-		npages++;
-	s = MHeap_Alloc(&mheap, npages, 0);
-	if(s == nil)
-		return nil;
-	mstats.alloc += npages<<PageShift;
-	return (void*)(s->start << PageShift);
+	m->mallocing = 0;
+	return v;
 }
 
 // Free the object whose base pointer is v.
@@ -89,6 +95,34 @@
 	MCache_Free(c, v, sizeclass, size);
 }
 
+void
+mlookup(void *v, byte **base, uintptr *size)
+{
+	uintptr n, off;
+	byte *p;
+	MSpan *s;
+
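+	// Find the span containing v; if v is not a heap pointer,
+	// report a nil base and zero size.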
+	s = MHeap_Lookup(&mheap, (uintptr)v>>PageShift);
+	if(s == nil) {
+		*base = nil;
+		*size = 0;
+		return;
+	}
+
+	p = (byte*)((uintptr)s->start<<PageShift);
+	if(s->sizeclass == 0) {
+		// Large object.
+		*base = p;
+		*size = s->npages<<PageShift;
+		return;
+	}
+
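+	// Small object: round v down to the start of its size-class block.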
+	n = class_to_size[s->sizeclass];
+	off = ((byte*)v - p)/n * n;
+	*base = p+off;
+	*size = n;
+}
+
 MCache*
 allocmcache(void)
 {
@@ -144,6 +178,80 @@
 	// TODO(rsc): call munmap
 }
 
+// Runtime stubs.
+
+extern void *oldmal(uint32);
+
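+// mal is the allocation entry point used by the rest of the runtime;
+// it now goes through malloc.  The old allocator is still available as oldmal.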
+void*
+mal(uint32 n)
+{
+//return oldmal(n);
+	void *v;
+
+	v = malloc(n);
+
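+	// Disabled check: verify that malloc returned zeroed memory.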
+	if(0) {
+		byte *p;
+		int32 i;
+		p = v;
+		for(i=0; i<n; i++) {
+			if(p[i] != 0) {
+				printf("mal %d => %p: byte %d is non-zero\n", n, v, i);
+				throw("mal");
+			}
+		}
+	}
+
+//printf("mal %d %p\n", n, v);  // |checkmal to check for overlapping returns.
+	return v;
+}
+
+// Stack allocator uses malloc/free most of the time,
+// but if we're in the middle of malloc and need stack,
+// we have to do something else to avoid deadlock.
+// In that case, we fall back on a fixed-size free-list
+// allocator, assuming that inside malloc all the stack
+// frames are small, so that all the stack allocations
+// will be a single size, the minimum (right now, 5k).
+struct {
+	Lock;
+	FixAlloc;
+} stacks;
+
+void*
+stackalloc(uint32 n)
+{
+	void *v;
+
+//return oldmal(n);
+	if(m->mallocing) {
+		lock(&stacks);
+		if(stacks.size == 0)
+			FixAlloc_Init(&stacks, n, SysAlloc);
+		if(stacks.size != n) {
+			printf("stackalloc: in malloc, size=%D want %d\n", stacks.size, n);
+			throw("stackalloc");
+		}
+		v = FixAlloc_Alloc(&stacks);
+		unlock(&stacks);
+		return v;
+	}
+	return malloc(n);
+}
+
+void
+stackfree(void *v)
+{
+//return;
+
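+	// Inside malloc, return the stack to the fixed-size free list;
+	// calling free here could deadlock.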
+	if(m->mallocing) {
+		lock(&stacks);
+		FixAlloc_Free(&stacks, v);
+		unlock(&stacks);
+		return;
+	}
+	free(v);
+}
 
 // Go function stubs.
 
@@ -161,9 +269,14 @@
 }
 
 void
+malloc·Lookup(byte *p, byte *base, uintptr size)
+{
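+	// base and size are filled in by mlookup and passed back to the Go caller.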
+	mlookup(p, &base, &size);
+}
+
+void
 malloc·GetStats(MStats *s)
 {
 	s = &mstats;
 	FLUSH(&s);
 }
-
diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h
index e2a4af9..9c71e63 100644
--- a/src/runtime/malloc.h
+++ b/src/runtime/malloc.h
@@ -62,7 +62,7 @@
 //	4. If the heap has too much memory, return some to the
 //	   operating system.
 //
-//	TODO(rsc): Steps 2, 3, 4 are not implemented.
+//	TODO(rsc): Step 4 is not implemented.
 //
 // Allocating and freeing a large object uses the page heap
 // directly, bypassing the MCache and MCentral free lists.
@@ -79,6 +79,7 @@
 typedef struct MHeapMapCache	MHeapMapCache;
 typedef struct MSpan	MSpan;
 typedef struct MStats	MStats;
+typedef struct MLink	MLink;
 
 enum
 {
@@ -102,6 +103,12 @@
 };
 
 
+// A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
+struct MLink
+{
+	MLink *next;
+};
+
 // SysAlloc obtains a large chunk of memory from the operating system,
 // typically on the order of a hundred kilobytes or a megabyte.
 //
@@ -129,7 +136,7 @@
 {
 	uintptr size;
 	void *(*alloc)(uintptr);
-	void *list;
+	MLink *list;
 	byte *chunk;
 	uint32 nchunk;
 };
@@ -146,6 +153,7 @@
 {
 	uint64	alloc;
 	uint64	sys;
+	uint64	stacks;
 };
 extern MStats mstats;
 
@@ -175,8 +183,9 @@
 typedef struct MCacheList MCacheList;
 struct MCacheList
 {
-	void *list;
+	MLink *list;
 	uint32 nlist;
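+	// nlistmin is the smallest nlist has been since the last scavenge.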
+	uint32 nlistmin;
 };
 
 struct MCache
@@ -230,8 +239,8 @@
 };
 
 void	MCentral_Init(MCentral *c, int32 sizeclass);
-int32	MCentral_AllocList(MCentral *c, int32 n, void **start, void **end);
-void	MCentral_FreeList(MCentral *c, int32 n, void *start, void *end);
+int32	MCentral_AllocList(MCentral *c, int32 n, MLink **first);
+void	MCentral_FreeList(MCentral *c, int32 n, MLink *first);
 
 
 // Free(v) must be able to determine the MSpan containing v.
diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c
index 01a718a..ae25940 100644
--- a/src/runtime/mcache.c
+++ b/src/runtime/mcache.c
@@ -13,7 +13,7 @@
 MCache_Alloc(MCache *c, int32 sizeclass, uintptr size)
 {
 	MCacheList *l;
-	void *v, *start, *end;
+	MLink *first, *v;
 	int32 n;
 
 	// Allocate from list.
@@ -21,41 +21,85 @@
 	if(l->list == nil) {
 		// Replenish using central lists.
 		n = MCentral_AllocList(&mheap.central[sizeclass],
-			class_to_transfercount[sizeclass], &start, &end);
-		if(n == 0)
-			return nil;
-		l->list = start;
+			class_to_transfercount[sizeclass], &first);
+		l->list = first;
 		l->nlist = n;
 		c->size += n*size;
 	}
 	v = l->list;
-	l->list = *(void**)v;
+	l->list = v->next;
 	l->nlist--;
+	if(l->nlist < l->nlistmin)
+		l->nlistmin = l->nlist;
 	c->size -= size;
 
 	// v is zeroed except for the link pointer
 	// that we used above; zero that.
-	*(void**)v = nil;
+	v->next = nil;
 	return v;
 }
 
-void
-MCache_Free(MCache *c, void *p, int32 sizeclass, uintptr size)
+// Take n elements off l and return them to the central free list.
+static void
+ReleaseN(MCache *c, MCacheList *l, int32 n, int32 sizeclass)
 {
+	MLink *first, **lp;
+	int32 i;
+
+	// Cut off first n elements.
+	first = l->list;
+	lp = &l->list;
+	for(i=0; i<n; i++)
+		lp = &(*lp)->next;
+	l->list = *lp;
+	*lp = nil;
+	l->nlist -= n;
+	if(l->nlist < l->nlistmin)
+		l->nlistmin = l->nlist;
+	c->size -= n*class_to_size[sizeclass];
+
+	// Return them to central free list.
+	MCentral_FreeList(&mheap.central[sizeclass], n, first);
+}
+
+void
+MCache_Free(MCache *c, void *v, int32 sizeclass, uintptr size)
+{
+	int32 i, n;
 	MCacheList *l;
+	MLink *p;
 
 	// Put back on list.
 	l = &c->list[sizeclass];
-	*(void**)p = l->list;
+	p = v;
+	p->next = l->list;
 	l->list = p;
 	l->nlist++;
 	c->size += size;
 
 	if(l->nlist >= MaxMCacheListLen) {
-		// TODO(rsc): Release to central cache.
+		// Release a chunk back.
+		ReleaseN(c, l, class_to_transfercount[sizeclass], sizeclass);
 	}
+
 	if(c->size >= MaxMCacheSize) {
-		// TODO(rsc): Scavenge.
+		// Scavenge.
+		for(i=0; i<NumSizeClasses; i++) {
+			l = &c->list[i];
+			n = l->nlistmin;
+
+			// n is the minimum number of elements we've seen on
+			// the list since the last scavenge.  If n > 0, it means that
+			// we could have gotten by with n fewer elements
+			// without needing to consult the central free list.
+			// Move toward that situation by releasing n/2 of them.
+			if(n > 0) {
+				if(n > 1)
+					n /= 2;
+				ReleaseN(c, l, n, i);
+			}
+			l->nlistmin = l->nlist;
+		}
 	}
 }
 
diff --git a/src/runtime/mcentral.c b/src/runtime/mcentral.c
index 775ccb3..badf68e 100644
--- a/src/runtime/mcentral.c
+++ b/src/runtime/mcentral.c
@@ -35,42 +35,35 @@
 // The objects are linked together by their first words.
 // On return, *pstart points at the first object and *pend at the last.
 int32
-MCentral_AllocList(MCentral *c, int32 n, void **pstart, void **pend)
+MCentral_AllocList(MCentral *c, int32 n, MLink **pfirst)
 {
-	void *start, *end, *v;
+	MLink *first, *last, *v;
 	int32 i;
 
-	*pstart = nil;
-	*pend = nil;
 
 	lock(c);
-
 	// Replenish central list if empty.
 	if(MSpanList_IsEmpty(&c->nonempty)) {
 		if(!MCentral_Grow(c)) {
 			unlock(c);
+			*pfirst = nil;
 			return 0;
 		}
 	}
 
 	// Copy from list, up to n.
-	start = nil;
-	end = nil;
-	for(i=0; i<n; i++) {
-		v = MCentral_Alloc(c);
-		if(v == nil)
-			break;
-		if(start == nil)
-			start = v;
-		else
-			*(void**)end = v;
-		end = v;
+	// First one is guaranteed to work: the list is non-empty
+	// (we just grew it if it was empty).
+	first = MCentral_Alloc(c);
+	last = first;
+	for(i=1; i<n && (v = MCentral_Alloc(c)) != nil; i++) {
+		last->next = v;
+		last = v;
 	}
+	last->next = nil;
 	c->nfree -= i;
 
 	unlock(c);
-	*pstart = start;
-	*pend = end;
+	*pfirst = first;
 	return i;
 }
 
@@ -79,18 +72,18 @@
 MCentral_Alloc(MCentral *c)
 {
 	MSpan *s;
-	void *v;
+	MLink *v;
 
 	if(MSpanList_IsEmpty(&c->nonempty))
 		return nil;
 	s = c->nonempty.next;
+	s->ref++;
 	v = s->freelist;
-	s->freelist = *(void**)v;
+	s->freelist = v->next;
 	if(s->freelist == nil) {
 		MSpanList_Remove(s);
 		MSpanList_Insert(&c->empty, s);
 	}
-	s->ref++;
 	return v;
 }
 
@@ -99,19 +92,18 @@
 // The objects are linked together by their first words.
 // On return, *pstart points at the first object and *pend at the last.
 void
-MCentral_FreeList(MCentral *c, int32 n, void *start, void *end)
+MCentral_FreeList(MCentral *c, int32 n, MLink *start)
 {
-	void *v, *next;
+	MLink *v, *next;
 
-	// Assume *(void**)end = nil marks end of list.
+	// Assume next == nil marks end of list.
 	// n and end would be useful if we implemented
 	// the transfer cache optimization in the TODO above.
 	USED(n);
-	USED(end);
 
 	lock(c);
 	for(v=start; v; v=next) {
-		next = *(void**)v;
+		next = v->next;
 		MCentral_Free(c, v);
 	}
 	unlock(c);
@@ -122,11 +114,12 @@
 MCentral_Free(MCentral *c, void *v)
 {
 	MSpan *s;
-	PageID p;
+	PageID page;
+	MLink *p, *next;
 
 	// Find span for v.
-	p = (uintptr)v >> PageShift;
-	s = MHeap_Lookup(&mheap, p);
+	page = (uintptr)v >> PageShift;
+	s = MHeap_Lookup(&mheap, page);
 	if(s == nil || s->ref == 0)
 		throw("invalid free");
 
@@ -137,13 +130,21 @@
 	}
 
 	// Add v back to s's free list.
-	*(void**)v = s->freelist;
-	s->freelist = v;
+	p = v;
+	p->next = s->freelist;
+	s->freelist = p;
 	c->nfree++;
 
 	// If s is completely freed, return it to the heap.
 	if(--s->ref == 0) {
 		MSpanList_Remove(s);
+		// Freed blocks are zeroed except for the link pointer.
+		// Zero the link pointers so that the page is all zero.
+		for(p=s->freelist; p; p=next) {
+			next = p->next;
+			p->next = nil;
+		}
+		s->freelist = nil;
 		c->nfree -= (s->npages << PageShift) / class_to_size[c->sizeclass];
 		unlock(c);
 		MHeap_Free(&mheap, s);
@@ -157,7 +158,7 @@
 MCentral_Grow(MCentral *c)
 {
 	int32 n, npages, size;
-	void **tail;
+	MLink **tailp, *v;
 	byte *p, *end;
 	MSpan *s;
 
@@ -171,17 +172,18 @@
 	}
 
 	// Carve span into sequence of blocks.
-	tail = &s->freelist;
+	tailp = &s->freelist;
 	p = (byte*)(s->start << PageShift);
 	end = p + (npages << PageShift);
 	size = class_to_size[c->sizeclass];
 	n = 0;
 	for(; p + size <= end; p += size) {
-		*tail = p;
-		tail = (void**)p;
+		v = (MLink*)p;
+		*tailp = v;
+		tailp = &v->next;
 		n++;
 	}
-	*tail = nil;
+	*tailp = nil;
 
 	lock(c);
 	c->nfree += n;
diff --git a/src/runtime/mem.c b/src/runtime/mem.c
index 0db941e..8e7a472 100644
--- a/src/runtime/mem.c
+++ b/src/runtime/mem.c
@@ -23,17 +23,6 @@
 	MAP_ANON	= 0x1000,	// not on Linux - TODO(rsc)
 };
 
-void*
-stackalloc(uint32 n)
-{
-	return mal(n);
-}
-
-void
-stackfree(void*)
-{
-}
-
 // Convenient wrapper around mmap.
 static void*
 brk(uint32 n)
@@ -51,7 +40,7 @@
 // right here?"  The answer is yes unless we're in the middle of
 // editing the malloc state in m->mem.
 void*
-mal(uint32 n)
+oldmal(uint32 n)
 {
 	byte* v;
 
diff --git a/src/runtime/mheap.c b/src/runtime/mheap.c
index 9c6de16..427d110 100644
--- a/src/runtime/mheap.c
+++ b/src/runtime/mheap.c
@@ -98,9 +98,20 @@
 	// No matter what, cache span info, because gc needs to be
 	// able to map interior pointer to containing span.
 	s->sizeclass = sizeclass;
-	for(n=0; n<npage; n++) {
+	for(n=0; n<npage; n++)
 		MHeapMap_Set(&h->map, s->start+n, s);
-		if(sizeclass != 0)
+	if(sizeclass == 0) {
+		uintptr tmp;
+
+		// If there are entries for this span, invalidate them,
+		// but don't blow out cache entries about other spans.
+		for(n=0; n<npage; n++)
+			if(MHeapMapCache_GET(&h->mapcache, s->start+n, tmp) != 0)
+				MHeapMapCache_SET(&h->mapcache, s->start+n, 0);
+	} else {
+		// Save cache entries for this span.
+		// If there's a size class, there aren't that many pages.
+		for(n=0; n<npage; n++)
 			MHeapMapCache_SET(&h->mapcache, s->start+n, sizeclass);
 	}
 
@@ -168,6 +179,8 @@
 		return false;
 	}
 
+	// Create a fake "in use" span and free it, so that the
+	// right coalescing happens.
 	s = FixAlloc_Alloc(&h->spanalloc);
 	MSpan_Init(s, (uintptr)v>>PageShift, ask>>PageShift);
 	MHeapMap_Set(&h->map, s->start, s);
@@ -198,8 +211,10 @@
 {
 	MSpan *t;
 
-	if(s->state != MSpanInUse || s->ref != 0)
+	if(s->state != MSpanInUse || s->ref != 0) {
+		printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d\n", s, s->start<<PageShift, s->state, s->ref);
 		throw("MHeap_FreeLocked - invalid free");
+	}
 	s->state = MSpanFree;
 	MSpanList_Remove(s);
 
diff --git a/src/runtime/proc.c b/src/runtime/proc.c
index 2d9ce77..0158156 100644
--- a/src/runtime/proc.c
+++ b/src/runtime/proc.c
@@ -97,6 +97,10 @@
 	byte *p;
 
 	mallocinit();
+
+	// Allocate internal symbol table representation now,
+	// so that we don't need to call malloc when we crash.
+	findfunc(0);
 
 	sched.gomaxprocs = 1;
 	p = getenv("GOMAXPROCS");
@@ -440,7 +444,7 @@
 			notewakeup(&m->havenextg);
 		}else{
 			m = mal(sizeof(M));
-			m->g0 = malg(1024);
+			m->g0 = malg(8192);
 			m->nextg = g;
 			m->id = sched.mcount++;
 			if(debug) {
diff --git a/src/runtime/rt0_amd64.s b/src/runtime/rt0_amd64.s
index 61a768f..61f9255 100644
--- a/src/runtime/rt0_amd64.s
+++ b/src/runtime/rt0_amd64.s
@@ -22,7 +22,7 @@
 
 	// create istack out of the given (operating system) stack
 
-	LEAQ	(-1024+104)(SP), AX
+	LEAQ	(-8192+104)(SP), AX
 	MOVQ	AX, 0(R15)		// 0(R15) is stack limit (w 104b guard)
 	MOVQ	SP, 8(R15)		// 8(R15) is base
 
diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h
index a8d40f8..fc4e5ba 100644
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@@ -76,10 +76,6 @@
 	true	= 1,
 	false	= 0,
 };
-enum
-{
-	SmallFreeClasses = 168,	// number of small free lists in malloc
-};
 
 /*
  * structures
@@ -158,6 +154,7 @@
 	int32	siz1;
 	int32	siz2;
 	int32	id;
+	int32	mallocing;
 	Note	havenextg;
 	G*	nextg;
 	M*	schedlink;