runtime: faster parallel GC
Use per-thread work buffers instead of global mutex-protected pool. This eliminates contention from parallel scan phase.

benchmark                             old ns/op    new ns/op    delta
garbage.BenchmarkTree2-8               97100768     71417553  -26.45%
garbage.BenchmarkTree2LastPause-8     970931485    714103692  -26.45%
garbage.BenchmarkTree2Pause-8         469127802    345029253  -26.45%
garbage.BenchmarkParser-8            2880950854   2715456901   -5.74%
garbage.BenchmarkParserLastPause-8    137047399    103336476  -24.60%
garbage.BenchmarkParserPause-8         80686028     58922680  -26.97%

R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index a79c22e..dd268fc 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -140,6 +140,7 @@
 static Workbuf* getfull(Workbuf*);
 static void	putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
+static void	gchelperstart(void);
 
 static struct {
 	uint64	full;  // lock-free list of full blocks
@@ -287,11 +288,12 @@
 {
 	PtrTarget ptrtarget[IntermediateBufferCapacity];
 	Obj obj[IntermediateBufferCapacity];
-	BufferList *next;
+	uint32 busy;
+	byte pad[CacheLineSize];
 };
-static BufferList *bufferList;
+#pragma dataflag 16  // no pointers
+static BufferList bufferList[MaxGcproc];
 
-static Lock lock;
 static Type *itabtype;
 
 static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
@@ -598,23 +600,11 @@
 
 	// Allocate ptrbuf
 	{
-		runtime·lock(&lock);
-
-		if(bufferList == nil) {
-			bufferList = runtime·SysAlloc(sizeof(*bufferList));
-			if(bufferList == nil)
-				runtime·throw("runtime: cannot allocate memory");
-			bufferList->next = nil;
-		}
-		scanbuffers = bufferList;
-		bufferList = bufferList->next;
-
+		scanbuffers = &bufferList[m->helpgc];
 		ptrbuf = &scanbuffers->ptrtarget[0];
 		ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
 		objbuf = &scanbuffers->obj[0];
 		objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj);
-
-		runtime·unlock(&lock);
 	}
 
 	ptrbufpos = ptrbuf;
@@ -1056,11 +1046,7 @@
 		nobj--;
 	}
 
-endscan:
-	runtime·lock(&lock);
-	scanbuffers->next = bufferList;
-	bufferList = scanbuffers;
-	runtime·unlock(&lock);
+endscan:;
 }
 
 // debug_scanblock is the debug copy of scanblock.
@@ -1688,6 +1674,8 @@
 void
 runtime·gchelper(void)
 {
+	gchelperstart();
+
 	// parallel mark for over gc roots
 	runtime·parfordo(work.markfor);
 
@@ -1701,6 +1689,7 @@
 	}
 
 	runtime·parfordo(work.sweepfor);
+	bufferList[m->helpgc].busy = 0;
 	if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
 		runtime·notewakeup(&work.alldone);
 }
@@ -1892,6 +1881,7 @@
 
 	t1 = runtime·nanotime();
 
+	gchelperstart();
 	runtime·parfordo(work.markfor);
 	scanblock(nil, nil, 0, true);
 
@@ -1903,6 +1893,7 @@
 	t2 = runtime·nanotime();
 
 	runtime·parfordo(work.sweepfor);
+	bufferList[m->helpgc].busy = 0;
 	t3 = runtime·nanotime();
 
 	if(work.nproc > 1)
@@ -2044,6 +2035,15 @@
 }
 
 static void
+gchelperstart(void)
+{
+	if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
+		runtime·throw("gchelperstart: bad m->helpgc");
+	if(runtime·xchg(&bufferList[m->helpgc].busy, 1))
+		runtime·throw("gchelperstart: already busy");
+}
+
+static void
 runfinq(void)
 {
 	Finalizer *f;
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index a6ef83b..8d05730 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -332,7 +332,7 @@
 		mp = mget();
 		if(mp == nil)
 			runtime·throw("runtime·gcprocs inconsistency");
-		mp->helpgc = 1;
+		mp->helpgc = n;
 		mp->mcache = runtime·allp[pos]->mcache;
 		pos++;
 		runtime·notewakeup(&mp->park);
@@ -386,7 +386,7 @@
 static void
 mhelpgc(void)
 {
-	m->helpgc = 1;
+	m->helpgc = -1;
 }
 
 void
@@ -485,7 +485,7 @@
 		m->mstartfn();
 
 	if(m->helpgc) {
-		m->helpgc = false;
+		m->helpgc = 0;
 		stopm();
 	} else if(m != &runtime·m0) {
 		acquirep(m->nextp);
@@ -794,8 +794,8 @@
 	runtime·notesleep(&m->park);
 	runtime·noteclear(&m->park);
 	if(m->helpgc) {
-		m->helpgc = 0;
 		runtime·gchelper();
+		m->helpgc = 0;
 		m->mcache = nil;
 		goto retry;
 	}