runtime: faster parallel GC
Use per-thread work buffers instead of a single global mutex-protected pool. This eliminates lock contention during the parallel scan phase.
benchmark                               old ns/op     new ns/op      delta
garbage.BenchmarkTree2-8                 97100768      71417553    -26.45%
garbage.BenchmarkTree2LastPause-8       970931485     714103692    -26.45%
garbage.BenchmarkTree2Pause-8           469127802     345029253    -26.45%
garbage.BenchmarkParser-8              2880950854    2715456901     -5.74%
garbage.BenchmarkParserLastPause-8      137047399     103336476    -24.60%
garbage.BenchmarkParserPause-8           80686028      58922680    -26.97%
R=golang-dev, 0xe2.0x9a.0x9b, dave, adg, rsc, iant
CC=golang-dev
https://golang.org/cl/7816044
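
For illustration, the following is a minimal standalone sketch of the buffer scheme this CL introduces, written in portable C11 rather than the runtime's Plan 9 C. Each GC helper gets its own statically allocated, cache-line padded scan buffer selected by the helper's index, so the scan phase never takes a global lock; an atomic busy flag mirrors gchelperstart's double-use check. All names here (MaxHelpers, ScanBuf, acquire_scanbuf, release_scanbuf, BufCap) are illustrative, not the runtime's.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MaxHelpers     16   /* stand-in for MaxGcproc */
#define CacheLineSize  64
#define BufCap         64   /* stand-in for IntermediateBufferCapacity */

typedef struct ScanBuf ScanBuf;
struct ScanBuf {
	void        *ptrtarget[BufCap];  /* per-helper intermediate scan buffer */
	void        *obj[BufCap];
	atomic_uint  busy;               /* guards against two helpers sharing one slot */
	char         pad[CacheLineSize]; /* avoid false sharing between helpers */
};

/* One buffer per helper: no global free list, no lock. */
static ScanBuf scanbufs[MaxHelpers];

/* Called once by each helper at the start of the mark phase. */
static ScanBuf*
acquire_scanbuf(int helper)
{
	if(helper < 0 || helper >= MaxHelpers) {
		fprintf(stderr, "acquire_scanbuf: bad helper index\n");
		abort();
	}
	/* atomic exchange, analogous to runtime·xchg(&bufferList[m->helpgc].busy, 1) */
	if(atomic_exchange(&scanbufs[helper].busy, 1) != 0) {
		fprintf(stderr, "acquire_scanbuf: buffer already busy\n");
		abort();
	}
	return &scanbufs[helper];
}

/* Called by each helper after sweeping, like bufferList[m->helpgc].busy = 0. */
static void
release_scanbuf(ScanBuf *b)
{
	atomic_store(&b->busy, 0);
}

int
main(void)
{
	ScanBuf *b = acquire_scanbuf(0);
	/* ... scan work would fill b->ptrtarget / b->obj here ... */
	release_scanbuf(b);
	printf("helper 0 acquired and released its private scan buffer\n");
	return 0;
}

Note that the atomic exchange on the busy flag does not synchronize access to the buffer itself (each helper owns its slot exclusively); it is only a cheap sanity check that two helpers were never handed the same index.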
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index a79c22e..dd268fc 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -140,6 +140,7 @@
static Workbuf* getfull(Workbuf*);
static void putempty(Workbuf*);
static Workbuf* handoff(Workbuf*);
+static void gchelperstart(void);
static struct {
uint64 full; // lock-free list of full blocks
@@ -287,11 +288,12 @@
{
PtrTarget ptrtarget[IntermediateBufferCapacity];
Obj obj[IntermediateBufferCapacity];
- BufferList *next;
+ uint32 busy;
+ byte pad[CacheLineSize];
};
-static BufferList *bufferList;
+#pragma dataflag 16 // no pointers
+static BufferList bufferList[MaxGcproc];
-static Lock lock;
static Type *itabtype;
static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
@@ -598,23 +600,11 @@
// Allocate ptrbuf
{
- runtime·lock(&lock);
-
- if(bufferList == nil) {
- bufferList = runtime·SysAlloc(sizeof(*bufferList));
- if(bufferList == nil)
- runtime·throw("runtime: cannot allocate memory");
- bufferList->next = nil;
- }
- scanbuffers = bufferList;
- bufferList = bufferList->next;
-
+ scanbuffers = &bufferList[m->helpgc];
ptrbuf = &scanbuffers->ptrtarget[0];
ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
objbuf = &scanbuffers->obj[0];
objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj);
-
- runtime·unlock(&lock);
}
ptrbufpos = ptrbuf;
@@ -1056,11 +1046,7 @@
nobj--;
}
-endscan:
- runtime·lock(&lock);
- scanbuffers->next = bufferList;
- bufferList = scanbuffers;
- runtime·unlock(&lock);
+endscan:;
}
// debug_scanblock is the debug copy of scanblock.
@@ -1688,6 +1674,8 @@
void
runtime·gchelper(void)
{
+ gchelperstart();
+
// parallel mark over gc roots
runtime·parfordo(work.markfor);
@@ -1701,6 +1689,7 @@
}
runtime·parfordo(work.sweepfor);
+ bufferList[m->helpgc].busy = 0;
if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
runtime·notewakeup(&work.alldone);
}
@@ -1892,6 +1881,7 @@
t1 = runtime·nanotime();
+ gchelperstart();
runtime·parfordo(work.markfor);
scanblock(nil, nil, 0, true);
@@ -1903,6 +1893,7 @@
t2 = runtime·nanotime();
runtime·parfordo(work.sweepfor);
+ bufferList[m->helpgc].busy = 0;
t3 = runtime·nanotime();
if(work.nproc > 1)
@@ -2044,6 +2035,15 @@
}
static void
+gchelperstart(void)
+{
+ if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
+ runtime·throw("gchelperstart: bad m->helpgc");
+ if(runtime·xchg(&bufferList[m->helpgc].busy, 1))
+ runtime·throw("gchelperstart: already busy");
+}
+
+static void
runfinq(void)
{
Finalizer *f;
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index a6ef83b..8d05730 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -332,7 +332,7 @@
mp = mget();
if(mp == nil)
runtime·throw("runtime·gcprocs inconsistency");
- mp->helpgc = 1;
+ mp->helpgc = n;
mp->mcache = runtime·allp[pos]->mcache;
pos++;
runtime·notewakeup(&mp->park);
@@ -386,7 +386,7 @@
static void
mhelpgc(void)
{
- m->helpgc = 1;
+ m->helpgc = -1;
}
void
@@ -485,7 +485,7 @@
m->mstartfn();
if(m->helpgc) {
- m->helpgc = false;
+ m->helpgc = 0;
stopm();
} else if(m != &runtime·m0) {
acquirep(m->nextp);
@@ -794,8 +794,8 @@
runtime·notesleep(&m->park);
runtime·noteclear(&m->park);
if(m->helpgc) {
- m->helpgc = 0;
runtime·gchelper();
+ m->helpgc = 0;
m->mcache = nil;
goto retry;
}