runtime: convert markallocated from C to Go
benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%
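
For reference, a minimal standalone sketch (not part of the patch; it
assumes a 64-bit platform, so ptrSize = 8) of the off/xbits/shift
arithmetic the Go code uses to locate an object's 4-bit nibble in the
heap bitmap, which sits just below arena_start and grows downward:

	package main

	import "fmt"

	const (
		ptrSize            = 8
		gcBits             = 4                    // bitmap bits per heap word
		wordsPerBitmapWord = ptrSize * 8 / gcBits // 16 heap words per bitmap word
	)

	// nibblePos mirrors the computation in malloc.go: it returns how
	// far below arena_start the bitmap word describing heap word x
	// lives, and the bit shift of x's nibble within that word.
	func nibblePos(arenaStart, x uintptr) (below, shift uintptr) {
		off := (x - arenaStart) / ptrSize // heap word index of x
		below = off/wordsPerBitmapWord*ptrSize + ptrSize
		shift = (off % wordsPerBitmapWord) * gcBits
		return
	}

	func main() {
		arena := uintptr(0x10000000)
		below, shift := nibblePos(arena, arena+13*ptrSize)
		fmt.Printf("bitmap word at arena_start-%d, shift %d\n", below, shift)
	}
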
Fixes #8452.
LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index e7f2388..73dc9f2 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -9,6 +9,8 @@
)
const (
+ debugMalloc = false
+
flagNoScan = 1 << 0 // GC doesn't have to scan object
flagNoProfiling = 1 << 1 // must not profile
flagNoZero = 1 << 2 // don't zero memory
@@ -29,6 +31,22 @@
pageShift = 13
pageSize = 1 << pageShift
pageMask = pageSize - 1
+
+ wordsPerBitmapWord = ptrSize * 8 / 4
+ gcBits = 4
+ bitsPerPointer = 2
+ bitsMask = 1<<bitsPerPointer - 1
+ pointersPerByte = 8 / bitsPerPointer
+ bitPtrMask = bitsMask << 2
+ maxGCMask = 0 // disabled because it wastes several bytes of memory
+ bitsDead = 0
+ bitsPointer = 2
+
+ bitMiddle = 0
+ bitBoundary = 1
+ bitAllocated = 2
+ bitMarked = 3
+ bitMask = bitMiddle | bitBoundary | bitAllocated | bitMarked
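+
+ // A heap word's nibble packs its allocation state (bitMiddle,
+ // bitBoundary, bitAllocated, bitMarked) in the low two bits and
+ // its pointer-type bits (bitsDead, bitsPointer) in the two bits
+ // above them, selected by bitPtrMask.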
)
// All zero-sized allocations return a pointer to this byte.
@@ -168,14 +186,112 @@
size = uintptr(s.elemsize)
}
- // TODO: write markallocated in Go
- mp.ptrarg[0] = x
- mp.scalararg[0] = uint(size)
- mp.scalararg[1] = uint(size0)
- mp.ptrarg[1] = unsafe.Pointer(typ)
- mp.scalararg[2] = uint(flags & flagNoScan)
- onM(&markallocated_m)
+ // From here until the marked label, the code marks the object as
+ // allocated and stores type info in the GC bitmap.
+ arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+ off := (uintptr(x) - arena_start) / ptrSize
+ xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+ shift := (off % wordsPerBitmapWord) * gcBits
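+ // The bitmap lives just below arena_start and grows down: off is
+ // x's heap word index, xbits points at the bitmap word holding its
+ // nibble, and shift is the nibble's bit offset within that word.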
+ if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
+ gothrow("bad bits in markallocated")
+ }
+ var ti, te uintptr
+ var ptrmask *uint8
+ if flags&flagNoScan != 0 {
+ // bitsDead in the first quadruple means don't scan.
+ if size == ptrSize {
+ *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+ } else {
+ xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+ *xbitsb = bitAllocated + (bitsDead << 2)
+ }
+ goto marked
+ }
+ if size == ptrSize {
+ // It's one word and it has pointers, so it must be a pointer.
+ *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+ goto marked
+ }
+ if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+ if typ.kind&kindGCProg != 0 {
+ nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+ masksize := nptr
+ if masksize%2 != 0 {
+ masksize *= 2 // repeated twice
+ }
+ masksize = masksize * pointersPerByte / 8 // 4 bits per word
+ masksize++ // unroll flag in the beginning
+ if masksize > maxGCMask && typ.gc[1] != 0 {
+ // If the mask is too large, unroll the program directly
+ // into the GC bitmap. It's 7 times slower than copying
+ // from the pre-unrolled mask, but saves mask memory equal
+ // to 1/16 of the type's size.
+ mp.ptrarg[0] = x
+ mp.ptrarg[1] = unsafe.Pointer(typ)
+ mp.scalararg[0] = uint(size)
+ mp.scalararg[1] = uint(size0)
+ onM(&unrollgcproginplace_m)
+ goto marked
+ }
+ ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+ // Check whether the program is already unrolled.
+ if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+ mp.ptrarg[0] = unsafe.Pointer(typ)
+ onM(&unrollgcprog_m)
+ }
+ ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+ } else {
+ ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+ }
+ if size == 2*ptrSize {
+ xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+ *xbitsb = *ptrmask | bitAllocated
+ goto marked
+ }
+ te = uintptr(typ.size) / ptrSize
+ // Each mask byte covers two words. If the type occupies an odd
+ // number of words, its mask is repeated twice, so te stays at one
+ // byte per word; otherwise halve it.
+ if te%2 == 0 {
+ te /= 2
+ }
+ }
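+ // No usable type mask: a two-word object gets the conservative
+ // "both words are pointers" byte directly.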
+ if size == 2*ptrSize {
+ xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+ *xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+ goto marked
+ }
+ // Copy pointer bitmask into the bitmap.
+ for i := uintptr(0); i < size0; i += 2 * ptrSize {
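+ // Default for untyped allocations: treat both words of the pair
+ // as pointers; overwritten from ptrmask when type info is present.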
+ v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+ if ptrmask != nil {
+ v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+ ti++
+ if ti == te {
+ ti = 0
+ }
+ }
+ if i == 0 {
+ v |= bitAllocated
+ }
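+ // The object ends halfway through this pair: clear the type
+ // bits of the trailing word, leaving them bitsDead.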
+ if i+ptrSize == size0 {
+ v &= ^uint8(bitPtrMask << 4)
+ }
+
+ off := (uintptr(x) + i - arena_start) / ptrSize
+ xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+ shift := (off % wordsPerBitmapWord) * gcBits
+ xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+ *xbitsb = v
+ }
+ if size0%(2*ptrSize) == 0 && size0 < size {
+ // Mark the word after the last object's word as bitsDead.
+ off := (uintptr(x) + size0 - arena_start) / ptrSize
+ xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+ shift := (off % wordsPerBitmapWord) * gcBits
+ xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+ *xbitsb = bitsDead << 2
+ }
+marked:
mp.mallocing = 0
if raceenabled {
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index 43feef7..4b16c55 100644
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@@ -517,7 +517,6 @@
int32 runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
void runtime·gc(int32 force);
uintptr runtime·sweepone(void);
-void runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
void runtime·unmarkspan(void *v, uintptr size);
void runtime·purgecachedstats(MCache*);
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 4637d68..8998a87 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -1741,7 +1741,7 @@
return res;
}
-// Recursively GC program in prog.
+// Recursively unrolls the GC program in prog.
// mask is where to store the result.
// ppos is a pointer to position in mask, in bits.
// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
@@ -1833,11 +1833,20 @@
return mask;
}
-static void
-unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
+void
+runtime·unrollgcproginplace_m(void)
{
- uintptr *b, off, shift, pos;
+ uintptr size, size0, *b, off, shift, pos;
byte *arena_start, *prog;
+ Type *typ;
+ void *v;
+
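+ // Runs on the g0 stack, called through onM; arguments arrive in
+ // the per-M scratch slots, and the pointer slots are cleared so
+ // their referents are not kept live.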
+ v = g->m->ptrarg[0];
+ typ = g->m->ptrarg[1];
+ size = g->m->scalararg[0];
+ size0 = g->m->scalararg[1];
+ g->m->ptrarg[0] = nil;
+ g->m->ptrarg[1] = nil;
pos = 0;
prog = (byte*)typ->gc[1];
@@ -1859,14 +1868,18 @@
}
// Unrolls GC program in typ->gc[1] into typ->gc[0]
-static void
-unrollgcprog(Type *typ)
+void
+runtime·unrollgcprog_m(void)
{
static Lock lock;
+ Type *typ;
byte *mask, *prog;
uintptr pos;
uint32 x;
+ typ = g->m->ptrarg[0];
+ g->m->ptrarg[0] = nil;
+
runtime·lock(&lock);
mask = (byte*)typ->gc[0];
if(mask[0] == 0) {
@@ -1887,110 +1900,6 @@
runtime·unlock(&lock);
}
-void
-runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
-{
- uintptr *b, off, shift, i, ti, te, nptr, masksize;
- byte *arena_start, x;
- bool *ptrmask;
-
- arena_start = runtime·mheap.arena_start;
- off = (uintptr*)v - (uintptr*)arena_start;
- b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
- shift = (off % wordsPerBitmapWord) * gcBits;
- if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
- runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
- runtime·throw("bad bits in markallocated");
- }
-
- if(!scan) {
- // BitsDead in the first quadruple means don't scan.
- if(size == PtrSize)
- *b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
- else
- ((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
- return;
- }
- if(size == PtrSize) {
- // It's one word and it has pointers, it must be a pointer.
- *b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated | (BitsPointer<<2))<<shift);
- return;
- }
- ti = te = 0;
- ptrmask = nil;
- if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
- if(typ->kind&KindGCProg) {
- nptr = ROUND(typ->size, PtrSize)/PtrSize;
- masksize = nptr;
- if(masksize%2)
- masksize *= 2; // repeated twice
- masksize = masksize*PointersPerByte/8; // 4 bits per word
- masksize++; // unroll flag in the beginning
- if(masksize > MaxGCMask && typ->gc[1] != 0) {
- // If the mask is too large, unroll the program directly
- // into the GC bitmap. It's 7 times slower than copying
- // from the pre-unrolled mask, but saves 1/16 of type size
- // memory for the mask.
- unrollgcproginplace(v, size, size0, typ);
- return;
- }
- ptrmask = (byte*)typ->gc[0];
- // check whether the program is already unrolled
- if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
- unrollgcprog(typ);
- ptrmask++; // skip the unroll flag byte
- } else
- ptrmask = (byte*)&typ->gc[0]; // embed mask
- if(size == 2*PtrSize) {
- ((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
- return;
- }
- te = typ->size/PtrSize;
- // if the type occupies odd number of words, its mask is repeated twice
- if((te%2) == 0)
- te /= 2;
- }
- if(size == 2*PtrSize) {
- ((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
- return;
- }
- // Copy pointer bitmask into the bitmap.
- for(i=0; i<size0; i+=2*PtrSize) {
- x = (BitsPointer<<2) | (BitsPointer<<6);
- if(ptrmask != nil) {
- x = ptrmask[ti++];
- if(ti == te)
- ti = 0;
- }
- off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
- b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
- shift = (off % wordsPerBitmapWord) * gcBits;
- if(i == 0)
- x |= bitAllocated;
- if(i+PtrSize == size0)
- x &= ~(bitPtrMask<<4);
- ((byte*)b)[shift/8] = x;
- }
- if(size0 == i && size0 < size) {
- // mark the word after last object's word as BitsDead
- off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
- b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
- shift = (off % wordsPerBitmapWord) * gcBits;
- ((byte*)b)[shift/8] = 0;
- }
-}
-
-void
-runtime·markallocated_m(void)
-{
- M *mp;
-
- mp = g->m;
- runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
- mp->ptrarg[0] = nil;
- mp->ptrarg[1] = nil;
-}
-
// mark the span of memory at v as having n blocks of the given size.
// if leftover is true, there is left over space at the end of the span.
void
diff --git a/src/pkg/runtime/stubs.go b/src/pkg/runtime/stubs.go
index 8a2fc8a9..fee18f0 100644
--- a/src/pkg/runtime/stubs.go
+++ b/src/pkg/runtime/stubs.go
@@ -65,7 +65,9 @@
mprofMalloc_m,
gc_m,
setFinalizer_m,
- markallocated_m mFunction
+ markallocated_m,
+ unrollgcprog_m,
+ unrollgcproginplace_m mFunction
)
// memclr clears n bytes starting at ptr.
@@ -87,7 +89,12 @@
concurrentSweep = true
)
-// in asm_*.s
+// Atomic operations to read/write a pointer.
+// in stubs.goc
+func goatomicloadp(p unsafe.Pointer) unsafe.Pointer // return *p
+func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer) // *p = v
+
+// in stubs.goc
// if *p == x { *p = y; return true } else { return false }, atomically
//go:noescape
func gocas(p *uint32, x uint32, y uint32) bool
diff --git a/src/pkg/runtime/stubs.goc b/src/pkg/runtime/stubs.goc
index c64e73d..42a4bf1 100644
--- a/src/pkg/runtime/stubs.goc
+++ b/src/pkg/runtime/stubs.goc
@@ -49,6 +49,16 @@
}
#pragma textflag NOSPLIT
+func goatomicloadp(p **byte) (v *byte) {
+ v = runtime·atomicloadp(p);
+}
+
+#pragma textflag NOSPLIT
+func goatomicstorep(p **byte, v *byte) {
+ runtime·atomicstorep(p, v);
+}
+
+#pragma textflag NOSPLIT
func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
ret = runtime·cas(p, x, y);
}