cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 1bb709c..4015075 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -51,7 +51,7 @@
}
// ptrmask for an allocation containing a single pointer.
-var oneptr = [...]uint8{typePointer}
+var oneptrmask = [...]uint8{1}
//go:nowritebarrier
func markroot(desc *parfor, i uint32) {
@@ -98,9 +98,9 @@
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
if gcphase != _GCscan {
- scanblock(p, s.elemsize, nil, &gcw) // scanned during mark phase
+ scanobject(p, &gcw) // scanned during mark termination
}
- scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0], &gcw)
+ scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptrmask[0], &gcw)
}
}
@@ -383,7 +383,7 @@
throw("scanframe: bad symbol table")
}
bv := stackmapdata(stkmap, pcdata)
- size = (uintptr(bv.n) / typeBitsWidth) * ptrSize
+ size = uintptr(bv.n) * ptrSize
scanblock(frame.varp-size, size, bv.bytedata, gcw)
}
@@ -405,7 +405,7 @@
}
bv = stackmapdata(stkmap, pcdata)
}
- scanblock(frame.argp, uintptr(bv.n)/typeBitsWidth*ptrSize, bv.bytedata, gcw)
+ scanblock(frame.argp, uintptr(bv.n)*ptrSize, bv.bytedata, gcw)
}
}
@@ -447,7 +447,7 @@
// out of the wbuf passed in + a single object placed
// into an empty wbuf in scanobject so there could be
// a performance hit as we keep fetching fresh wbufs.
- scanobject(b, 0, nil, gcw)
+ scanobject(b, gcw)
// Flush background scan work credit to the global
// account if we've accumulated enough locally so
@@ -499,7 +499,7 @@
// No more work
break
}
- scanobject(b, 0, nil, gcw)
+ scanobject(b, gcw)
// Flush background scan work credit to the global
// account if we've accumulated enough locally so
@@ -534,12 +534,12 @@
if b == 0 {
return
}
- scanobject(b, 0, nil, gcw)
+ scanobject(b, gcw)
}
}
-// scanblock scans b as scanobject would.
-// If the gcphase is GCscan, scanblock performs additional checks.
+// scanblock scans b as scanobject would, but using an explicit
+// pointer bitmap instead of the heap bitmap.
//go:nowritebarrier
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
// Use local copies of original parameters, so that a stack trace
@@ -548,59 +548,69 @@
b := b0
n := n0
- // ptrmask can have 2 possible values:
- // 1. nil - obtain pointer mask from GC bitmap.
- // 2. pointer to a compact mask (for stacks and data).
+ arena_start := mheap_.arena_start
+ arena_used := mheap_.arena_used
+ scanWork := int64(0)
- scanobject(b, n, ptrmask, gcw)
- if gcphase == _GCscan {
- if inheap(b) && ptrmask == nil {
- // b is in heap, we are in GCscan so there should be a ptrmask.
- throw("scanblock: In GCscan phase and inheap is true.")
+ for i := uintptr(0); i < n; {
+ // Find bits for the next word.
+ bits := uint32(*addb(ptrmask, i/(ptrSize*8)))
+ if bits == 0 {
+ i += ptrSize * 8
+ continue
+ }
+ for j := 0; j < 8 && i < n; j++ {
+ if bits&1 != 0 {
+ // Same work as in scanobject; see comments there.
+ obj := *(*uintptr)(unsafe.Pointer(b + i))
+ scanWork++
+ if obj != 0 && arena_start <= obj && obj < arena_used {
+ if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && useCheckmark {
+ checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
+ }
+ if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
+ greyobject(obj, b, i, hbits, span, gcw)
+ }
+ }
+ }
+ bits >>= 1
+ i += ptrSize
}
}
+
+ gcw.bytesMarked += uint64(n)
+ gcw.scanWork += scanWork
}
-// scanobject scans memory starting at b, adding pointers to gcw.
-// If ptrmask != nil, it specifies the pointer mask starting at b and
-// n specifies the number of bytes to scan.
-// If ptrmask == nil, b must point to the beginning of a heap object
-// and scanobject consults the GC bitmap for the pointer mask and the
-// spans for the size of the object (it ignores n).
+// scanobject scans the object starting at b, adding pointers to gcw.
+// b must point to the beginning of a heap object; scanobject consults
+// the GC bitmap for the pointer mask and the spans for the size of the
+// object (it ignores n).
//go:nowritebarrier
-func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWork) {
+func scanobject(b uintptr, gcw *gcWork) {
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
scanWork := int64(0)
// Find bits of the beginning of the object.
- var hbits heapBits
-
- if ptrmask == nil {
- // b must point to the beginning of a heap object, so
- // we can get its bits and span directly.
- hbits = heapBitsForAddr(b)
- s := spanOfUnchecked(b)
- n = s.elemsize
- if n == 0 {
- throw("scanobject n == 0")
- }
+ // b must point to the beginning of a heap object, so
+ // we can get its bits and span directly.
+ hbits := heapBitsForAddr(b)
+ s := spanOfUnchecked(b)
+ n := s.elemsize
+ if n == 0 {
+ throw("scanobject n == 0")
}
+
for i := uintptr(0); i < n; i += ptrSize {
// Find bits for this word.
- var bits uintptr
- if ptrmask != nil {
- // dense mask (stack or data)
- bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * typeBitsWidth)) & typeMask
- } else {
- if i != 0 {
- // Avoid needless hbits.next() on last iteration.
- hbits = hbits.next()
- }
- bits = uintptr(hbits.typeBits())
- if bits == typeDead {
- break // no more pointers in this object
- }
+ if i != 0 {
+ // Avoid needless hbits.next() on last iteration.
+ hbits = hbits.next()
+ }
+ bits := uintptr(hbits.typeBits())
+ if bits == typeDead {
+ break // no more pointers in this object
}
if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
@@ -608,10 +618,13 @@
}
if bits&typePointer != typePointer {
- print("gc useCheckmark=", useCheckmark, " b=", hex(b), " ptrmask=", ptrmask, "\n")
+ print("gc useCheckmark=", useCheckmark, " b=", hex(b), "\n")
throw("unexpected garbage collection bits")
}
+ // Work here is duplicated in scanblock.
+ // If you make changes here, make changes there too.
+
obj := *(*uintptr)(unsafe.Pointer(b + i))
// Track the scan work performed as a way to estimate
@@ -626,17 +639,15 @@
// At this point we have extracted the next potential pointer.
// Check if it points into heap.
- if obj == 0 || obj < arena_start || obj >= arena_used {
- continue
- }
+ if obj != 0 && arena_start <= obj && obj < arena_used {
+ if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && useCheckmark {
+ checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
+ }
- if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && useCheckmark {
- checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
- }
-
- // Mark the object.
- if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
- greyobject(obj, b, i, hbits, span, gcw)
+ // Mark the object.
+ if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
+ greyobject(obj, b, i, hbits, span, gcw)
+ }
}
}
gcw.bytesMarked += uint64(n)