sync: smooth out Pool behavior over GC with a victim cache
Currently, every Pool is cleared completely at the start of each GC.
This is a problem for heavy users of Pool because it causes an
allocation spike immediately after Pools are cleared, which impacts
both throughput and latency.
This CL fixes this by introducing a victim cache mechanism. Instead of
clearing Pools, the victim cache is dropped and the primary cache is
moved to the victim cache. As a result, in steady-state, there are
(roughly) no new allocations, but if Pool usage drops, objects will
still be collected within two GCs (as opposed to one).
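For illustration, here is a minimal sketch of the rotation
(simplified: the real poolCleanup in the diff below operates on
unsafe.Pointer per-P arrays, but the lifecycle is the same):

    package main

    // pool is a simplified stand-in for sync.Pool: local is the
    // primary cache and victim is the cache kept from the previous
    // GC cycle.
    type pool struct {
        local  []interface{}
        victim []interface{}
    }

    var allPools, oldPools []*pool

    // clearPools models what poolCleanup does at the start of a GC
    // (with the world stopped).
    func clearPools() {
        // Objects that sat in a victim cache for a whole cycle
        // without being reused are finally dropped here.
        for _, p := range oldPools {
            p.victim = nil
        }
        // The primary cache becomes the victim cache: Get can still
        // find these objects, so steady-state users avoid the
        // post-GC allocation spike.
        for _, p := range allPools {
            p.victim, p.local = p.local, nil
        }
        oldPools, allPools = allPools, nil
    }

    func main() { clearPools() }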
This victim cache approach also improves Pool's impact on GC dynamics.
The current approach causes all objects in Pools to be short lived.
However, if an application is in steady state and is just going to
repopulate its Pools, then these objects impact the live heap size *as
if* they were long lived. Since Pooled objects count as short-lived
when computing the GC trigger and goal, but act as long-lived objects
in the live heap, this causes GC to trigger too frequently. If Pooled
objects are a non-trivial portion of an application's heap, this
increases the CPU overhead of GC. The victim cache lets Pooled objects
affect the GC trigger and goal as long-lived objects.
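To make the trigger effect concrete, a back-of-the-envelope sketch
(assumes GOGC=100 and round numbers; the runtime's actual trigger
controller is more involved):

    package main

    import "fmt"

    // Illustrative arithmetic only: the heap goal is roughly
    // live*(1+GOGC/100). When Pools are cleared at each GC, pooled
    // objects don't count toward 'live', so the goal is computed
    // from application data alone even though the Pools are about
    // to be repopulated, and GC triggers more often than needed.
    func main() {
        const gogc = 100.0
        appLive := 100.0 // MB of long-lived application data
        pooled := 100.0  // MB held in Pools across the cycle

        clearedGoal := appLive * (1 + gogc/100)             // 200 MB
        retainedGoal := (appLive + pooled) * (1 + gogc/100) // 400 MB
        fmt.Printf("goal when Pools are cleared:  %.0f MB\n", clearedGoal)
        fmt.Printf("goal when Pools are retained: %.0f MB\n", retainedGoal)
    }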
This has no impact on Get/Put performance, but substantially reduces
the impact on the Pool user when a GC happens. PoolExpensiveNew
demonstrates this in the substantial reduction in the rate at which
the "New" function is called (a sketch of such a benchmark follows
the results below).
name old time/op new time/op delta
Pool-12 2.21ns ±36% 2.00ns ± 0% ~ (p=0.070 n=19+16)
PoolOverflow-12 587ns ± 1% 583ns ± 1% -0.77% (p=0.000 n=18+18)
PoolSTW-12 5.57µs ± 3% 4.52µs ± 4% -18.82% (p=0.000 n=20+19)
PoolExpensiveNew-12 3.69ms ± 7% 1.25ms ± 5% -66.25% (p=0.000 n=20+19)
name old p50-ns/STW new p50-ns/STW delta
PoolSTW-12 5.48k ± 2% 4.53k ± 2% -17.32% (p=0.000 n=20+20)
name old p95-ns/STW new p95-ns/STW delta
PoolSTW-12 6.69k ± 4% 5.13k ± 3% -23.31% (p=0.000 n=19+18)
name old GCs/op new GCs/op delta
PoolExpensiveNew-12 0.39 ± 1% 0.32 ± 2% -17.95% (p=0.000 n=18+20)
name old New/op new New/op delta
PoolExpensiveNew-12 40.0 ± 6% 12.4 ± 6% -68.91% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20190311.2)
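For readers unfamiliar with that benchmark, a hypothetical sketch in
its spirit (not the actual pool_test.go source): New is made
deliberately slow, so every cache flush at GC shows up directly in
time/op and New/op.

    package sync_test

    import (
        "sync"
        "sync/atomic"
        "testing"
        "time"
    )

    // Hypothetical stand-in for PoolExpensiveNew: an expensive New
    // makes the post-GC refill cost visible. With the victim cache,
    // steady-state Gets hit the previous cycle's cache instead of
    // calling New after every GC.
    func BenchmarkPoolExpensiveNewSketch(b *testing.B) {
        var newCalls int64
        p := sync.Pool{
            New: func() interface{} {
                atomic.AddInt64(&newCalls, 1)
                time.Sleep(time.Millisecond) // simulate costly construction
                return new([4096]byte)
            },
        }
        b.RunParallel(func(pb *testing.PB) {
            for pb.Next() {
                p.Put(p.Get())
            }
        })
        b.ReportMetric(float64(atomic.LoadInt64(&newCalls))/float64(b.N), "New/op")
    }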
Fixes #22950.
Change-Id: If2e183d948c650417283076aacc20739682cdd70
Reviewed-on: https://go-review.googlesource.com/c/go/+/166961
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
diff --git a/src/sync/pool.go b/src/sync/pool.go
index c447cb7..f58fdd4 100644
--- a/src/sync/pool.go
+++ b/src/sync/pool.go
@@ -47,6 +47,9 @@
local unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal
localSize uintptr // size of the local array
+ victim unsafe.Pointer // local from previous cycle
+ victimSize uintptr // size of victims array
+
// New optionally specifies a function to generate
// a value when Get would otherwise return nil.
// It may not be changed concurrently with calls to Get.
@@ -150,14 +153,39 @@
func (p *Pool) getSlow(pid int) interface{} {
// See the comment in pin regarding ordering of the loads.
size := atomic.LoadUintptr(&p.localSize) // load-acquire
- local := p.local // load-consume
+ locals := p.local // load-consume
// Try to steal one element from other procs.
for i := 0; i < int(size); i++ {
- l := indexLocal(local, (pid+i+1)%int(size))
+ l := indexLocal(locals, (pid+i+1)%int(size))
if x, _ := l.shared.popTail(); x != nil {
return x
}
}
+
+ // Try the victim cache. We do this after attempting to steal
+ // from all primary caches because we want objects in the
+ // victim cache to age out if at all possible.
+ size = atomic.LoadUintptr(&p.victimSize)
+ if uintptr(pid) >= size {
+ return nil
+ }
+ locals = p.victim
+ l := indexLocal(locals, pid)
+ if x := l.private; x != nil {
+ l.private = nil
+ return x
+ }
+ for i := 0; i < int(size); i++ {
+ l := indexLocal(locals, (pid+i)%int(size))
+ if x, _ := l.shared.popTail(); x != nil {
+ return x
+ }
+ }
+
+ // Mark the victim cache as empty so future gets don't
+ // bother with it.
+ atomic.StoreUintptr(&p.victimSize, 0)
+
return nil
}
@@ -208,17 +236,37 @@
// Because the world is stopped, no pool user can be in a
// pinned section (in effect, this has all Ps pinned).
- for i, p := range allPools {
- allPools[i] = nil
+
+ // Drop victim caches from all pools.
+ for _, p := range oldPools {
+ p.victim = nil
+ p.victimSize = 0
+ }
+
+ // Move primary cache to victim cache.
+ for _, p := range allPools {
+ p.victim = p.local
+ p.victimSize = p.localSize
p.local = nil
p.localSize = 0
}
- allPools = []*Pool{}
+
+ // The pools with non-empty primary caches now have non-empty
+ // victim caches and no pools have primary caches.
+ oldPools, allPools = allPools, nil
}
var (
allPoolsMu Mutex
- allPools []*Pool
+
+ // allPools is the set of pools that have non-empty primary
+ // caches. Protected by either 1) allPoolsMu and pinning or 2)
+ // STW.
+ allPools []*Pool
+
+ // oldPools is the set of pools that may have non-empty victim
+ // caches. Protected by STW.
+ oldPools []*Pool
)
func init() {