sync: smooth out Pool behavior over GC with a victim cache
Currently, every Pool is cleared completely at the start of each GC.
This is a problem for heavy users of Pool because it causes an
allocation spike immediately after Pools are cleared, which impacts
both throughput and latency.
This CL fixes this by introducing a victim cache mechanism. Instead of
clearing Pools, the victim cache is dropped and the primary cache is
moved to the victim cache. As a result, in steady-state, there are
(roughly) no new allocations, but if Pool usage drops, objects will
still be collected within two GCs (as opposed to one).
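For illustration, here is a minimal sketch of the rotation
(simplified: the real poolCleanup in the diff below operates on
unsafe.Pointer per-P arrays, but the lifecycle is the same):

    package main

    // pool is a simplified stand-in for sync.Pool: local is the
    // primary cache and victim is the cache kept from the previous
    // GC cycle.
    type pool struct {
        local  []interface{}
        victim []interface{}
    }

    var allPools, oldPools []*pool

    // clearPools models what poolCleanup does at the start of a GC
    // (with the world stopped).
    func clearPools() {
        // Objects that sat in a victim cache for a whole cycle
        // without being reused are finally dropped here.
        for _, p := range oldPools {
            p.victim = nil
        }
        // The primary cache becomes the victim cache: Get can still
        // find these objects, so steady-state users avoid the
        // post-GC allocation spike.
        for _, p := range allPools {
            p.victim, p.local = p.local, nil
        }
        oldPools, allPools = allPools, nil
    }

    func main() { clearPools() }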
This victim cache approach also improves Pool's impact on GC dynamics.
The current approach causes all objects in Pools to be short lived.
However, if an application is in steady state and is just going to
repopulate its Pools, then these objects impact the live heap size *as
if* they were long lived. Since Pooled objects count as short-lived
when computing the GC trigger and goal, but act as long-lived objects
in the live heap, this causes GC to trigger too frequently. If Pooled
objects are a non-trivial portion of an application's heap, this
increases the CPU overhead of GC. The victim cache lets Pooled objects
affect the GC trigger and goal as long-lived objects.
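To make the trigger effect concrete, a back-of-the-envelope sketch
(assumes GOGC=100 and round numbers; the runtime's actual trigger
controller is more involved):

    package main

    import "fmt"

    // Illustrative arithmetic only: the heap goal is roughly
    // live*(1+GOGC/100). When Pools are cleared at each GC, pooled
    // objects don't count toward 'live', so the goal is computed
    // from application data alone even though the Pools are about
    // to be repopulated, and GC triggers more often than needed.
    func main() {
        const gogc = 100.0
        appLive := 100.0 // MB of long-lived application data
        pooled := 100.0  // MB held in Pools across the cycle

        clearedGoal := appLive * (1 + gogc/100)             // 200 MB
        retainedGoal := (appLive + pooled) * (1 + gogc/100) // 400 MB
        fmt.Printf("goal when Pools are cleared:  %.0f MB\n", clearedGoal)
        fmt.Printf("goal when Pools are retained: %.0f MB\n", retainedGoal)
    }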
This has no impact on Get/Put performance, but substantially reduces
the impact on the Pool user when a GC happens. PoolExpensiveNew
demonstrates this in the substantial reduction in the rate at which
the "New" function is called (a sketch of such a benchmark follows
the results below).
name old time/op new time/op delta
Pool-12 2.21ns ±36% 2.00ns ± 0% ~ (p=0.070 n=19+16)
PoolOverflow-12 587ns ± 1% 583ns ± 1% -0.77% (p=0.000 n=18+18)
PoolSTW-12 5.57µs ± 3% 4.52µs ± 4% -18.82% (p=0.000 n=20+19)
PoolExpensiveNew-12 3.69ms ± 7% 1.25ms ± 5% -66.25% (p=0.000 n=20+19)
name old p50-ns/STW new p50-ns/STW delta
PoolSTW-12 5.48k ± 2% 4.53k ± 2% -17.32% (p=0.000 n=20+20)
name old p95-ns/STW new p95-ns/STW delta
PoolSTW-12 6.69k ± 4% 5.13k ± 3% -23.31% (p=0.000 n=19+18)
name old GCs/op new GCs/op delta
PoolExpensiveNew-12 0.39 ± 1% 0.32 ± 2% -17.95% (p=0.000 n=18+20)
name old New/op new New/op delta
PoolExpensiveNew-12 40.0 ± 6% 12.4 ± 6% -68.91% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20190311.2)
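For readers unfamiliar with that benchmark, a hypothetical sketch in
its spirit (not the actual pool_test.go source): New is made
deliberately slow, so every cache flush at GC shows up directly in
time/op and New/op.

    package sync_test

    import (
        "sync"
        "sync/atomic"
        "testing"
        "time"
    )

    // Hypothetical stand-in for PoolExpensiveNew: an expensive New
    // makes the post-GC refill cost visible. With the victim cache,
    // steady-state Gets hit the previous cycle's cache instead of
    // calling New after every GC.
    func BenchmarkPoolExpensiveNewSketch(b *testing.B) {
        var newCalls int64
        p := sync.Pool{
            New: func() interface{} {
                atomic.AddInt64(&newCalls, 1)
                time.Sleep(time.Millisecond) // simulate costly construction
                return new([4096]byte)
            },
        }
        b.RunParallel(func(pb *testing.PB) {
            for pb.Next() {
                p.Put(p.Get())
            }
        })
        b.ReportMetric(float64(atomic.LoadInt64(&newCalls))/float64(b.N), "New/op")
    }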
Fixes #22950.
Change-Id: If2e183d948c650417283076aacc20739682cdd70
Reviewed-on: https://go-review.googlesource.com/c/go/+/166961
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
diff --git a/src/sync/pool.go b/src/sync/pool.go
index c447cb7..f58fdd4 100644
--- a/src/sync/pool.go
+++ b/src/sync/pool.go
@@ -47,6 +47,9 @@
local unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal
localSize uintptr // size of the local array
+ victim unsafe.Pointer // local from previous cycle
+ victimSize uintptr // size of victims array
+
// New optionally specifies a function to generate
// a value when Get would otherwise return nil.
// It may not be changed concurrently with calls to Get.
@@ -150,14 +153,39 @@
func (p *Pool) getSlow(pid int) interface{} {
// See the comment in pin regarding ordering of the loads.
size := atomic.LoadUintptr(&p.localSize) // load-acquire
- local := p.local // load-consume
+ locals := p.local // load-consume
// Try to steal one element from other procs.
for i := 0; i < int(size); i++ {
- l := indexLocal(local, (pid+i+1)%int(size))
+ l := indexLocal(locals, (pid+i+1)%int(size))
if x, _ := l.shared.popTail(); x != nil {
return x
}
}
+
+ // Try the victim cache. We do this after attempting to steal
+ // from all primary caches because we want objects in the
+ // victim cache to age out if at all possible.
+ size = atomic.LoadUintptr(&p.victimSize)
+ if uintptr(pid) >= size {
+ return nil
+ }
+ locals = p.victim
+ l := indexLocal(locals, pid)
+ if x := l.private; x != nil {
+ l.private = nil
+ return x
+ }
+ for i := 0; i < int(size); i++ {
+ l := indexLocal(locals, (pid+i)%int(size))
+ if x, _ := l.shared.popTail(); x != nil {
+ return x
+ }
+ }
+
+ // Mark the victim cache as empty so future gets don't
+ // bother with it.
+ atomic.StoreUintptr(&p.victimSize, 0)
+
return nil
}
@@ -208,17 +236,37 @@
// Because the world is stopped, no pool user can be in a
// pinned section (in effect, this has all Ps pinned).
- for i, p := range allPools {
- allPools[i] = nil
+
+ // Drop victim caches from all pools.
+ for _, p := range oldPools {
+ p.victim = nil
+ p.victimSize = 0
+ }
+
+ // Move primary cache to victim cache.
+ for _, p := range allPools {
+ p.victim = p.local
+ p.victimSize = p.localSize
p.local = nil
p.localSize = 0
}
- allPools = []*Pool{}
+
+ // The pools with non-empty primary caches now have non-empty
+ // victim caches and no pools have primary caches.
+ oldPools, allPools = allPools, nil
}
var (
allPoolsMu Mutex
- allPools []*Pool
+
+ // allPools is the set of pools that have non-empty primary
+ // caches. Protected by either 1) allPoolsMu and pinning or 2)
+ // STW.
+ allPools []*Pool
+
+ // oldPools is the set of pools that may have non-empty victim
+ // caches. Protected by STW.
+ oldPools []*Pool
)
func init() {