runtime: less aggressive per-thread stack segment caching

Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.
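The shape of the change is a two-level cache: each thread keeps only a small, bounded free list of stack segments and spills the surplus into a shared pool, so threads that go idle or exit no longer hoard segments. Below is a minimal Go sketch of that idea; the names (globalCache, threadCache, perThreadMax) and the 4 KB segment size are illustrative assumptions, and the real change lives in the C runtime's per-M caches, not in Go code like this.

package main

import "sync"

// perThreadMax caps how many free segments one thread keeps to itself;
// the value is illustrative, not the runtime's.
const perThreadMax = 4

// globalCache is the shared overflow pool, serialized by a mutex.
type globalCache struct {
	mu   sync.Mutex
	free [][]byte
}

// threadCache keeps at most perThreadMax free segments locally and
// spills the rest into the shared pool, so a thread that goes idle or
// exits no longer pins a large pile of stack memory.
type threadCache struct {
	global *globalCache
	free   [][]byte
}

// alloc hands out a cached segment if possible and only falls back to a
// fresh allocation when both caches are empty.
func (t *threadCache) alloc() []byte {
	if n := len(t.free); n > 0 { // fast path: no lock
		seg := t.free[n-1]
		t.free = t.free[:n-1]
		return seg
	}
	t.global.mu.Lock()
	defer t.global.mu.Unlock()
	if n := len(t.global.free); n > 0 { // refill from the shared pool
		seg := t.global.free[n-1]
		t.global.free = t.global.free[:n-1]
		return seg
	}
	return make([]byte, 4096) // stand-in for asking the OS for a segment
}

// release returns a segment, keeping a few locally and pushing the
// overflow to the global cache.
func (t *threadCache) release(seg []byte) {
	if len(t.free) < perThreadMax {
		t.free = append(t.free, seg)
		return
	}
	t.global.mu.Lock()
	t.global.free = append(t.global.free, seg)
	t.global.mu.Unlock()
}

func main() {
	g := &globalCache{}
	tc := &threadCache{global: g}
	seg := tc.alloc()
	tc.release(seg)
}

The visible trade-off in the numbers below is that segment-churning workloads now reach the shared pool (and its lock) more often, which is presumably where the BenchmarkStackGrowthDeep regressions come from.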
benchmark                    old ns/op    new ns/op      delta
BenchmarkStackGrowth               665          656     -1.35%
BenchmarkStackGrowth-2             333          328     -1.50%
BenchmarkStackGrowth-4             224          172    -23.21%
BenchmarkStackGrowth-8             124           91    -26.13%
BenchmarkStackGrowth-16             82           47    -41.94%
BenchmarkStackGrowth-32             73           40    -44.79%
BenchmarkStackGrowthDeep         97231        94391     -2.92%
BenchmarkStackGrowthDeep-2       47230        58562    +23.99%
BenchmarkStackGrowthDeep-4       24993        49356    +97.48%
BenchmarkStackGrowthDeep-8       15105        30072    +99.09%
BenchmarkStackGrowthDeep-16      10005        15623    +56.15%
BenchmarkStackGrowthDeep-32      12517        13069     +4.41%
TestStackMem#1,MB                  310           12    -96.13%
TestStackMem#2,MB                  296           14    -95.27%
TestStackMem#3,MB                  479           14    -97.08%
TestStackMem#1,sec                3.22         2.26    -29.81%
TestStackMem#2,sec                2.43         2.15    -11.52%
TestStackMem#3,sec                2.50         2.38     -4.80%
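The MB rows are the runtime's StackSys statistic, i.e. stack memory obtained from the OS. A rough, self-contained way to watch that number for a stack-heavy workload is sketched below; the goroutine count and recursion depth are arbitrary, and this is not the actual TestStackMem.

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// burn grows the goroutine's stack by recursing with a large frame.
func burn(n int) {
	var pad [128]uint64
	if pad[0] == 0 && n > 0 {
		pad[0] = 1
		burn(n - 1)
	}
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 500; i++ { // many goroutines growing and shrinking stacks
		wg.Add(1)
		go func() {
			defer wg.Done()
			burn(256)
		}()
	}
	wg.Wait()

	// StackSys is the stack memory obtained from the OS; this is the
	// quantity the MB rows above report.
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	fmt.Printf("StackSys = %d MB\n", ms.StackSys>>20)
}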
R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 1d51c52..0bbf9fa 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -53,7 +53,7 @@
 	}
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
 	N := int32(b.N / CallsPerSched)
@@ -63,7 +63,7 @@
 			for atomic.AddInt32(&N, -1) >= 0 {
 				runtime.Gosched()
 				for g := 0; g < CallsPerSched; g++ {
-					stackGrowthRecursive(10)
+					stackGrowthRecursive(rec)
 				}
 			}
 			c <- true
@@ -74,6 +74,14 @@
 	}
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
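For context, both benchmark variants recurse through the stackGrowthRecursive helper that already lives in this file; its shape is roughly the sketch below (the exact pad size is an assumption). Each frame keeps a large local array alive, so rec=1024 forces the runtime to allocate and later free many stack segments, which is exactly the path the new cache serves, while rec=10 rarely leaves the first segment.

// Sketch of the stack-growing helper the benchmarks recurse through;
// the ~1KB pad per frame is what makes each call deepen the stack.
func stackGrowthRecursive(i int) {
	var pad [128]uint64
	if pad[0] == 0 && i > 1 {
		pad[0] = 1
		stackGrowthRecursive(i - 1)
	}
}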