runtime: less aggressive per-thread stack segment caching
Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.

benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656   -1.35%
BenchmarkStackGrowth-2               333          328   -1.50%
BenchmarkStackGrowth-4               224          172  -23.21%
BenchmarkStackGrowth-8               124           91  -26.13%
BenchmarkStackGrowth-16               82           47  -41.94%
BenchmarkStackGrowth-32               73           40  -44.79%

BenchmarkStackGrowthDeep           97231        94391   -2.92%
BenchmarkStackGrowthDeep-2         47230        58562  +23.99%
BenchmarkStackGrowthDeep-4         24993        49356  +97.48%
BenchmarkStackGrowthDeep-8         15105        30072  +99.09%
BenchmarkStackGrowthDeep-16        10005        15623  +56.15%
BenchmarkStackGrowthDeep-32        12517        13069   +4.41%

TestStackMem#1,MB                  310          12       -96.13%
TestStackMem#2,MB                  296          14       -95.27%
TestStackMem#3,MB                  479          14       -97.08%

TestStackMem#1,sec                 3.22         2.26     -29.81%
TestStackMem#2,sec                 2.43         2.15     -11.52%
TestStackMem#3,sec                 2.50         2.38      -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 1d51c52..0bbf9fa 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -53,7 +53,7 @@
 	}
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
 	N := int32(b.N / CallsPerSched)
@@ -63,7 +63,7 @@
 			for atomic.AddInt32(&N, -1) >= 0 {
 				runtime.Gosched()
 				for g := 0; g < CallsPerSched; g++ {
-					stackGrowthRecursive(10)
+					stackGrowthRecursive(rec)
 				}
 			}
 			c <- true
@@ -74,6 +74,14 @@
 	}
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)