sync: scalable Pool
Introduce fixed-size P-local caches.
When local caches overflow/underflow a batch of items
is transferred to/from global mutex-protected cache.

benchmark                    old ns/op    new ns/op    delta
BenchmarkPool                    50554        22423  -55.65%
BenchmarkPool-4                 400359         5904  -98.53%
BenchmarkPool-16                403311         1598  -99.60%
BenchmarkPool-32                367310         1526  -99.58%

BenchmarkPoolOverlflow            5214         3633  -30.32%
BenchmarkPoolOverlflow-4         42663         9539  -77.64%
BenchmarkPoolOverlflow-8         46919        11385  -75.73%
BenchmarkPoolOverlflow-16        39454        13048  -66.93%

BenchmarkSprintfEmpty                    84           63  -25.68%
BenchmarkSprintfEmpty-2                 371           32  -91.13%
BenchmarkSprintfEmpty-4                 465           22  -95.25%
BenchmarkSprintfEmpty-8                 565           12  -97.77%
BenchmarkSprintfEmpty-16                498            5  -98.87%
BenchmarkSprintfEmpty-32                492            4  -99.04%

BenchmarkSprintfString                  259          229  -11.58%
BenchmarkSprintfString-2                574          144  -74.91%
BenchmarkSprintfString-4                651           77  -88.05%
BenchmarkSprintfString-8                868           47  -94.48%
BenchmarkSprintfString-16               825           33  -95.96%
BenchmarkSprintfString-32               825           30  -96.28%

BenchmarkSprintfInt                     213          188  -11.74%
BenchmarkSprintfInt-2                   448          138  -69.20%
BenchmarkSprintfInt-4                   624           52  -91.63%
BenchmarkSprintfInt-8                   691           31  -95.43%
BenchmarkSprintfInt-16                  724           18  -97.46%
BenchmarkSprintfInt-32                  718           16  -97.70%

BenchmarkSprintfIntInt                  311          282   -9.32%
BenchmarkSprintfIntInt-2                333          145  -56.46%
BenchmarkSprintfIntInt-4                642          110  -82.87%
BenchmarkSprintfIntInt-8                832           42  -94.90%
BenchmarkSprintfIntInt-16               817           24  -97.00%
BenchmarkSprintfIntInt-32               805           22  -97.17%

BenchmarkSprintfPrefixedInt             309          269  -12.94%
BenchmarkSprintfPrefixedInt-2           245          168  -31.43%
BenchmarkSprintfPrefixedInt-4           598           99  -83.36%
BenchmarkSprintfPrefixedInt-8           770           67  -91.23%
BenchmarkSprintfPrefixedInt-16          829           54  -93.49%
BenchmarkSprintfPrefixedInt-32          824           50  -93.83%

BenchmarkSprintfFloat                   418          398   -4.78%
BenchmarkSprintfFloat-2                 295          203  -31.19%
BenchmarkSprintfFloat-4                 585          128  -78.12%
BenchmarkSprintfFloat-8                 873           60  -93.13%
BenchmarkSprintfFloat-16                884           33  -96.24%
BenchmarkSprintfFloat-32                881           29  -96.62%

BenchmarkManyArgs                      1097         1069   -2.55%
BenchmarkManyArgs-2                     705          567  -19.57%
BenchmarkManyArgs-4                     792          319  -59.72%
BenchmarkManyArgs-8                     963          172  -82.14%
BenchmarkManyArgs-16                   1115          103  -90.76%
BenchmarkManyArgs-32                   1133           90  -92.03%

LGTM=rsc
R=golang-codereviews, bradfitz, minux.ma, gobot, rsc
CC=golang-codereviews
https://golang.org/cl/46010043
diff --git a/src/pkg/sync/pool_test.go b/src/pkg/sync/pool_test.go
index e4aeda4..3bf5131 100644
--- a/src/pkg/sync/pool_test.go
+++ b/src/pkg/sync/pool_test.go
@@ -11,7 +11,6 @@
 	"sync/atomic"
 	"testing"
 	"time"
-	"unsafe"
 )
 
 func TestPool(t *testing.T) {
@@ -125,28 +124,41 @@
 }
 
 func BenchmarkPool(b *testing.B) {
-	procs := runtime.GOMAXPROCS(-1)
-	var dec func() bool
-	if unsafe.Sizeof(b.N) == 8 {
-		n := int64(b.N)
-		dec = func() bool {
-			return atomic.AddInt64(&n, -1) >= 0
-		}
-	} else {
-		n := int32(b.N)
-		dec = func() bool {
-			return atomic.AddInt32(&n, -1) >= 0
-		}
-	}
 	var p Pool
 	var wg WaitGroup
-	for i := 0; i < procs; i++ {
+	n0 := uintptr(b.N)
+	n := n0
+	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			for dec() {
-				p.Put(1)
-				p.Get()
+			for atomic.AddUintptr(&n, ^uintptr(0)) < n0 {
+				for b := 0; b < 100; b++ {
+					p.Put(1)
+					p.Get()
+				}
+			}
+		}()
+	}
+	wg.Wait()
+}
+
+func BenchmarkPoolOverlflow(b *testing.B) {
+	var p Pool
+	var wg WaitGroup
+	n0 := uintptr(b.N)
+	n := n0
+	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for atomic.AddUintptr(&n, ^uintptr(0)) < n0 {
+				for b := 0; b < 100; b++ {
+					p.Put(1)
+				}
+				for b := 0; b < 100; b++ {
+					p.Get()
+				}
 			}
 		}()
 	}