fmt: make benchmarks parallel
This seems to be the best target to benchmark sync.Pool changes.

This is resend of cl/49910043 which was LGTMed by
TBR=bradfitz

R=golang-codereviews
CC=golang-codereviews
https://golang.org/cl/50140045
diff --git a/src/pkg/fmt/fmt_test.go b/src/pkg/fmt/fmt_test.go
index 444297d..971e8a3 100644
--- a/src/pkg/fmt/fmt_test.go
+++ b/src/pkg/fmt/fmt_test.go
@@ -11,6 +11,7 @@
 	"math"
 	"runtime"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"
 	"unicode"
@@ -606,46 +607,66 @@
 }
 
 func BenchmarkSprintfEmpty(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("")
-	}
+	})
 }
 
 func BenchmarkSprintfString(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("%s", "hello")
-	}
+	})
 }
 
 func BenchmarkSprintfInt(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("%d", 5)
-	}
+	})
 }
 
 func BenchmarkSprintfIntInt(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("%d %d", 5, 6)
-	}
+	})
 }
 
 func BenchmarkSprintfPrefixedInt(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("This is some meaningless prefix text that needs to be scanned %d", 6)
-	}
+	})
 }
 
 func BenchmarkSprintfFloat(b *testing.B) {
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		Sprintf("%g", 5.23184)
-	}
+	})
 }
 
 func BenchmarkManyArgs(b *testing.B) {
-	var buf bytes.Buffer
-	for i := 0; i < b.N; i++ {
+	benchmarkSprintf(b, func(buf *bytes.Buffer) {
 		buf.Reset()
-		Fprintf(&buf, "%2d/%2d/%2d %d:%d:%d %s %s\n", 3, 4, 5, 11, 12, 13, "hello", "world")
+		Fprintf(buf, "%2d/%2d/%2d %d:%d:%d %s %s\n", 3, 4, 5, 11, 12, 13, "hello", "world")
+	})
+}
+
+func benchmarkSprintf(b *testing.B, f func(buf *bytes.Buffer)) {
+	const CallsPerSched = 1000
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	c := make(chan bool, procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			var buf bytes.Buffer
+			for atomic.AddInt32(&N, -1) >= 0 {
+				for g := 0; g < CallsPerSched; g++ {
+					f(&buf)
+				}
+			}
+			c <- true
+		}()
+	}
+	for p := 0; p < procs; p++ {
+		<-c
 	}
 }