time: make timers heap 4-ary

This slightly improves performance when a lot of timers are present

$ misc/benchcmp ../old_timers_m.txt ../new_timers_m.txt
benchmark                           old ns/op    new ns/op    delta
BenchmarkAfterFunc                       6884         6605   -4.05%
BenchmarkAfterFunc-2                     4473         4144   -7.36%
BenchmarkAfterFunc-3                     8601         6185  -28.09%
BenchmarkAfterFunc-4                     9378         8773   -6.45%
BenchmarkAfter                           7237         7278   +0.57%
BenchmarkAfter-2                         4638         3923  -15.42%
BenchmarkAfter-3                         8751         6239  -28.71%
BenchmarkAfter-4                         9223         8737   -5.27%
BenchmarkStop                             603          496  -17.74%
BenchmarkStop-2                           795          577  -27.42%
BenchmarkStop-3                           982          680  -30.75%
BenchmarkStop-4                          1164          739  -36.51%
BenchmarkSimultaneousAfterFunc            657          593   -9.74%
BenchmarkSimultaneousAfterFunc-2          816          757   -7.23%
BenchmarkSimultaneousAfterFunc-3          844          830   -1.66%
BenchmarkSimultaneousAfterFunc-4          785          771   -1.78%
BenchmarkStartStop                        238          239   +0.42%
BenchmarkStartStop-2                      249          234   -6.02%
BenchmarkStartStop-3                      271          268   -1.11%
BenchmarkStartStop-4                      293          295   +0.68%

R=golang-dev, dvyukov, bradfitz, r
CC=golang-dev
https://golang.org/cl/13094043
diff --git a/src/pkg/time/sleep_test.go b/src/pkg/time/sleep_test.go
index 603adc9..d21b9cc 100644
--- a/src/pkg/time/sleep_test.go
+++ b/src/pkg/time/sleep_test.go
@@ -9,6 +9,7 @@
 	"fmt"
 	"runtime"
 	"sort"
+	"sync"
 	"sync/atomic"
 	"testing"
 	. "time"
@@ -68,33 +69,94 @@
 	atomic.StoreUint32(&stop, 1)
 }
 
-func BenchmarkAfterFunc(b *testing.B) {
-	i := b.N
-	c := make(chan bool)
-	var f func()
-	f = func() {
-		i--
-		if i >= 0 {
-			AfterFunc(0, f)
-		} else {
-			c <- true
-		}
+func benchmark(b *testing.B, bench func(n int)) {
+	garbage := make([]*Timer, 1<<17)
+	for i := 0; i < len(garbage); i++ {
+		garbage[i] = AfterFunc(Hour, nil)
 	}
 
-	AfterFunc(0, f)
-	<-c
+	const batch = 1000
+	P := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / batch)
+
+	b.ResetTimer()
+
+	var wg sync.WaitGroup
+	wg.Add(P)
+
+	for p := 0; p < P; p++ {
+		go func() {
+			for atomic.AddInt32(&N, -1) >= 0 {
+				bench(batch)
+			}
+			wg.Done()
+		}()
+	}
+
+	wg.Wait()
+
+	b.StopTimer()
+	for i := 0; i < len(garbage); i++ {
+		garbage[i].Stop()
+	}
+}
+
+func BenchmarkAfterFunc(b *testing.B) {
+	benchmark(b, func(n int) {
+		c := make(chan bool)
+		var f func()
+		f = func() {
+			n--
+			if n >= 0 {
+				AfterFunc(0, f)
+			} else {
+				c <- true
+			}
+		}
+
+		AfterFunc(0, f)
+		<-c
+	})
 }
 
 func BenchmarkAfter(b *testing.B) {
-	for i := 0; i < b.N; i++ {
-		<-After(1)
-	}
+	benchmark(b, func(n int) {
+		for i := 0; i < n; i++ {
+			<-After(1)
+		}
+	})
 }
 
 func BenchmarkStop(b *testing.B) {
-	for i := 0; i < b.N; i++ {
-		NewTimer(1 * Second).Stop()
-	}
+	benchmark(b, func(n int) {
+		for i := 0; i < n; i++ {
+			NewTimer(1 * Second).Stop()
+		}
+	})
+}
+
+func BenchmarkSimultaneousAfterFunc(b *testing.B) {
+	benchmark(b, func(n int) {
+		var wg sync.WaitGroup
+		wg.Add(n)
+		for i := 0; i < n; i++ {
+			AfterFunc(0, wg.Done)
+		}
+		wg.Wait()
+	})
+}
+
+func BenchmarkStartStop(b *testing.B) {
+	benchmark(b, func(n int) {
+		timers := make([]*Timer, n)
+		for i := 0; i < n; i++ {
+			timers[i] = AfterFunc(Hour, nil)
+		}
+
+		for i := 0; i < n; i++ {
+			timers[i].Stop()
+		}
+	})
 }
 
 func TestAfter(t *testing.T) {