runtime: faster finalizers

Linux/amd64, 2 x Intel Xeon E5620, 8 HT cores, 2.40GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkFinalizer              420.00       261.00  -37.86%
BenchmarkFinalizer-2            985.00       201.00  -79.59%
BenchmarkFinalizer-4           1077.00       244.00  -77.34%
BenchmarkFinalizer-8           1155.00       180.00  -84.42%
BenchmarkFinalizer-16          1182.00       184.00  -84.43%

BenchmarkFinalizerRun          2128.00      1378.00  -35.24%
BenchmarkFinalizerRun-2        1655.00      1418.00  -14.32%
BenchmarkFinalizerRun-4        1634.00      1522.00   -6.85%
BenchmarkFinalizerRun-8        2213.00      1581.00  -28.56%
BenchmarkFinalizerRun-16       2424.00      1599.00  -34.03%

Darwin/amd64, Intel L9600, 2 cores, 2.13GHz
benchmark                    old ns/op    new ns/op    delta
BenchmarkChanCreation          1451.00       926.00  -36.18%
BenchmarkChanCreation-2        3124.00      1412.00  -54.80%
BenchmarkChanCreation-4        6121.00      2628.00  -57.07%

BenchmarkFinalizer              684.00       420.00  -38.60%
BenchmarkFinalizer-2          11195.00       398.00  -96.44%
BenchmarkFinalizer-4          15862.00       654.00  -95.88%

BenchmarkFinalizerRun          2025.00      1397.00  -31.01%
BenchmarkFinalizerRun-2        3920.00      1447.00  -63.09%
BenchmarkFinalizerRun-4        9471.00      1545.00  -83.69%

R=golang-dev, cw, rsc
CC=golang-dev
https://golang.org/cl/4963057
diff --git a/src/pkg/runtime/mfinal_test.go b/src/pkg/runtime/mfinal_test.go
new file mode 100644
index 0000000..de63271
--- /dev/null
+++ b/src/pkg/runtime/mfinal_test.go
@@ -0,0 +1,64 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"runtime"
+	"sync"
+	"sync/atomic"
+	"testing"
+)
+
+func fin(v *int) {
+}
+
+func BenchmarkFinalizer(b *testing.B) {
+	const CallsPerSched = 1000
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	var wg sync.WaitGroup
+	wg.Add(procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			var data [CallsPerSched]*int
+			for i := 0; i < CallsPerSched; i++ {
+				data[i] = new(int)
+			}
+			for atomic.AddInt32(&N, -1) >= 0 {
+				runtime.Gosched()
+				for i := 0; i < CallsPerSched; i++ {
+					runtime.SetFinalizer(data[i], fin)
+				}
+				for i := 0; i < CallsPerSched; i++ {
+					runtime.SetFinalizer(data[i], nil)
+				}
+			}
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+}
+
+func BenchmarkFinalizerRun(b *testing.B) {
+	const CallsPerSched = 1000
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	var wg sync.WaitGroup
+	wg.Add(procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			for atomic.AddInt32(&N, -1) >= 0 {
+				runtime.Gosched()
+				for i := 0; i < CallsPerSched; i++ {
+					v := new(int)
+					runtime.SetFinalizer(v, fin)
+				}
+				runtime.GC()
+			}
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+}