runtime: faster entersyscall, exitsyscall

Uses atomic memory accesses to avoid the need to acquire
and release schedlock on fast paths.

benchmark                            old ns/op    new ns/op    delta
runtime_test.BenchmarkSyscall               73           31  -56.63%
runtime_test.BenchmarkSyscall-2            538           74  -86.23%
runtime_test.BenchmarkSyscall-3            508          103  -79.72%
runtime_test.BenchmarkSyscall-4            721           97  -86.52%
runtime_test.BenchmarkSyscallWork          920          873   -5.11%
runtime_test.BenchmarkSyscallWork-2        516          481   -6.78%
runtime_test.BenchmarkSyscallWork-3        550          343  -37.64%
runtime_test.BenchmarkSyscallWork-4        632          263  -58.39%

(Intel Core i7 L640 2.13 GHz-based Lenovo X201s)

Reduced a less artificial server benchmark
from 11.5r 12.0u 8.0s to 8.3r 9.1u 1.0s.

R=dvyukov, r, bradfitz, r, iant, iant
CC=golang-dev
https://golang.org/cl/4723042
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 46b41cd..3211108 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -73,3 +73,53 @@
 		<-c
 	}
 }
+
+func BenchmarkSyscall(b *testing.B) {
+	const CallsPerSched = 1000
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	c := make(chan bool, procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			for atomic.AddInt32(&N, -1) >= 0 {
+				runtime.Gosched()
+				for g := 0; g < CallsPerSched; g++ {
+					runtime.Entersyscall()
+					runtime.Exitsyscall()
+				}
+			}
+			c <- true
+		}()
+	}
+	for p := 0; p < procs; p++ {
+		<-c
+	}
+}
+
+func BenchmarkSyscallWork(b *testing.B) {
+	const CallsPerSched = 1000
+	const LocalWork = 100
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	c := make(chan bool, procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			foo := 42
+			for atomic.AddInt32(&N, -1) >= 0 {
+				runtime.Gosched()
+				for g := 0; g < CallsPerSched; g++ {
+					runtime.Entersyscall()
+					for i := 0; i < LocalWork; i++ {
+						foo *= 2
+						foo /= 2
+					}
+					runtime.Exitsyscall()
+				}
+			}
+			c <- foo == 42
+		}()
+	}
+	for p := 0; p < procs; p++ {
+		<-c
+	}
+}