runtime: faster entersyscall, exitsyscall
Uses atomic memory accesses to avoid the need to acquire
and release schedlock on fast paths.
benchmark old ns/op new ns/op delta
runtime_test.BenchmarkSyscall 73 31 -56.63%
runtime_test.BenchmarkSyscall-2 538 74 -86.23%
runtime_test.BenchmarkSyscall-3 508 103 -79.72%
runtime_test.BenchmarkSyscall-4 721 97 -86.52%
runtime_test.BenchmarkSyscallWork 920 873 -5.11%
runtime_test.BenchmarkSyscallWork-2 516 481 -6.78%
runtime_test.BenchmarkSyscallWork-3 550 343 -37.64%
runtime_test.BenchmarkSyscallWork-4 632 263 -58.39%
(Intel Core i7 L640 2.13 GHz-based Lenovo X201s)
Reduced a less artificial server benchmark
from 11.5r 12.0u 8.0s to 8.3r 9.1u 1.0s.
R=dvyukov, r, bradfitz, r, iant, iant
CC=golang-dev
https://golang.org/cl/4723042
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 46b41cd..3211108 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -73,3 +73,53 @@
<-c
}
}
+
+func BenchmarkSyscall(b *testing.B) { // Times back-to-back runtime.Entersyscall/Exitsyscall pairs issued concurrently from one goroutine per GOMAXPROCS.
+ const CallsPerSched = 1000 // Entersyscall/Exitsyscall pairs per claimed batch; one Gosched per batch
+ procs := runtime.GOMAXPROCS(-1) // an argument < 1 only queries the current setting
+ N := int32(b.N / CallsPerSched) // batches shared by all workers; integer division, so b.N < CallsPerSched yields zero batches
+ c := make(chan bool, procs) // completion signals; capacity procs so no worker blocks on its send
+ for p := 0; p < procs; p++ {
+ go func() {
+ for atomic.AddInt32(&N, -1) >= 0 { // atomically claim one batch; exit once the shared counter goes negative
+ runtime.Gosched()
+ for g := 0; g < CallsPerSched; g++ {
+ runtime.Entersyscall()
+ runtime.Exitsyscall()
+ }
+ }
+ c <- true // report this worker as done
+ }()
+ }
+ for p := 0; p < procs; p++ { // wait for every worker before returning
+ <-c
+ }
+}
+
+func BenchmarkSyscallWork(b *testing.B) { // Times runtime.Entersyscall/Exitsyscall pairs with LocalWork iterations of arithmetic between the two calls, one goroutine per GOMAXPROCS.
+ const CallsPerSched = 1000 // Entersyscall/Exitsyscall pairs per claimed batch; one Gosched per batch
+ const LocalWork = 100 // iterations of the multiply/divide loop executed between Entersyscall and Exitsyscall
+ procs := runtime.GOMAXPROCS(-1) // an argument < 1 only queries the current setting
+ N := int32(b.N / CallsPerSched) // batches shared by all workers; integer division, so b.N < CallsPerSched yields zero batches
+ c := make(chan bool, procs) // completion signals; capacity procs so no worker blocks on its send
+ for p := 0; p < procs; p++ {
+ go func() {
+ foo := 42 // per-goroutine scratch value for the local work
+ for atomic.AddInt32(&N, -1) >= 0 { // atomically claim one batch; exit once the shared counter goes negative
+ runtime.Gosched()
+ for g := 0; g < CallsPerSched; g++ {
+ runtime.Entersyscall()
+ for i := 0; i < LocalWork; i++ {
+ foo *= 2 // doubling then halving leaves foo at 42 after every iteration
+ foo /= 2
+ }
+ runtime.Exitsyscall()
+ }
+ }
+ c <- foo == 42 // the receiver discards the value; NOTE(review): presumably sent so foo counts as used and the work loop is kept — confirm
+ }()
+ }
+ for p := 0; p < procs; p++ { // wait for every worker before returning
+ <-c
+ }
+}