cmd/internal/gc: inline writeBarrierEnabled check before calling writebarrierptr
I believe the benchmarks that get slower are under register pressure,
and not making the call unconditionally makes the pressure worse,
and the register allocator doesn't do a great job. But part of the point
of this sequence is to get the write barriers out of the way so I can work
on the register allocator, so that's okay.
name old new delta
BenchmarkBinaryTree17 17.9s × (1.00,1.01) 18.0s × (0.99,1.01) ~
BenchmarkFannkuch11 4.43s × (1.00,1.00) 4.43s × (1.00,1.00) ~
BenchmarkFmtFprintfEmpty 110ns × (1.00,1.06) 114ns × (0.95,1.05) ~
BenchmarkFmtFprintfString 487ns × (0.99,1.00) 468ns × (0.99,1.01) -4.00%
BenchmarkFmtFprintfInt 450ns × (0.99,1.00) 433ns × (1.00,1.01) -3.88%
BenchmarkFmtFprintfIntInt 762ns × (1.00,1.00) 748ns × (0.99,1.01) -1.84%
BenchmarkFmtFprintfPrefixedInt 584ns × (0.99,1.01) 547ns × (0.99,1.01) -6.26%
BenchmarkFmtFprintfFloat 738ns × (1.00,1.00) 756ns × (1.00,1.01) +2.37%
BenchmarkFmtManyArgs 2.80µs × (1.00,1.01) 2.79µs × (1.00,1.01) ~
BenchmarkGobDecode 39.0ms × (0.99,1.00) 39.6ms × (0.99,1.00) +1.54%
BenchmarkGobEncode 37.8ms × (0.98,1.01) 37.6ms × (1.00,1.01) ~
BenchmarkGzip 661ms × (0.99,1.01) 663ms × (0.99,1.02) ~
BenchmarkGunzip 142ms × (1.00,1.00) 142ms × (1.00,1.00) ~
BenchmarkHTTPClientServer 132µs × (0.99,1.01) 132µs × (0.99,1.01) ~
BenchmarkJSONEncode 56.3ms × (0.99,1.01) 56.2ms × (0.99,1.01) ~
BenchmarkJSONDecode 138ms × (0.99,1.01) 138ms × (1.00,1.00) ~
BenchmarkMandelbrot200 6.01ms × (1.00,1.00) 6.03ms × (1.00,1.01) +0.23%
BenchmarkGoParse 10.2ms × (0.87,1.05) 9.8ms × (0.93,1.10) ~
BenchmarkRegexpMatchEasy0_32 208ns × (1.00,1.00) 207ns × (1.00,1.00) ~
BenchmarkRegexpMatchEasy0_1K 588ns × (1.00,1.00) 581ns × (1.00,1.01) -1.27%
BenchmarkRegexpMatchEasy1_32 182ns × (0.99,1.01) 185ns × (0.99,1.01) +1.65%
BenchmarkRegexpMatchEasy1_1K 986ns × (1.00,1.01) 975ns × (1.00,1.01) -1.17%
BenchmarkRegexpMatchMedium_32 323ns × (1.00,1.01) 328ns × (0.99,1.00) +1.55%
BenchmarkRegexpMatchMedium_1K 89.9µs × (1.00,1.00) 88.6µs × (1.00,1.01) -1.38%
BenchmarkRegexpMatchHard_32 4.72µs × (0.95,1.01) 4.69µs × (0.95,1.03) ~
BenchmarkRegexpMatchHard_1K 133µs × (1.00,1.01) 133µs × (1.00,1.01) ~
BenchmarkRevcomp 900ms × (1.00,1.05) 902ms × (0.99,1.05) ~
BenchmarkTemplate 168ms × (0.99,1.01) 174ms × (0.99,1.01) +3.30%
BenchmarkTimeParse 637ns × (1.00,1.00) 639ns × (1.00,1.00) +0.31%
BenchmarkTimeFormat 738ns × (1.00,1.00) 736ns × (1.00,1.01) ~
Change-Id: I03ce152852edec404538f6c20eb650fac82e2aa2
Reviewed-on: https://go-review.googlesource.com/9224
Reviewed-by: Austin Clements <austin@google.com>
diff --git a/src/cmd/internal/gc/builtin/runtime.go b/src/cmd/internal/gc/builtin/runtime.go
index ec769e1..179a4dd 100644
--- a/src/cmd/internal/gc/builtin/runtime.go
+++ b/src/cmd/internal/gc/builtin/runtime.go
@@ -108,7 +108,8 @@
func chansend1(chanType *byte, hchan chan<- any, elem *any)
func closechan(hchan any)
-// *byte is really *runtime.Type
+var writeBarrierEnabled bool
+
func writebarrierptr(dst *any, src any)
func writebarrierstring(dst *any, src any)
func writebarrierslice(dst *any, src any)
@@ -144,6 +145,7 @@
func writebarrierfat1110(dst *any, _ uintptr, src any)
func writebarrierfat1111(dst *any, _ uintptr, src any)
+// *byte is really *runtime.Type
func typedmemmove(typ *byte, dst *any, src *any)
func typedslicecopy(typ *byte, dst any, src any) int