cmd/compile: memory clearing optimization for arm64
Use "STP (ZR, ZR), O(R)" instead of "MOVD ZR, O(R)" to implement memory clearing.
Also improve assembler support for STP/LDP.
Results (A57@2GHzx8):
benchmark old ns/op new ns/op delta
BenchmarkClearFat8-8 1.00 1.00 +0.00%
BenchmarkClearFat12-8 1.01 1.01 +0.00%
BenchmarkClearFat16-8 1.01 1.01 +0.00%
BenchmarkClearFat24-8 1.52 1.52 +0.00%
BenchmarkClearFat32-8 3.00 2.02 -32.67%
BenchmarkClearFat40-8 3.50 2.52 -28.00%
BenchmarkClearFat48-8 3.50 3.03 -13.43%
BenchmarkClearFat56-8 4.00 3.50 -12.50%
BenchmarkClearFat64-8 4.25 4.00 -5.88%
BenchmarkClearFat128-8 8.01 8.01 +0.00%
BenchmarkClearFat256-8 16.1 16.0 -0.62%
BenchmarkClearFat512-8 32.1 32.0 -0.31%
BenchmarkClearFat1024-8 64.1 64.1 +0.00%
Change-Id: Ie5f5eac271ff685884775005825f206167a5c146
Reviewed-on: https://go-review.googlesource.com/55610
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 08dcf50..fb7cbc2 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -151,12 +151,13 @@
func zeroARM64(w io.Writer) {
// ZR: always zero
- // R16 (aka REGRT1): ptr to memory to be zeroed - 8
+ // R16 (aka REGRT1): ptr to memory to be zeroed
// On return, R16 points to the last zeroed dword.
fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
- for i := 0; i < 128; i++ {
- fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
+ for i := 0; i < 63; i++ {
+ fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)")
}
+ fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)")
fmt.Fprintln(w, "\tRET")
}