cmd/compile: memory clearing optimization for arm64

Use "STP (ZR, ZR), O(R)" instead of "MOVD ZR, O(R)" to implement memory clearing.
Also improve assembler supports to STP/LDP.
Results (A57@2GHzx8):

benchmark                   old ns/op     new ns/op     delta
BenchmarkClearFat8-8        1.00          1.00          +0.00%
BenchmarkClearFat12-8       1.01          1.01          +0.00%
BenchmarkClearFat16-8       1.01          1.01          +0.00%
BenchmarkClearFat24-8       1.52          1.52          +0.00%
BenchmarkClearFat32-8       3.00          2.02          -32.67%
BenchmarkClearFat40-8       3.50          2.52          -28.00%
BenchmarkClearFat48-8       3.50          3.03          -13.43%
BenchmarkClearFat56-8       4.00          3.50          -12.50%
BenchmarkClearFat64-8       4.25          4.00          -5.88%
BenchmarkClearFat128-8      8.01          8.01          +0.00%
BenchmarkClearFat256-8      16.1          16.0          -0.62%
BenchmarkClearFat512-8      32.1          32.0          -0.31%
BenchmarkClearFat1024-8     64.1          64.1          +0.00%

Change-Id: Ie5f5eac271ff685884775005825f206167a5c146
Reviewed-on: https://go-review.googlesource.com/55610
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 08dcf50..fb7cbc2 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -151,12 +151,13 @@
 
 func zeroARM64(w io.Writer) {
 	// ZR: always zero
-	// R16 (aka REGRT1): ptr to memory to be zeroed - 8
+	// R16 (aka REGRT1): ptr to memory to be zeroed
 	// On return, R16 points to the last zeroed dword.
 	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
-	for i := 0; i < 128; i++ {
-		fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
+	for i := 0; i < 63; i++ {
+		fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)")
 	}
+	fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)")
 	fmt.Fprintln(w, "\tRET")
 }