cmd/internal/obj/arm64: optimize the instruction of moving long effective stack address

Currently, when the offset of "MOVD $offset(Rn), Rd" is a large positive
constant or a negative constant, the assembler will load this offset from
the constant pool.This patch gets rid of the constant pool by encoding the
offset into two ADD instructions if it's a large positive constant or one
SUB instruction if negative. For very large negative offset, it is rarely
used, here we don't optimize this case.

Optimized case 1: MOVD $-0x100000(R7), R0
Before: LDR 0x67670(constant pool), R27; ADD R27.UXTX, R0, R7
After: SUB $0x100000, R7, R0

Optimized case 2: MOVD $0x123468(R7), R0
Before: LDR 0x67670(constant pool), R27; ADD R27.UXTX, R0, R7
After: ADD $0x123000, R7, R27; ADD $0x000468, R27, R0

1. Binary size before/after.
binary                 size change
pkg/linux_arm64        +4KB
pkg/tool/linux_arm64   no change
go                     no change
gofmt                  no change

2. go1 benckmark.
name                      old time/op                new time/op                delta
pkg:test/bench/go1 goos:linux goarch:arm64
BinaryTree17-64           7335721401.800000ns +-40%  6264542009.800000ns +-14%    ~     (p=0.421 n=5+5)
Fannkuch11-64             3886551822.600000ns +- 0%  3875870590.200000ns +- 0%    ~     (p=0.151 n=5+5)
FmtFprintfEmpty-64                82.960000ns +- 1%          83.900000ns +- 2%  +1.13%  (p=0.048 n=5+5)
FmtFprintfString-64              149.200000ns +- 1%         148.000000ns +- 0%  -0.80%  (p=0.016 n=5+4)
FmtFprintfInt-64                 177.000000ns +- 0%         178.400000ns +- 2%    ~     (p=0.794 n=4+5)
FmtFprintfIntInt-64              240.200000ns +- 2%         239.400000ns +- 4%    ~     (p=0.302 n=5+5)
FmtFprintfPrefixedInt-64         300.400000ns +- 0%         299.200000ns +- 1%    ~     (p=0.119 n=5+5)
FmtFprintfFloat-64               360.000000ns +- 0%         361.600000ns +- 3%    ~     (p=0.349 n=4+5)
FmtManyArgs-64                  1064.400000ns +- 1%        1061.400000ns +- 0%    ~     (p=0.087 n=5+5)
GobDecode-64                12080404.400000ns +- 2%    11637601.000000ns +- 1%  -3.67%  (p=0.008 n=5+5)
GobEncode-64                 8474973.800000ns +- 2%     7977801.600000ns +- 2%  -5.87%  (p=0.008 n=5+5)
Gzip-64                    416501238.400000ns +- 0%   410463405.400000ns +- 0%  -1.45%  (p=0.008 n=5+5)
Gunzip-64                   58088415.200000ns +- 0%    58826209.600000ns +- 0%  +1.27%  (p=0.008 n=5+5)
HTTPClientServer-64           128660.200000ns +-23%      117840.800000ns +- 8%    ~     (p=0.222 n=5+5)
JSONEncode-64               17547746.800000ns +- 4%    17216180.000000ns +- 1%    ~     (p=0.222 n=5+5)
JSONDecode-64               80879896.000000ns +- 1%    80063737.200000ns +- 0%  -1.01%  (p=0.008 n=5+5)
Mandelbrot200-64             5484901.600000ns +- 0%     5483614.400000ns +- 0%    ~     (p=0.310 n=5+5)
GoParse-64                   6201166.800000ns +- 6%     6150920.600000ns +- 1%    ~     (p=0.548 n=5+5)
RegexpMatchEasy0_32-64           135.000000ns +- 0%         139.200000ns +- 7%    ~     (p=0.643 n=5+5)
RegexpMatchEasy0_1K-64           484.600000ns +- 2%         483.800000ns +- 2%    ~     (p=0.984 n=5+5)
RegexpMatchEasy1_32-64           128.000000ns +- 1%         124.600000ns +- 1%  -2.66%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-64           769.400000ns +- 2%         761.400000ns +- 1%    ~     (p=0.460 n=5+5)
RegexpMatchMedium_32-64           12.900000ns +- 0%          12.500000ns +- 0%  -3.10%  (p=0.008 n=5+5)
RegexpMatchMedium_1K-64        57879.200000ns +- 1%       56512.200000ns +- 0%  -2.36%  (p=0.008 n=5+5)
RegexpMatchHard_32-64           3091.600000ns +- 1%        3071.000000ns +- 0%  -0.67%  (p=0.048 n=5+5)
RegexpMatchHard_1K-64          92941.200000ns +- 1%       92794.000000ns +- 0%    ~     (p=1.000 n=5+5)
Revcomp-64                1695605187.000000ns +-54%  1821697637.400000ns +-47%    ~     (p=1.000 n=5+5)
Template-64                112839686.800000ns +- 1%   109964069.200000ns +- 3%    ~     (p=0.095 n=5+5)
TimeParse-64                     587.000000ns +- 0%         587.000000ns +- 0%    ~     (all equal)
TimeFormat-64                    586.000000ns +- 1%         584.200000ns +- 1%    ~     (p=0.659 n=5+5)
[Geo mean]                      81804.262218ns             80694.712973ns       -1.36%

name                      old speed                  new speed                  delta
pkg:test/bench/go1 goos:linux goarch:arm64
GobDecode-64                         63.6MB/s +- 2%             66.0MB/s +- 1%  +3.78%  (p=0.008 n=5+5)
GobEncode-64                         90.6MB/s +- 2%             96.2MB/s +- 2%  +6.23%  (p=0.008 n=5+5)
Gzip-64                              46.6MB/s +- 0%             47.3MB/s +- 0%  +1.47%  (p=0.008 n=5+5)
Gunzip-64                             334MB/s +- 0%              330MB/s +- 0%  -1.25%  (p=0.008 n=5+5)
JSONEncode-64                         111MB/s +- 4%              113MB/s +- 1%    ~     (p=0.222 n=5+5)
JSONDecode-64                        24.0MB/s +- 1%             24.2MB/s +- 0%  +1.02%  (p=0.008 n=5+5)
GoParse-64                           9.35MB/s +- 6%             9.42MB/s +- 1%    ~     (p=0.571 n=5+5)
RegexpMatchEasy0_32-64                237MB/s +- 0%              231MB/s +- 7%    ~     (p=0.690 n=5+5)
RegexpMatchEasy0_1K-64               2.11GB/s +- 2%             2.12GB/s +- 2%    ~     (p=1.000 n=5+5)
RegexpMatchEasy1_32-64                250MB/s +- 1%              257MB/s +- 1%  +2.63%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-64               1.33GB/s +- 2%             1.35GB/s +- 1%    ~     (p=0.548 n=5+5)
RegexpMatchMedium_32-64              77.6MB/s +- 0%             79.8MB/s +- 0%  +2.80%  (p=0.008 n=5+5)
RegexpMatchMedium_1K-64              17.7MB/s +- 1%             18.1MB/s +- 0%  +2.41%  (p=0.008 n=5+5)
RegexpMatchHard_32-64                10.4MB/s +- 1%             10.4MB/s +- 0%    ~     (p=0.056 n=5+5)
RegexpMatchHard_1K-64                11.0MB/s +- 1%             11.0MB/s +- 0%    ~     (p=0.984 n=5+5)
Revcomp-64                            188MB/s +-71%              155MB/s +-71%    ~     (p=1.000 n=5+5)
Template-64                          17.2MB/s +- 1%             17.7MB/s +- 3%    ~     (p=0.095 n=5+5)
[Geo mean]                            79.2MB/s                   79.3MB/s       +0.24%

Change-Id: I593ac3e7037afafc3605ad4b0cfb51d5dd88015d
Reviewed-on: https://go-review.googlesource.com/c/go/+/232438
Trust: Alberto Donizetti <alb.donizetti@gmail.com>
Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index e106ff2..acfb16b 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -340,8 +340,19 @@
 	MOVD	$0x1111ffff1111aaaa, R1       // MOVD	$1230045644216969898, R1    // a1aa8a922122a2f22122e2f2
 	MOVD	$0, R1                        // 010080d2
 	MOVD	$-1, R1                       // 01008092
-	MOVD	$0x210000, R0                 // MOVD	$2162688, R0                 // 2004a0d2
-	MOVD	$0xffffffffffffaaaa, R1       // MOVD	$-21846, R1                  // a1aa8a92
+	MOVD	$0x210000, R0                 // MOVD	$2162688, R0                // 2004a0d2
+	MOVD	$0xffffffffffffaaaa, R1       // MOVD	$-21846, R1                 // a1aa8a92
+
+	MOVD	$0x1002(RSP), R1              // MOVD	$4098(RSP), R1              // fb074091610b0091
+	MOVD	$0x1708(RSP), RSP             // MOVD	$5896(RSP), RSP             // fb0740917f231c91
+	MOVD	$0x2001(R7), R1               // MOVD	$8193(R7), R1               // fb08409161070091
+	MOVD	$0xffffff(R7), R1             // MOVD	$16777215(R7), R1           // fbfc7f9161ff3f91
+
+	MOVD	$-0x1(R7), R1                 // MOVD	$-1(R7), R1                 // e10400d1
+	MOVD	$-0x30(R7), R1                // MOVD	$-48(R7), R1                // e1c000d1
+	MOVD	$-0x708(R7), R1               // MOVD	$-1800(R7), R1              // e1201cd1
+	MOVD	$-0x2000(RSP), R1             // MOVD	$-8192(RSP), R1             // e10b40d1
+	MOVD	$-0x10000(RSP), RSP           // MOVD	$-65536(RSP), RSP           // ff4340d1
 
 //
 // CLS
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 2839da1..b3c9e9a 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -410,9 +410,10 @@
 	C_FCON     // floating-point constant
 	C_VCONADDR // 64-bit memory address
 
-	C_AACON // ADDCON offset in auto constant $a(FP)
-	C_LACON // 32-bit offset in auto constant $a(FP)
-	C_AECON // ADDCON offset in extern constant $e(SB)
+	C_AACON  // ADDCON offset in auto constant $a(FP)
+	C_AACON2 // 24-bit offset in auto constant $a(FP)
+	C_LACON  // 32-bit offset in auto constant $a(FP)
+	C_AECON  // ADDCON offset in extern constant $e(SB)
 
 	// TODO(aram): only one branch class should be enough
 	C_SBRA // for TYPE_BRANCH
diff --git a/src/cmd/internal/obj/arm64/anames7.go b/src/cmd/internal/obj/arm64/anames7.go
index e1703fc..96c9f78 100644
--- a/src/cmd/internal/obj/arm64/anames7.go
+++ b/src/cmd/internal/obj/arm64/anames7.go
@@ -36,6 +36,7 @@
 	"FCON",
 	"VCONADDR",
 	"AACON",
+	"AACON2",
 	"LACON",
 	"AECON",
 	"SBRA",
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index df4bbbb..fc2033d 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -391,7 +391,11 @@
 	{AMOVD, C_VCON, C_NONE, C_NONE, C_REG, 12, 16, 0, NOTUSETMP, 0},
 
 	{AMOVK, C_VCON, C_NONE, C_NONE, C_REG, 33, 4, 0, 0, 0},
-	{AMOVD, C_AACON, C_NONE, C_NONE, C_REG, 4, 4, REGFROM, 0, 0},
+	{AMOVD, C_AACON, C_NONE, C_NONE, C_RSP, 4, 4, REGFROM, 0, 0},
+	{AMOVD, C_AACON2, C_NONE, C_NONE, C_RSP, 4, 8, REGFROM, 0, 0},
+
+	/* load long effective stack address (load int32 offset and add) */
+	{AMOVD, C_LACON, C_NONE, C_NONE, C_RSP, 34, 8, REGSP, LFROM, 0},
 
 	// Move a large constant to a Vn.
 	{AFMOVQ, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
@@ -594,9 +598,6 @@
 	{AFMOVD, C_LAUTO, C_NONE, C_NONE, C_FREG, 31, 8, REGSP, LFROM, 0},
 	{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
 
-	/* load long effective stack address (load int32 offset and add) */
-	{AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 34, 8, REGSP, LFROM, 0},
-
 	/* pre/post-indexed load (unscaled, signed 9-bit offset) */
 	{AMOVD, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
 	{AMOVW, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
@@ -1361,6 +1362,10 @@
 	return v <= 0xFFF
 }
 
+func isaddcon2(v int64) bool {
+	return 0 <= v && v <= 0xFFFFFF
+}
+
 // isbitcon reports whether a constant can be encoded into a logical instruction.
 // bitcon has a binary form of repetition of a bit sequence of length 2, 4, 8, 16, 32, or 64,
 // which itself is a rotate (w.r.t. the length of the unit) of a sequence of ones.
@@ -1889,10 +1894,14 @@
 		default:
 			return C_GOK
 		}
-
-		if isaddcon(c.instoffset) {
+		cf := c.instoffset
+		if isaddcon(cf) || isaddcon(-cf) {
 			return C_AACON
 		}
+		if isaddcon2(cf) {
+			return C_AACON2
+		}
+
 		return C_LACON
 
 	case obj.TYPE_BRANCH:
@@ -2046,7 +2055,7 @@
 		return cmp(C_LCON, b)
 
 	case C_LACON:
-		if b == C_AACON {
+		if b == C_AACON || b == C_AACON2 {
 			return true
 		}
 
@@ -3062,11 +3071,10 @@
 		}
 		o1 |= (uint32(r&31) << 5) | uint32(rt&31)
 
-	case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R */
-		o1 = c.opirr(p, p.As)
-
+	case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R; mov $addcon2, R */
 		rt := int(p.To.Reg)
 		r := int(o.param)
+
 		if r == 0 {
 			r = REGZERO
 		} else if r == REGFROM {
@@ -3075,13 +3083,23 @@
 		if r == 0 {
 			r = REGSP
 		}
+
 		v := int32(c.regoff(&p.From))
-		if (v & 0xFFF000) != 0 {
-			v >>= 12
-			o1 |= 1 << 22 /* shift, by 12 */
+		var op int32
+		if v < 0 {
+			v = -v
+			op = int32(c.opirr(p, ASUB))
+		} else {
+			op = int32(c.opirr(p, AADD))
 		}
 
-		o1 |= ((uint32(v) & 0xFFF) << 10) | (uint32(r&31) << 5) | uint32(rt&31)
+		if int(o.size) == 8 {
+			o1 = c.oaddi(p, op, v&0xfff000, r, REGTMP)
+			o2 = c.oaddi(p, op, v&0x000fff, REGTMP, rt)
+			break
+		}
+
+		o1 = c.oaddi(p, op, v, r, rt)
 
 	case 5: /* b s; bl s */
 		o1 = c.opbra(p, p.As)