cmd/internal/obj/arm64: enable some SIMD instructions

Enable the VBSL, VBIT, VCMTST, VUXTL, VUXTL2 and FMOVQ SIMD
instructions required by issue #40725. The FMOVQ
instruction is used to move a large constant to a Vn
register.

Add test cases.

Fixes #40725

Change-Id: I1cac1922a0a0165d698a4b73a41f7a5f0a0ad549
Reviewed-on: https://go-review.googlesource.com/c/go/+/249758
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index f0c716a..451ca74 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -145,6 +145,17 @@
 	VZIP2	V10.D2, V13.D2, V3.D2           // a379ca4e
 	VZIP1	V17.S2, V4.S2, V26.S2           // 9a38910e
 	VZIP2	V25.S2, V14.S2, V25.S2          // d979990e
+	VUXTL	V30.B8, V30.H8                  // dea7082f
+	VUXTL	V30.H4, V29.S4                  // dda7102f
+	VUXTL	V29.S2, V2.D2                   // a2a7202f
+	VUXTL2	V30.H8, V30.S4                  // dea7106f
+	VUXTL2	V29.S4, V2.D2                   // a2a7206f
+	VUXTL2	V30.B16, V2.H8                  // c2a7086f
+	VBIT	V21.B16, V25.B16, V4.B16        // 241fb56e
+	VBSL	V23.B16, V3.B16, V7.B16         // 671c776e
+	VCMTST	V2.B8, V29.B8, V2.B8            // a28f220e
+	VCMTST	V2.D2, V23.D2, V3.D2            // e38ee24e
+	VSUB	V2.B8, V30.B8, V30.B8           // de87222e
 	MOVD	(R2)(R6.SXTW), R4               // 44c866f8
 	MOVD	(R3)(R6), R5                    // MOVD	(R3)(R6*1), R5                  // 656866f8
 	MOVD	(R2)(R6), R4                    // MOVD	(R2)(R6*1), R4                  // 446866f8
@@ -186,6 +197,10 @@
 	FMOVS	$(0.96875), F3                  // 03f02d1e
 	FMOVD	$(28.0), F4                     // 0490671e
 
+// move a large constant to a Vd.
+	FMOVD	$0x8040201008040201, V20         // FMOVD	$-9205322385119247871, V20
+	FMOVQ	$0x8040201008040202, V29         // FMOVQ	$-9205322385119247870, V29
+
 	FMOVS	(R2)(R6), F4       // FMOVS (R2)(R6*1), F4    // 446866bc
 	FMOVS	(R2)(R6<<2), F4                               // 447866bc
 	FMOVD	(R2)(R6), F4       // FMOVD (R2)(R6*1), F4    // 446866fc
diff --git a/src/cmd/asm/internal/asm/testdata/arm64error.s b/src/cmd/asm/internal/asm/testdata/arm64error.s
index 9f37781..2a911b4 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64error.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64error.s
@@ -340,4 +340,9 @@
 	MRS	PMSWINC_EL0, R3                                  // ERROR "system register is not readable"
 	MRS	OSLAR_EL1, R3                                    // ERROR "system register is not readable"
 	VLD3R.P	24(R15), [V15.H4,V16.H4,V17.H4]                  // ERROR "invalid post-increment offset"
+	VBIT	V1.H4, V12.H4, V3.H4                             // ERROR "invalid arrangement"
+	VBSL	V1.D2, V12.D2, V3.D2                             // ERROR "invalid arrangement"
+	VUXTL	V30.D2, V30.H8                                   // ERROR "operand mismatch"
+	VUXTL2	V20.B8, V21.H8                                   // ERROR "operand mismatch"
+	VUXTL	V3.D2, V4.B8                                     // ERROR "operand mismatch"
 	RET
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 03e0278..ab065e0 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -874,6 +874,7 @@
 	AFLDPS
 	AFMOVD
 	AFMOVS
+	AFMOVQ
 	AFMULD
 	AFMULS
 	AFNEGD
@@ -987,9 +988,14 @@
 	AVUSHR
 	AVSHL
 	AVSRI
+	AVBSL
+	AVBIT
 	AVTBL
 	AVZIP1
 	AVZIP2
+	AVCMTST
+	AVUXTL
+	AVUXTL2
 	ALAST
 	AB  = obj.AJMP
 	ABL = obj.ACALL
diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go
index 65ecd00..8961f04 100644
--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -381,6 +381,7 @@
 	"FLDPS",
 	"FMOVD",
 	"FMOVS",
+	"FMOVQ",
 	"FMULD",
 	"FMULS",
 	"FNEGD",
@@ -494,8 +495,13 @@
 	"VUSHR",
 	"VSHL",
 	"VSRI",
+	"VBSL",
+	"VBIT",
 	"VTBL",
 	"VZIP1",
 	"VZIP2",
+	"VCMTST",
+	"VUXTL",
+	"VUXTL2",
 	"LAST",
 }
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 0b90e31..7ce18d0 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -393,6 +393,11 @@
 	{AMOVK, C_VCON, C_NONE, C_NONE, C_REG, 33, 4, 0, 0, 0},
 	{AMOVD, C_AACON, C_NONE, C_NONE, C_REG, 4, 4, REGFROM, 0, 0},
 
+	// Move a large constant to a Vn.
+	{AFMOVQ, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
+	{AFMOVD, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
+	{AFMOVS, C_LCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
+
 	/* jump operations */
 	{AB, C_NONE, C_NONE, C_NONE, C_SBRA, 5, 4, 0, 0, 0},
 	{ABL, C_NONE, C_NONE, C_NONE, C_SBRA, 5, 4, 0, 0, 0},
@@ -403,12 +408,14 @@
 	{obj.ARET, C_NONE, C_NONE, C_NONE, C_REG, 6, 4, 0, 0, 0},
 	{obj.ARET, C_NONE, C_NONE, C_NONE, C_ZOREG, 6, 4, 0, 0, 0},
 	{ABEQ, C_NONE, C_NONE, C_NONE, C_SBRA, 7, 4, 0, 0, 0},
-	{AADRP, C_SBRA, C_NONE, C_NONE, C_REG, 60, 4, 0, 0, 0},
-	{AADR, C_SBRA, C_NONE, C_NONE, C_REG, 61, 4, 0, 0, 0},
 	{ACBZ, C_REG, C_NONE, C_NONE, C_SBRA, 39, 4, 0, 0, 0},
 	{ATBZ, C_VCON, C_REG, C_NONE, C_SBRA, 40, 4, 0, 0, 0},
 	{AERET, C_NONE, C_NONE, C_NONE, C_NONE, 41, 4, 0, 0, 0},
 
+	// get a PC-relative address
+	{AADRP, C_SBRA, C_NONE, C_NONE, C_REG, 60, 4, 0, 0, 0},
+	{AADR, C_SBRA, C_NONE, C_NONE, C_REG, 61, 4, 0, 0, 0},
+
 	{ACLREX, C_NONE, C_NONE, C_NONE, C_VCON, 38, 4, 0, 0, 0},
 	{ACLREX, C_NONE, C_NONE, C_NONE, C_NONE, 38, 4, 0, 0, 0},
 	{ABFM, C_VCON, C_REG, C_VCON, C_REG, 42, 4, 0, 0, 0},
@@ -473,6 +480,7 @@
 	{AVTBL, C_ARNG, C_NONE, C_LIST, C_ARNG, 100, 4, 0, 0, 0},
 	{AVUSHR, C_VCON, C_ARNG, C_NONE, C_ARNG, 95, 4, 0, 0, 0},
 	{AVZIP1, C_ARNG, C_ARNG, C_NONE, C_ARNG, 72, 4, 0, 0, 0},
+	{AVUXTL, C_ARNG, C_NONE, C_NONE, C_ARNG, 102, 4, 0, 0, 0},
 
 	/* conditional operations */
 	{ACSEL, C_COND, C_REG, C_REG, C_REG, 18, 4, 0, 0, 0},
@@ -2657,7 +2665,7 @@
 		case AFCSELD:
 			oprangeset(AFCSELS, t)
 
-		case AFMOVS, AFMOVD:
+		case AFMOVS, AFMOVD, AFMOVQ:
 			break
 
 		case AFCVTZSD:
@@ -2740,6 +2748,9 @@
 			oprangeset(AVCMEQ, t)
 			oprangeset(AVORR, t)
 			oprangeset(AVEOR, t)
+			oprangeset(AVBSL, t)
+			oprangeset(AVBIT, t)
+			oprangeset(AVCMTST, t)
 
 		case AVADD:
 			oprangeset(AVSUB, t)
@@ -2787,6 +2798,9 @@
 		case AVZIP1:
 			oprangeset(AVZIP2, t)
 
+		case AVUXTL:
+			oprangeset(AVUXTL2, t)
+
 		case AVLD1R:
 			oprangeset(AVLD2, t)
 			oprangeset(AVLD2R, t)
@@ -4163,7 +4177,7 @@
 		rel.Add = 0
 		rel.Type = objabi.R_ARM64_GOTPCREL
 
-	case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor/vfmla/vfmls Vm.<T>, Vn.<T>, Vd.<T> */
+	case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor/vfmla/vfmls/vbit/vbsl/vcmtst/vsub Vm.<T>, Vn.<T>, Vd.<T> */
 		af := int((p.From.Reg >> 5) & 15)
 		af3 := int((p.Reg >> 5) & 15)
 		at := int((p.To.Reg >> 5) & 15)
@@ -4204,17 +4218,24 @@
 			c.ctxt.Diag("invalid arrangement: %v", p)
 		}
 
-		if (p.As == AVORR || p.As == AVAND || p.As == AVEOR) &&
-			(af != ARNG_16B && af != ARNG_8B) {
-			c.ctxt.Diag("invalid arrangement: %v", p)
-		} else if (p.As == AVFMLA || p.As == AVFMLS) &&
-			(af != ARNG_2D && af != ARNG_2S && af != ARNG_4S) {
-			c.ctxt.Diag("invalid arrangement: %v", p)
-		} else if p.As == AVORR {
-			size = 2
-		} else if p.As == AVAND || p.As == AVEOR {
+		switch p.As {
+		case AVORR, AVAND, AVEOR, AVBIT, AVBSL:
+			if af != ARNG_16B && af != ARNG_8B {
+				c.ctxt.Diag("invalid arrangement: %v", p)
+			}
+		case AVFMLA, AVFMLS:
+			if af != ARNG_2D && af != ARNG_2S && af != ARNG_4S {
+				c.ctxt.Diag("invalid arrangement: %v", p)
+			}
+		}
+		switch p.As {
+		case AVAND, AVEOR:
 			size = 0
-		} else if p.As == AVFMLA || p.As == AVFMLS {
+		case AVBSL:
+			size = 1
+		case AVORR, AVBIT:
+			size = 2
+		case AVFMLA, AVFMLS:
 			if af == ARNG_2D {
 				size = 1
 			} else {
@@ -5096,6 +5117,59 @@
 		o1 = q<<30 | 0xe<<24 | len<<13
 		o1 |= (uint32(rf&31) << 16) | uint32(offset&31)<<5 | uint32(rt&31)
 
+	case 101: // FMOVQ/FMOVD $vcon, Vd -> load from constant pool.
+		o1 = c.omovlit(p.As, p, &p.From, int(p.To.Reg))
+
+	case 102: // VUXTL{2} Vn.<Tb>, Vd.<Ta>
+		af := int((p.From.Reg >> 5) & 15)
+		at := int((p.To.Reg >> 5) & 15)
+		var Q, immh uint32
+		switch at {
+		case ARNG_8H:
+			if af == ARNG_8B {
+				immh = 1
+				Q = 0
+			} else if af == ARNG_16B {
+				immh = 1
+				Q = 1
+			} else {
+				c.ctxt.Diag("operand mismatch: %v\n", p)
+			}
+		case ARNG_4S:
+			if af == ARNG_4H {
+				immh = 2
+				Q = 0
+			} else if af == ARNG_8H {
+				immh = 2
+				Q = 1
+			} else {
+				c.ctxt.Diag("operand mismatch: %v\n", p)
+			}
+		case ARNG_2D:
+			if af == ARNG_2S {
+				immh = 4
+				Q = 0
+			} else if af == ARNG_4S {
+				immh = 4
+				Q = 1
+			} else {
+				c.ctxt.Diag("operand mismatch: %v\n", p)
+			}
+		default:
+			c.ctxt.Diag("operand mismatch: %v\n", p)
+		}
+
+		if p.As == AVUXTL && Q == 1 {
+			c.ctxt.Diag("operand mismatch: %v\n", p)
+		}
+		if p.As == AVUXTL2 && Q == 0 {
+			c.ctxt.Diag("operand mismatch: %v\n", p)
+		}
+
+		o1 = c.oprrr(p, p.As)
+		rf := int((p.From.Reg) & 31)
+		rt := int((p.To.Reg) & 31)
+		o1 |= Q<<30 | immh<<19 | uint32((rf&31)<<5) | uint32(rt&31)
 	}
 	out[0] = o1
 	out[1] = o2
@@ -5662,6 +5736,9 @@
 	case AVADD:
 		return 7<<25 | 1<<21 | 1<<15 | 1<<10
 
+	case AVSUB:
+		return 0x17<<25 | 1<<21 | 1<<15 | 1<<10
+
 	case AVADDP:
 		return 7<<25 | 1<<21 | 1<<15 | 15<<10
 
@@ -5724,6 +5801,18 @@
 
 	case AVLD2R, AVLD4R:
 		return 0xD<<24 | 3<<21
+
+	case AVBIT:
+		return 1<<29 | 0x75<<21 | 7<<10
+
+	case AVBSL:
+		return 1<<29 | 0x73<<21 | 7<<10
+
+	case AVCMTST:
+		return 0xE<<24 | 1<<21 | 0x23<<10
+
+	case AVUXTL, AVUXTL2:
+		return 0x5e<<23 | 0x29<<10
 	}
 
 	c.ctxt.Diag("%v: bad rrr %d %v", p, a, a)
@@ -6566,6 +6655,10 @@
 			fp = 1
 			w = 1 /* 64-bit SIMD/FP */
 
+		case AFMOVQ:
+			fp = 1
+			w = 2 /* 128-bit SIMD/FP */
+
 		case AMOVD:
 			if p.Pool.As == ADWORD {
 				w = 1 /* 64-bit */