src/cmd/internal/obj/arm64/inst.go - go.git - Git at Google

 // Copyright 2026 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package arm64

 import (
 	"cmd/internal/obj"
 	"fmt"
 	"iter"
 	"math"
 	"math/bits"
 )

 // instEncoder represents an instruction encoder: one candidate encoding of a
 // Go opcode. [instEncoder.tryEncode] starts from fixedBits and ORs in the bits
 // produced for each operand listed in args.
 type instEncoder struct {
 	goOp      obj.As    // Go opcode mnemonic
 	fixedBits uint32    // Known bits
 	args      []operand // Operands, in Go order
 }

 // varBits describes a bit field of the binary encoding together with the
 // value held for it.
 type varBits struct {
 	// The low and high bit index in the binary encoding, exclusive on hi,
 	// i.e. the field covers the half-open range [lo, hi).
 	lo, hi  int
 	encoded bool // If true then its value is already encoded
 	bits    uint32
 }

 // component is a component of a binary encoding.
 // e.g. for operand <Zda>.<T>, <T>'s encoding function might be described as:
 //
 //	For the "Byte and halfword" variant: is the size specifier,
 //	sz	<T>
 //	0	B
 //	1	H
 //	bit range mappings:
 //	sz: [22:23)
 //
 // Then sz is the component of the binary encoding.
 //
 // Elements that share a component must encode to the same bits; see
 // [instEncoder.tryEncode].
 type component uint16

 // elemEncoder encodes one element of an operand.
 type elemEncoder struct {
 	// fn maps the element value to its encoded bits and an ok signal.
 	// When ok is false and the returned value is non-zero, the value is treated
 	// by [instEncoder.tryEncode] as one of the code* sentinels selecting a
 	// special encoding routine; when ok is false and the value is zero the
 	// element does not match.
 	fn func(uint32) (uint32, bool)
 	// comp is the component of the binary encoding.
 	comp component
 }

 // operand is the operand type of an instruction.
 type operand struct {
 	class AClass // Operand class: register, constant, memory operation etc.
 	// The elements that this operand includes; only the encoding-related parts
 	// are listed. Each element is represented by its encoding function.
 	// The first returned value is the encoded binary, the second is the ok signal.
 	// The encoding functions return the ok signal for deduplication purposes.
 	// For example:
 	//	SDOT  <Zda>.<T>, <Zn>.<Tb>, <Zm>.<Tb>
 	//	SDOT  <Zda>.H, <Zn>.B, <Zm>.B
 	//	SDOT  <Zda>.S, <Zn>.H, <Zm>.H
 	//
 	// <T> and <Tb> are specified in the encoding text, and there is a constraint "T = 4*Tb".
 	// We cannot learn this fact from the encoding format alone; without this information
 	// the first encoding domain entails the other 2, and at the instruction matching phase we
 	// simply cannot deduplicate them. So we defer this deduplication to the encoding phase.
 	// We need the ok signal together with the [elemEncoder.comp] field to deduplicate them.
 	elemEncoders []elemEncoder
 }

 // opsInProg returns an iterator over the operands ([obj.Addr]) of p.
 //
 // Operands are produced in Go order: From, then every RestArgs entry, then To.
 // From and To are skipped when their Type is obj.TYPE_NONE. p.Reg is never
 // produced: for SVE, Reg is unused because registers so commonly carry
 // arrangements.
 func opsInProg(p *obj.Prog) iter.Seq[*obj.Addr] {
 	return func(yield func(*obj.Addr) bool) {
 		if p.From.Type != obj.TYPE_NONE && !yield(&p.From) {
 			return
 		}
 		for k := range p.RestArgs {
 			if rest := &p.RestArgs[k].Addr; !yield(rest) {
 				return
 			}
 		}
 		if p.To.Type != obj.TYPE_NONE {
 			// Last operand: nothing follows, so the result of yield is irrelevant.
 			yield(&p.To)
 		}
 	}
 }

 // aclass returns the AClass of an Addr.
 //
 // The class is chosen from a.Type first and then, for registers, from the
 // range a.Reg falls into (checked in a fixed order), for register lists from
 // a.Scale and bits [12:16) of a.Offset, and for memory operands from a.Index
 // and the top bit of a.Scale. It panics when no class matches.
 func aclass(a *obj.Addr) AClass {
 	switch a.Type {
 	case obj.TYPE_REG:
 		reg := a.Reg
 		switch {
 		case a.Offset&(int64(1)<<62) != 0:
 			// Bit 62 of Offset is the sentinel marking a packed AC_PREGSEL operand.
 			return AC_PREGSEL
 		case reg >= REG_Z0 && reg <= REG_Z31:
 			return AC_ZREG
 		case reg >= REG_P0 && reg <= REG_PN15:
 			return AC_PREG
 		case reg >= REG_ARNG && reg < REG_ELEM:
 			return AC_ARNG
 		case reg >= REG_ZARNG && reg < REG_ZARNGELEM:
 			return AC_ARNG
 		case reg >= REG_ZARNGELEM && reg < REG_PZELEM:
 			return AC_ARNGIDX
 		case reg >= REG_PZELEM && reg < REG_PARNGZM:
 			// Bit 5 distinguishes the P form (set) from the Z form (clear).
 			if reg&(1<<5) != 0 {
 				return AC_PREGIDX
 			}
 			return AC_ZREGIDX
 		case reg >= REG_PARNGZM && reg < REG_PARNGZM_END:
 			if pred := (reg >> 5) & 15; pred == PRED_M || pred == PRED_Z {
 				return AC_PREGZM
 			}
 			return AC_ARNG
 		case reg >= REG_V0 && reg <= REG_V31:
 			return AC_VREG
 		case reg >= REG_R0 && reg <= REG_R31 || reg == REG_RSP:
 			return AC_SPZGREG
 		}
 	case obj.TYPE_CONST, obj.TYPE_FCONST:
 		return AC_IMM
 	case obj.TYPE_REGLIST:
 		if a.Scale > 0 {
 			return AC_REGLIST_RANGE
 		}
 		// Bits [12:16) of Offset select the list length.
 		switch (a.Offset >> 12) & 0xf {
 		case 0x7:
 			return AC_REGLIST1
 		case 0xa:
 			return AC_REGLIST2
 		case 0x6:
 			return AC_REGLIST3
 		case 0x2:
 			return AC_REGLIST4
 		}
 	case obj.TYPE_MEM:
 		switch {
 		case a.Index != 0:
 			return AC_MEMEXT
 		case a.Scale&-32768 != 0:
 			return AC_MEMOFFMULVL
 		default:
 			return AC_MEMOFF
 		}
 	case obj.TYPE_SPECIAL:
 		return AC_SPECIAL
 	}
 	// Nothing matched: either an unsupported Type or a value outside every
 	// recognised range for that Type.
 	panic(fmt.Errorf("unknown AClass, addr = %v\n", a))
 }

 // addrComponent returns the binary (component) of the element stored in a at
 // position index, for an operand of class acl.
 //
 // For example, an operand of class AC_ARNG has 2 permissible components
 // (identified by index):
 //  0. register: <reg>
 //  1. arrangement: <T>
 //
 // They are stored in a.Reg as:
 //
 //	reg | (arrangement << 5)
 //
 // The layout of every supported class is described next to its case below.
 // The function panics on an index that the class does not define and on a
 // class it does not handle.
 func addrComponent(a *obj.Addr, acl AClass, index int) uint32 {
 	// badIndex builds the error used by every class for an undefined index.
 	badIndex := func() error {
 		return fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)
 	}

 	switch acl {
 	//	AClass: AC_PREGSEL
 	//	GNU mnemonic: <preg>.<T>[<selreg>, <imm>]
 	//	Go mnemonic:
 	//		[selreg, $idximm](preg.T)
 	//	Encoding:
 	//		Type = TYPE_REG
 	//		Offset = packed bits: preg (5 bits) | T (4 bits) | selreg (5 bits) | idximm (6 bits) | sentinel (bit 62)
 	case AC_PREGSEL:
 		packed := a.Offset
 		switch index {
 		case 0:
 			return uint32(packed & 31)
 		case 1:
 			return uint32((packed >> 5) & 15)
 		case 2:
 			// The encoding functions expect an ARM64 register number, so rebase on REG_R0.
 			return uint32((packed>>9)&31 + REG_R0)
 		case 3:
 			// Width check of selreg. It is always W in AC_PREGSEL and was already
 			// resolved by the generator, so a default value of 0 is returned.
 			return 0
 		case 4:
 			return uint32((packed >> 14) & 63)
 		}
 		panic(badIndex())

 	//	AClass: AC_ARNG, AC_PREG, AC_PREGZM, AC_ZREG
 	//	GNU mnemonic: <reg>.<T> Or <reg>/<T> (T is M or Z)
 	//	Go mnemonic:
 	//		reg.<T>
 	//	Encoding:
 	//		Type = TYPE_REG
 	//		Reg = reg | (arrangement or predication << 5)
 	case AC_ARNG, AC_PREG, AC_PREGZM, AC_ZREG:
 		if index == 0 {
 			return uint32(a.Reg & 31)
 		}
 		if index == 1 {
 			return uint32((a.Reg >> 5) & 15)
 		}
 		panic(badIndex())

 	//	AClass: AC_ARNGIDX, AC_PREGIDX, AC_ZREGIDX
 	//	GNU mnemonic: <reg>.<T>[<index>]
 	//	Go mnemonic:
 	//		reg.T[index]
 	//	Encoding:
 	//		Type = TYPE_REG
 	//		Reg = reg | (arrangement << 5)
 	//		Index = index
 	case AC_ARNGIDX, AC_PREGIDX, AC_ZREGIDX:
 		switch index {
 		case 0: // register
 			return uint32(a.Reg & 31)
 		case 1: // arrangement
 			return uint32((a.Reg >> 5) & 15)
 		case 2: // element index
 			return uint32(a.Index)
 		}
 		panic(badIndex())

 	//	AClass: AC_SPZGREG, AC_VREG
 	//	GNU mnemonic: <width><reg>
 	//	Go mnemonic:
 	//		reg (the width is already represented in the opcode)
 	//	Encoding:
 	//		Type = TYPE_REG
 	//		Reg = reg
 	case AC_SPZGREG, AC_VREG:
 		if index == 0 {
 			// Width checks only; they should all map to no-op checks.
 			return 0
 		}
 		if index == 1 {
 			return uint32(a.Reg)
 		}
 		panic(badIndex())

 	//	AClass: AC_IMM
 	//	GNU mnemonic: <imm>, <shift>
 	//	Go mnemonic:
 	//		$imm<<shift
 	//	Encoding:
 	//		Type = TYPE_CONST or TYPE_FCONST
 	//		Offset = imm (shift already applied)
 	case AC_IMM:
 		if index != 0 {
 			panic(badIndex())
 		}
 		if a.Type != obj.TYPE_FCONST {
 			return uint32(a.Offset)
 		}
 		// Floating-point constants are handed over as their float32 bit pattern.
 		f, isFloat := a.Val.(float64)
 		if !isFloat {
 			panic(fmt.Errorf("unknown float immediate value %v", a.Val))
 		}
 		return math.Float32bits(float32(f))

 	//	AClass: AC_REGLIST1, AC_REGLIST2, AC_REGLIST3, AC_REGLIST4, AC_REGLIST_RANGE
 	//	GNU mnemonic: {reg1.T, reg2.T, ...}
 	//	Go mnemonic:
 	//		[reg1.T, reg2.T, ...]
 	//	Encoding:
 	//		Type = TYPE_REGLIST
 	//		Offset = register prefix | register count | arrangement (opcode) | first register
 	//		Scale = range size - 1 (if REGLIST_RANGE)
 	case AC_REGLIST1, AC_REGLIST2, AC_REGLIST3, AC_REGLIST4, AC_REGLIST_RANGE:
 		// Largest index accepted for each list length.
 		var lastIndex int
 		switch acl {
 		case AC_REGLIST1:
 			lastIndex = 2
 		case AC_REGLIST2, AC_REGLIST_RANGE:
 			// A range behaves just like an AC_REGLIST2.
 			lastIndex = 4
 		case AC_REGLIST3:
 			lastIndex = 6
 		case AC_REGLIST4:
 			lastIndex = 8
 		}
 		if index > lastIndex {
 			panic(badIndex())
 		}

 		// Even indices are registers, odd indices are arrangements.
 		switch index % 2 {
 		case 0:
 			// Register numbers wrap around at 32, or at 16 when the prefix is 2.
 			modulus := 32
 			if (a.Offset>>32)&0b11 == 2 {
 				modulus = 16
 			}
 			// For register ranges in SVE we allow discontiguous registers,
 			// so a positive Scale is used as the step between registers.
 			step := 1
 			if a.Scale > 0 {
 				step = int(a.Scale)
 			}
 			first := int(a.Offset & 31)
 			return uint32((first + (index/2)*step) % modulus)
 		case 1:
 			qBits := (a.Offset >> 30) & 0b11
 			sizeBits := (a.Offset >> 10) & 0b11
 			switch qBits {
 			case 0:
 				switch sizeBits {
 				case 0:
 					return ARNG_8B
 				case 1:
 					return ARNG_4H
 				case 2:
 					return ARNG_2S
 				case 3:
 					return ARNG_1D
 				}
 				panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl))
 			case 1:
 				switch sizeBits {
 				case 0:
 					return ARNG_16B
 				case 1:
 					return ARNG_8H
 				case 2:
 					return ARNG_4S
 				case 3:
 					return ARNG_2D
 				}
 				panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl))
 			case 2:
 				switch sizeBits {
 				case 1:
 					return ARNG_B
 				case 2:
 					return ARNG_H
 				case 3:
 					return ARNG_S
 				}
 				panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl))
 			case 3:
 				switch sizeBits {
 				case 1:
 					return ARNG_D
 				case 2:
 					return ARNG_Q
 				}
 				panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl))
 			}
 			panic(fmt.Errorf("unknown Q value at %d in AClass %d", index, acl))
 		}
 		// Reached for a negative odd index, whose remainder is -1.
 		panic(badIndex())

 	//	AClass: AC_SPECIAL
 	//	GNU mnemonic: <special>
 	//	Go mnemonic:
 	//		special
 	//	Encoding:
 	//		Type = TYPE_SPECIAL
 	//		Offset = SpecialOperand enum value
 	case AC_SPECIAL:
 		if index == 0 {
 			return uint32(a.Offset)
 		}
 		panic(badIndex())

 	//	AClass: AC_MEMOFF, AC_MEMOFFMULVL
 	//	GNU mnemonic: [<reg>.<T>, #<imm>]
 	//	Go mnemonic:
 	//		imm(reg.T)
 	//	Encoding:
 	//		Type = TYPE_MEM
 	//		Reg = Base register (with arrangement if applicable)
 	//		Offset = Immediate offset
 	case AC_MEMOFF, AC_MEMOFFMULVL:
 		switch index {
 		case 0: // base register
 			return uint32(a.Reg & 31)
 		case 1: // arrangement of the base register
 			return uint32((a.Reg >> 5) & 15)
 		case 2: // immediate offset
 			return uint32(a.Offset)
 		}
 		panic(badIndex())

 	//	AClass: AC_MEMEXT
 	//	GNU mnemonic: [<reg1>.<T1>, <reg2>.<T2>, <mod> <amount>]
 	//	Go mnemonic:
 	//		(reg2.T2.mod<<amount)(reg1.T1)
 	//	Encoding:
 	//		Type = TYPE_MEM
 	//		Reg = Index register (with arrangement if applicable)
 	//		Index = Base register (with arrangement if applicable)
 	//		Scale = Packed mod and amount
 	case AC_MEMEXT:
 		switch index {
 		case 0:
 			return uint32(a.Index)
 		case 1:
 			return uint32((a.Index >> 5) & 15)
 		case 2:
 			return uint32(a.Reg)
 		case 3:
 			return uint32((a.Reg >> 5) & 15)
 		case 4:
 			// mod is either 1 (UXTW), 2 (SXTW), or 4 (LSL).
 			mod := uint32((a.Scale >> 9) & 0x7)
 			amount := uint32((a.Scale >> 12) & 0x7)
 			if mod != 0 || amount == 0 {
 				return mod
 			}
 			// LSL is implied when no extension is specified but amount > 0.
 			return 1 << 2
 		case 5:
 			return uint32((a.Scale >> 12) & 0x7)
 		}
 		panic(badIndex())
 	}
 	// TODO: handle more AClasses.
 	panic(fmt.Errorf("unknown AClass %d", acl))
 }

 // Sentinel values an [elemEncoder] function returns, together with ok == false,
 // to ask [instEncoder.tryEncode] to run the matching special encoding routine
 // instead of using the returned value as encoded bits.
 var (
 	codeI1Tsz                 uint32 = 0xffffffff
 	codeImm2Tsz               uint32 = 0xfffffffe
 	codeShift161919212223     uint32 = 0xfffffffd
 	codeShift161919212224     uint32 = 0xfffffffc
 	codeShift588102224        uint32 = 0xfffffffb
 	codeLogicalImmArrEncoding uint32 = 0xfffffffa
 	codeImm3Tsize1621         uint32 = 0xfffffff9
 	codeShiftI1TszhTszl       uint32 = 0xfffffff8
 	codeNoOp                  uint32 = 0xfffffff7 // the element contributes no bits
 )

 // encodeI1Tsz is the implementation of the following encoding logic:
 // Is the immediate index, in the range 0 to one less than the number of elements in 128 bits, encoded in "i1:tsz".
 // bit range mappings:
 // i1: [20:21)
 // tsz: [16:20)
 // Note:
 //
 //	arr is the arrangement.
 //	This encoding is aligned to the high bit of the box, according to the spec.
 //
 // Only the index bits are returned. The result is (0, false) when arr is not
 // one of B, H, S, D, Q or when v exceeds the largest index for arr.
 func encodeI1Tsz(v, arr uint32) (uint32, bool) {
 	// limit is the largest accepted index, shift the bit position it starts at.
 	var limit, shift uint32
 	switch arr {
 	case ARNG_B:
 		limit, shift = 15, 17
 	case ARNG_H:
 		limit, shift = 7, 18
 	case ARNG_S:
 		limit, shift = 3, 19
 	case ARNG_D:
 		limit, shift = 1, 20
 	case ARNG_Q:
 		// Only index 0 is accepted, and it contributes no bits.
 		limit, shift = 0, 0
 	default:
 		return 0, false
 	}
 	if v > limit {
 		return 0, false
 	}
 	return v << shift, true
 }

 // encodeImm2Tsz is the implementation of the following encoding logic:
 // Is the immediate index, in the range 0 to one less than the number of elements in 512 bits, encoded in "imm2:tsz".
 // bit range mappings:
 // imm2: [22:24)
 // tsz: [16:21)
 // Note:
 //
 //	arr is the arrangement.
 //	This encoding is aligned to the high bit of the box, according to the spec.
 //
 // The index is shifted left past the arrangement marker position, then its
 // low 5 bits go to [16:21) and the remaining bits to [22:24). The result is
 // (0, false) for an unsupported arrangement or an out-of-range index.
 func encodeImm2Tsz(v, arr uint32) (uint32, bool) {
 	// lead is how far the index is shifted inside the 7-bit imm2:tsz value.
 	var lead uint32
 	switch arr {
 	case ARNG_B:
 		lead = 1
 	case ARNG_H:
 		lead = 2
 	case ARNG_S:
 		lead = 3
 	case ARNG_D:
 		lead = 4
 	case ARNG_Q:
 		lead = 5
 	default:
 		return 0, false
 	}
 	// The largest index halves with every step up in element size: 63, 31, 15, 7, 3.
 	if v > 63>>(lead-1) {
 		return 0, false
 	}
 	packed := v << lead
 	tsz := (packed & 31) << 16
 	imm2 := (packed >> 5) << 22
 	return tsz | imm2, true
 }

 // arrAlignType enumerates arrangement alignment kinds.
 // NOTE(review): nothing in this part of the file uses these values; the names
 // suggest the set of element sizes (B/H/S/D) each kind covers — confirm
 // against the users of this type.
 type arrAlignType int

 const (
 	arrAlignBHSD arrAlignType = iota
 	arrAlignHSD
 	arrAlignBHS
 )

 // encodeShiftTriple encodes a shift immediate value in "tszh:tszl:imm3".
 //
 // v is the shift amount and op the instruction being encoded. r holds three
 // half-open bit ranges sorted by bit position: [r[0], r[1]) receives the
 // lowest bits of the combined value (imm3), [r[2], r[3]) the next ones (tszl)
 // and [r[4], r[5]) the highest ones (tszh). prevAddr is the operand whose
 // arrangement determines the element size; it must be of class AC_ARNG with a
 // B, H, S or D arrangement.
 //
 // The shift is bounded by the element size:
 //   - right shifts accept 1 to the number of bits per element, inclusive;
 //   - every other shift accepts 0 to one less than the number of bits per element.
 //
 // For the narrowing right shifts the element size is that of the destination,
 // i.e. half of the arrangement read from prevAddr.
 //
 // It returns (0, false) when the operand, the arrangement or the shift amount
 // is not encodable, or when the combined value does not fit in the ranges.
 func encodeShiftTriple(v uint32, r [6]int, prevAddr *obj.Addr, op obj.As) (uint32, bool) {
 	// The previous op must be a scalable vector, and we need its arrangement.
 	acl := aclass(prevAddr)
 	if acl != AC_ARNG {
 		return 0, false
 	}
 	var elemBits uint32
 	switch addrComponent(prevAddr, acl, 1) { // component 1 is the arrangement
 	case ARNG_B:
 		elemBits = 8
 	case ARNG_H:
 		elemBits = 16
 	case ARNG_S:
 		elemBits = 32
 	case ARNG_D:
 		elemBits = 64
 	default:
 		return 0, false
 	}

 	// Unfortunately this information lives in the decoding ASL.
 	// For the narrowing instructions the esize is derived from the destination
 	// arrangement, while this function is called with one of the sources.
 	// Halve the size to address this discrepancy.
 	effectiveEsize := elemBits
 	switch op {
 	case AZRSHRNB, AZRSHRNT, AZSHRNB, AZSHRNT, AZSQRSHRNB, AZSQRSHRNT, AZSQRSHRUNB, AZSQRSHRUNT,
 		AZSQSHRNB, AZSQSHRNT, AZSQSHRUNB, AZSQSHRUNT, AZUQRSHRNB, AZUQRSHRNT, AZUQSHRNB, AZUQSHRNT:
 		effectiveEsize = elemBits / 2
 		if effectiveEsize < 8 {
 			// A byte source has no narrower destination element.
 			return 0, false
 		}
 	}

 	var C uint32
 	switch op {
 	case AZASR, AZLSR, AZURSHR, AZASRD,
 		AZRSHRNB, AZRSHRNT, AZSHRNB, AZSHRNT, AZSQRSHRNB, AZSQRSHRNT, AZSQRSHRUNB, AZSQRSHRUNT,
 		AZSQSHRNB, AZSQSHRNT, AZSQSHRUNB, AZSQSHRUNT, AZSRSHR, AZUQRSHRNB, AZUQRSHRNT, AZUQSHRNB, AZUQSHRNT,
 		AZURSRA, AZUSRA, AZXAR, AZSRI, AZSRSRA, AZSSRA:
 		// ASL: let shift : integer = (2 * esize) - UInt(tsize::imm3);
 		// Right shifts range from 1 to esize inclusive. Anything outside would
 		// produce a tsize that decodes to a different element size.
 		if v < 1 || v > effectiveEsize {
 			return 0, false
 		}
 		C = (2 * effectiveEsize) - v
 	default:
 		// ASL: let shift : integer = UInt(tsize::imm3) - esize;
 		// Left shifts range from 0 to esize-1.
 		if v >= effectiveEsize {
 			return 0, false
 		}
 		C = effectiveEsize + v
 	}

 	// Scatter C over the three ranges, lowest bits first.
 	var chunks [3]uint32
 	for i := 0; i < 6; i += 2 {
 		width := r[i+1] - r[i]
 		chunks[i/2] = C & ((1 << width) - 1)
 		C >>= width
 	}
 	if C != 0 {
 		// The value needs more bits than the ranges provide; truncating it
 		// would silently encode a different shift or element size.
 		return 0, false
 	}
 	return (chunks[0] << r[0]) |
 		(chunks[1] << r[2]) |
 		(chunks[2] << r[4]), true
 }

 // encodeLogicalImmArrEncoding is the implementation of the following encoding logic:
 // Is the size specifier,
 // imm13	<T>
 // 0xxxxxx0xxxxx	S
 // 0xxxxxx10xxxx	H
 // 0xxxxxx110xxx	B
 // 0xxxxxx1110xx	B
 // 0xxxxxx11110x	B
 // 0xxxxxx11111x	RESERVED
 // 1xxxxxxxxxxxx	D
 // At the same time:
 // Is a 64, 32, 16 or 8-bit bitmask consisting of replicated 2, 4, 8, 16, 32 or 64 bit fields,
 // each field containing a rotated run of non-zero bits, encoded in the "imm13" field.
 //
 // bit range mappings:
 // imm13: [5:18)
 //
 // ARM created a "clever" recipe that can generate useful repeating 8-64 bit bitmasks.
 // Instead of storing the literal binary number, the processor reads a 13-bit recipe
 // using three fields (bits from high to low):
 // N (1 bit), immr (6 bits), and imms (6 bits).
 //
 // How the recipe works:
 // Every logical immediate represents a repeating pattern (like repeating tiles). The processor
 // uses the three fields to figure out the size of the tile, how many 1s are in the tile, and
 // how far to rotate it.
 // The N bit combined with the upper bits of imms determines the width of the repeating block.
 // Depending on these bits, the fundamental block can be 2, 4, 8, 16, 32, or 64 bits wide.
 // The lower bits of imms dictate exactly how many contiguous 1s exist inside that block.
 // The immr value tells the processor how many bits to rotate that block to the right.
 // Finally, the resulting block is duplicated to fill a standard 64-bit lane.
 //
 // v is the immediate and adjacentAddr the operand that supplies the arrangement;
 // it must be of class AC_ARNG with a B, H, S or D arrangement. Only the low
 // 8, 16 or 32 bits of v are used for the B, H and S arrangements, so that a
 // sign-extended negative constant still describes a lane-sized pattern.
 // The result is (0, false) when the immediate cannot be represented.
 func encodeLogicalImmArrEncoding(v uint64, adjacentAddr *obj.Addr) (uint32, bool) {
 	acl := aclass(adjacentAddr)
 	if acl != AC_ARNG {
 		return 0, false
 	}
 	arr := addrComponent(adjacentAddr, acl, 1)

 	// Replicate the given immediate to fill a full 64-bit lane.
 	// This ensures our pattern-shrinking logic naturally respects the vector lane bounds.
 	var val uint64
 	switch arr {
 	case ARNG_B: // 8-bit lane
 		val = (v & 0xFF) * 0x0101010101010101
 	case ARNG_H: // 16-bit lane
 		val = (v & 0xFFFF) * 0x0001000100010001
 	case ARNG_S: // 32-bit lane
 		// Mask like the narrower lanes do; without it the upper half of a
 		// sign-extended value would turn the pattern into a 64-bit one.
 		v32 := v & 0xFFFFFFFF
 		val = v32 | (v32 << 32)
 	case ARNG_D: // 64-bit lane
 		val = v
 	default:
 		return 0, false
 	}

 	// All zeros and all ones have no imm13 representation.
 	if val == 0 || val == ^uint64(0) {
 		return 0, false
 	}

 	// Find the absolute smallest repeating pattern size (64 down to 2).
 	size := uint64(64)
 	for size > 2 {
 		half := size / 2
 		mask := (uint64(1) << half) - 1
 		lower := val & mask
 		upper := (val >> half) & mask
 		if lower != upper {
 			break
 		}
 		// The top half matches the bottom half: shrink our window.
 		size = half
 		val = lower
 	}

 	// Count the ones in this minimal pattern.
 	// For size == 64 the shift yields 0, so mask becomes all ones.
 	mask := (uint64(1) << size) - 1
 	val &= mask
 	ones := bits.OnesCount64(val)

 	// Find the right-rotation (rot) needed to align the 1s at the bottom.
 	expected := (uint64(1) << ones) - 1
 	rot := -1
 	for r := 0; r < int(size); r++ {
 		// Right rotate 'val' by 'r' bits within a 'size'-bit window.
 		rotated := ((val >> r) | (val << (int(size) - r))) & mask
 		if rotated == expected {
 			rot = r
 			break
 		}
 	}
 	// If no rotation forms a single contiguous block of 1s, the value is invalid.
 	if rot == -1 {
 		return 0, false
 	}

 	// immr is the amount the hardware must right-rotate the base pattern.
 	// Since 'rot' is how much we right-rotated the target to find the base,
 	// the hardware needs the inverse rotation.
 	immr := uint32((int(size) - rot) % int(size))

 	// Encode N, immr, and imms.
 	n := uint32(0)
 	if size == 64 {
 		n = 1
 	}

 	// The imms prefix is mathematically generated by (~(size*2 - 1) & 0x3F).
 	// We then OR it with the number of ones (minus 1).
 	imms := (uint32(^(size*2 - 1)) & 0x3F) | uint32(ones-1)

 	// Construct the final 13-bit field: N (1) | immr (6) | imms (6).
 	imm13 := (n << 12) | (immr << 6) | imms

 	// Shift by 5 to place imm13 into instruction bits [5:18).
 	return imm13 << 5, true
 }

 // encodeImm3Tsize1621 is the implementation of the following encoding logic:
 // Is the immediate shift amount, in the range 1 to number of bits per element, encoded in "tsize:imm3".
 // bit range mappings:
 // imm3: [16:19)
 // tsize: [19:21)
 //
 // srcArr is the <Tb> in the source reglist (ranged). Only H and S sources are
 // supported; any other arrangement, or a shift outside 1..element bits,
 // yields (0, false).
 func encodeImm3Tsize1621(v uint32, srcArr uint32) (uint32, bool) {
 	// The element size is that of the destination, which is half the source.
 	var dstBits uint32
 	if srcArr == ARNG_H {
 		dstBits = 8
 	} else if srcArr == ARNG_S {
 		dstBits = 16
 	} else {
 		return 0, false
 	}
 	if v == 0 || v > dstBits {
 		return 0, false
 	}
 	// From ARM ASL: let shift : integer = (2 * esize) - UInt(tsize::imm3);
 	// hence tsize::imm3 = (2 * esize) - shift.
 	field := (2*dstBits - v) & 0x1f
 	return field << 16, true
 }

 // encodeShiftI1TszhTszl is the implementation of the following encoding logic:
 // Is the element index, in the range 0 to one less than the number of vector elements in a 128-bit vector register, encoded in "i1:tszh:tszl".
 // bit range mappings:
 // i1: [23:24)
 // tszh: [22:23)
 // tszl: [18:21)
 //
 // arr is the arrangement; only B, H, S and D are supported. The result is
 // (0, false) for any other arrangement or for an out-of-range index.
 func encodeShiftI1TszhTszl(v uint32, arr uint32) (uint32, bool) {
 	// lead is how far the index is shifted inside the 5-bit i1:tszh:tszl value.
 	var lead uint32
 	switch arr {
 	case ARNG_B:
 		lead = 1
 	case ARNG_H:
 		lead = 2
 	case ARNG_S:
 		lead = 3
 	case ARNG_D:
 		lead = 4
 	default:
 		return 0, false
 	}
 	// Number of elements in 128 bits: 16, 8, 4, 2.
 	if count := uint32(32) >> lead; v >= count {
 		return 0, false
 	}
 	packed := v << lead
 	low := (packed & 0x7) << 18 // tszl
 	high := (packed >> 3) << 22 // i1:tszh
 	return low | high, true
 }

 // tryEncode tries to encode p with i, it returns the encoded binary and ok signal.
 //
 // The operands of p must match i.args in number and class. Every element of
 // every operand is then run through its encoding function and the resulting
 // bits are ORed onto i.fixedBits. An encoding function may instead return one
 // of the code* sentinels with ok == false, in which case the matching special
 // routine produces the bits. Encoding fails when any element is rejected or
 // when two elements sharing a component disagree on their bits.
 func (i *instEncoder) tryEncode(p *obj.Prog) (uint32, bool) {
 	var addrs []*obj.Addr
 	for a := range opsInProg(p) {
 		addrs = append(addrs, a)
 	}
 	if len(addrs) != len(i.args) {
 		return 0, false
 	}

 	result := i.fixedBits
 	// Some elements are encoded in the same component, they need to be equal.
 	// For example { <Zn1>.<Tb>-<Zn2>.<Tb> }.
 	// The 2 instances of <Tb> must encode to the same value.
 	seen := map[component]uint32{}
 	// These instructions expect ARM's 8-bit float encoding for immediates.
 	wantsFloat8 := p.As == AZFCPY || p.As == AZFDUP

 	for opIdx, addr := range addrs {
 		spec := i.args[opIdx]
 		acl := aclass(addr)
 		if acl != spec.class {
 			return 0, false
 		}
 		for elemIdx, enc := range spec.elemEncoders {
 			val := addrComponent(addr, acl, elemIdx)
 			if wantsFloat8 && acl == AC_IMM {
 				// Reinterpret the uint32 bits back as a float32, then widen to
 				// float64 for chipfloat7.
 				imm8 := (&ctxt7{}).chipfloat7(float64(math.Float32frombits(val)))
 				if imm8 == -1 {
 					// Not representable: this encoder does not match.
 					return 0, false
 				}
 				val = uint32(imm8)
 			}

 			b, ok := enc.fn(val)
 			if !ok && b == 0 {
 				return 0, false
 			}
 			// code remembers what fn returned; it is a sentinel when ok is false.
 			code := b
 			if !ok {
 				switch code {
 				case codeI1Tsz:
 					b, ok = encodeI1Tsz(val, addrComponent(addr, acl, elemIdx-1))
 				case codeImm2Tsz:
 					b, ok = encodeImm2Tsz(val, addrComponent(addr, acl, elemIdx-1))
 				case codeShift161919212223:
 					b, ok = encodeShiftTriple(val, [6]int{16, 19, 19, 21, 22, 23}, addrs[opIdx+1], p.As)
 				case codeShift161919212224:
 					b, ok = encodeShiftTriple(val, [6]int{16, 19, 19, 21, 22, 24}, addrs[opIdx+1], p.As)
 				case codeShift588102224:
 					b, ok = encodeShiftTriple(val, [6]int{5, 8, 8, 10, 22, 24}, addrs[opIdx+1], p.As)
 				case codeLogicalImmArrEncoding:
 					// Now that we know this is an immediate.
 					// ARM64 allows imm13 to encode up to 64 bits of immediates, so
 					// addrComponent is not the right fit here; read Offset directly.
 					b, ok = encodeLogicalImmArrEncoding(uint64(addr.Offset), addrs[opIdx+1])
 				case codeImm3Tsize1621:
 					next := addrs[opIdx+1]
 					b, ok = encodeImm3Tsize1621(val, addrComponent(next, aclass(next), 1))
 				case codeShiftI1TszhTszl:
 					b, ok = encodeShiftI1TszhTszl(val, addrComponent(addr, AC_PREGSEL, 1))
 				case codeNoOp:
 					b, ok = 0, true
 				default:
 					panic(fmt.Errorf("unknown encoding function code %d", b))
 				}
 				if !ok {
 					return 0, false
 				}
 			}

 			result |= b
 			if prev, dup := seen[enc.comp]; dup && prev != b {
 				if code == codeNoOp {
 					// NoOp encodings don't need checks.
 					continue
 				}
 				return 0, false
 			}
 			if enc.comp != enc_NIL && code != codeNoOp {
 				// NoOp encodings don't need bookkeeping.
 				seen[enc.comp] = b
 			}
 		}
 	}
 	return result, true
 }