src/simd/archsimd/_gen/simdgen/xed.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package main

 import (
 	"cmp"
 	"fmt"
 	"log"
 	"maps"
 	"reflect"
 	"regexp"
 	"slices"
 	"strconv"
 	"strings"

 	"simd/archsimd/_gen/unify"

 	"golang.org/x/arch/x86/xeddata"
 	"gopkg.in/yaml.v3"
 )

 // Register classes returned by [decodeReg].
 const (
 	NOT_REG_CLASS = iota // not a register
 	VREG_CLASS           // classify as a vector register; see [decodeReg]
 	GREG_CLASS           // classify as a general register
 )

 // instVariant is a bitmap indicating a variant of an instruction that has
 // optional parameters.
 type instVariant uint8

 const (
 	// instVariantNone is the variant with no optional parameters present.
 	instVariantNone instVariant = 0

 	// instVariantMasked indicates that this is the masked variant of an
 	// optionally-masked instruction.
 	//
 	// Note that iota is already 1 on this line (instVariantNone occupies
 	// index 0 of the const block), so the value is 1<<1 == 2. It is only
 	// ever used as a bit flag, so the exact bit position does not matter.
 	instVariantMasked instVariant = 1 << iota
 )

 // operandRemarks counts the instructions that loadXED skipped because their
 // operands could not be decoded.
 var operandRemarks int

 // loadXED reads the XED instruction database rooted at xedPath and returns
 // one unify.Value (a Def) per usable instruction form, plus a second "masked"
 // Def for forms that have an optional mask operand.
 //
 // Only AVX* and SHA* extensions are considered. Instructions whose operands
 // cannot be decoded are skipped and counted in operandRemarks; instructions
 // whose CPU feature is unknown are recorded in unknownFeatures and reported
 // at the end.
 //
 // Forms whose only memory operand is a broadcast-capable ("vbcst") operand
 // are held back and, where a register-only form of the same opcode with
 // matching vector shapes exists, merged into that form by tagging it with
 // memFeatures=vbcst. Unmerged memory forms are emitted on their own.
 //
 // Any failure to open or walk the database is fatal.
 //
 // NOTE(review): defs are emitted while ranging over maps, so their order is
 // not stable between runs — confirm that downstream consumers do not depend
 // on it.
 func loadXED(xedPath string) []*unify.Value {
 	// TODO: Obviously a bunch more to do here.

 	db, err := xeddata.NewDatabase(xedPath)
 	if err != nil {
 		log.Fatalf("open database: %v", err)
 	}

 	var defs []*unify.Value
 	type opData struct {
 		inst *xeddata.Inst
 		ops  []operand
 		mem  string // result of checkMem(ops)
 	}
 	// Maps from opcode to opdata(s).
 	memOps := make(map[string][]opData)
 	otherOps := make(map[string][]opData)
 	// appendDefs applies per-opcode quirks and appends the resulting Def(s)
 	// for one instruction form to defs.
 	appendDefs := func(inst *xeddata.Inst, ops []operand, addFields map[string]string) {
 		applyQuirks(inst, ops)

 		defsPos := len(defs)
 		defs = append(defs, instToUVal(inst, ops, addFields)...)

 		if *flagDebugXED {
 			for i := defsPos; i < len(defs); i++ {
 				y, _ := yaml.Marshal(defs[i])
 				fmt.Printf("==>\n%s\n", y)
 			}
 		}
 	}
 	err = xeddata.WalkInsts(xedPath, func(inst *xeddata.Inst) {
 		inst.Pattern = xeddata.ExpandStates(db, inst.Pattern)

 		switch {
 		case inst.RealOpcode == "N":
 			return // Skip unstable instructions
 		case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA")):
 			// We're only interested in AVX and SHA instructions.
 			return
 		}

 		if *flagDebugXED {
 			fmt.Printf("%s:\n%+v\n", inst.Pos, inst)
 		}

 		ops, err := decodeOperands(db, strings.Fields(inst.Operands))
 		if err != nil {
 			operandRemarks++
 			if *Verbose {
 				log.Printf("%s: [%s] %s", inst.Pos, inst.Opcode(), err)
 			}
 			return
 		}
 		var data map[string][]opData
 		mem := checkMem(ops)
 		if mem == "vbcst" {
 			// A pure vreg variant might exist, wait for later to see if we can
 			// merge them
 			data = memOps
 		} else {
 			data = otherOps
 		}
 		// append handles the first entry for an opcode as well: the missing
 		// map entry is a nil slice.
 		opcode := inst.Opcode()
 		data[opcode] = append(data[opcode], opData{inst, ops, mem})
 	})
 	// Report a failed walk before touching the (possibly partial) results;
 	// otherwise the merge below could emit output or panic on incomplete
 	// data and hide the real error.
 	if err != nil {
 		log.Fatalf("walk insts: %v", err)
 	}
 	for _, s := range otherOps {
 		for _, o := range s {
 			addFields := map[string]string{}
 			if o.mem == "noMem" {
 				opcode := o.inst.Opcode()
 				// Checking if there is a vbcst variant of this operation exist
 				// First check the opcode
 				// Keep this logic in sync with [decodeOperands]
 				if ms, ok := memOps[opcode]; ok {
 					feat1, ok1 := decodeCPUFeature(o.inst)
 					// Then check if there exist such an operation that for all vreg
 					// shapes they are the same at the same index
 					var feat1Match, feat2Match string
 					matchIdx := -1
 					var featMismatchCnt int
 				outer:
 					for i, m := range ms {
 						// Their CPU feature should match first
 						var featMismatch bool
 						feat2, ok2 := decodeCPUFeature(m.inst)
 						if !ok1 || !ok2 {
 							continue
 						}
 						if feat1 != feat2 {
 							featMismatch = true
 							featMismatchCnt++
 						}
 						if len(o.ops) == len(m.ops) {
 							for j := range o.ops {
 								if reflect.TypeOf(o.ops[j]) == reflect.TypeOf(m.ops[j]) {
 									v1, ok3 := o.ops[j].(operandVReg)
 									v2, _ := m.ops[j].(operandVReg)
 									if !ok3 {
 										continue
 									}
 									if v1.vecShape != v2.vecShape {
 										// A mismatch, skip this memOp
 										continue outer
 									}
 								} else {
 									_, ok3 := o.ops[j].(operandVReg)
 									_, ok4 := m.ops[j].(operandMem)
 									// The only difference must be the vreg and mem, no other cases.
 									if !ok3 || !ok4 {
 										// A mismatch, skip this memOp
 										continue outer
 									}
 								}
 							}
 							// Found a match, break early
 							matchIdx = i
 							feat1Match = feat1
 							feat2Match = feat2
 							if featMismatchCnt > 1 {
 								panic("multiple feature mismatch vbcst memops detected, simdgen failed to distinguish")
 							}
 							if !featMismatch {
 								// Mismatch feat is ok but should prioritize matching cases.
 								break
 							}
 						}
 					}
 					// Remove the match from memOps, it's now merged to this pure vreg operation
 					if matchIdx != -1 {
 						memOps[opcode] = append(memOps[opcode][:matchIdx], memOps[opcode][matchIdx+1:]...)
 						// Merge is done by adding a new field
 						// Right now we only have vbcst
 						addFields["memFeatures"] = "vbcst"
 						if feat1Match != feat2Match {
 							addFields["memFeaturesData"] = fmt.Sprintf("feat1=%s;feat2=%s", feat1Match, feat2Match)
 						}
 					}
 				}
 			}
 			appendDefs(o.inst, o.ops, addFields)
 		}
 	}
 	// Whatever is left in memOps had no register-only counterpart; emit it
 	// as a standalone form.
 	for _, ms := range memOps {
 		for _, m := range ms {
 			if *Verbose {
 				log.Printf("mem op not merged: %s, %v\n", m.inst.Opcode(), m)
 			}
 			appendDefs(m.inst, m.ops, nil)
 		}
 	}

 	if len(unknownFeatures) > 0 {
 		if !*Verbose {
 			nInst := 0
 			for _, insts := range unknownFeatures {
 				nInst += len(insts)
 			}
 			log.Printf("%d unhandled CPU features for %d instructions (use -v for details)", len(unknownFeatures), nInst)
 		} else {
 			keys := slices.SortedFunc(maps.Keys(unknownFeatures), func(a, b cpuFeatureKey) int {
 				return cmp.Or(cmp.Compare(a.Extension, b.Extension),
 					cmp.Compare(a.ISASet, b.ISASet))
 			})
 			for _, key := range keys {
 				if key.ISASet == "" || key.ISASet == key.Extension {
 					log.Printf("unhandled Extension %s", key.Extension)
 				} else {
 					log.Printf("unhandled Extension %s and ISASet %s", key.Extension, key.ISASet)
 				}
 				log.Printf("  opcodes: %s", slices.Sorted(maps.Keys(unknownFeatures[key])))
 			}
 		}
 	}

 	return defs
 }

 // Opcode patterns used by [applyQuirks] to correct the "optional" flag on
 // mask operands. Both patterns are unanchored, so they match anywhere in
 // the opcode string.
 var (
 	// maskRequiredRe matches opcodes whose mask is treated as mandatory.
 	maskRequiredRe = regexp.MustCompile(`VPCOMPRESS[BWDQ]|VCOMPRESSP[SD]|VPEXPAND[BWDQ]|VEXPANDP[SD]`)
 	// maskOptionalRe matches opcodes whose read mask is treated as optional.
 	maskOptionalRe = regexp.MustCompile(`VPCMP(EQ|GT|U)?[BWDQ]|VCMPP[SD]`)
 )

 // applyQuirks fixes up, in place, the mask operands of instructions for
 // which the XED "optional mask" marking is not what we want:
 //
 //   - Opcodes matching maskRequiredRe have their masks marked optional in
 //     XED, but the instruction is pointless without the mask, so every mask
 //     operand is forced to be required.
 //   - Opcodes matching maskOptionalRe are the converse: their read masks
 //     should be optional and aren't, so every read mask is made optional.
 //
 // maskRequiredRe takes precedence if an opcode matches both patterns.
 func applyQuirks(inst *xeddata.Inst, ops []operand) {
 	opcode := inst.Opcode()
 	forceRequired := maskRequiredRe.MatchString(opcode)
 	if !forceRequired && !maskOptionalRe.MatchString(opcode) {
 		return
 	}

 	for idx := range ops {
 		mask, isMask := ops[idx].(operandMask)
 		if !isMask {
 			continue
 		}
 		switch {
 		case forceRequired:
 			mask.optional = false
 		case mask.action.r:
 			mask.optional = true
 		default:
 			// Write-only masks of optionally-masked opcodes are left alone.
 			continue
 		}
 		ops[idx] = mask
 	}
 }

 // operandCommon holds the fields shared by every operand kind. It is
 // embedded in each concrete operand type and supplies the common method of
 // the [operand] interface.
 type operandCommon struct {
 	action operandAction // whether the operand is read and/or written
 }

 // operandAction defines whether this operand is read and/or written.
 //
 // Values are obtained from the XED action string via [actionEncoding].
 //
 // TODO: Should this live in [xeddata.Operand]?
 type operandAction struct {
 	r  bool // Read
 	w  bool // Written
 	cr bool // Read is conditional (implies r==true)
 	cw bool // Write is conditional (implies w==true)
 }

 // operandMem is a memory operand.
 //
 // Only broadcast-capable memory operands are currently decoded in full; all
 // other memory operands are produced with unknown set and no shape or base
 // type (see decodeOperand).
 type operandMem struct {
 	operandCommon
 	vecShape
 	elemBaseType scalarBaseType
 	// The following fields are not flushed to the final output
 	// Supports full-vector broadcasting; implies the operand having a "vv"(vector vector) type specified in width and
 	// the instruction is with attribute TXT=BCASTSTR.
 	vbcst   bool
 	unknown bool // unknown kind
 }

 // vecShape describes the size of an operand. It is a comparable struct, so
 // two shapes can be compared with == and !=.
 type vecShape struct {
 	elemBits  int    // Element size in bits
 	bits      int    // Register width in bits (total vector bits)
 	fixedName string // the fixed register name
 }

 // operandVReg is a vector register operand (class VREG_CLASS).
 type operandVReg struct { // Vector register
 	operandCommon
 	vecShape
 	elemBaseType scalarBaseType // base type of each element
 }

 // operandGReg is a general-purpose register operand (class GREG_CLASS).
 // decodeOperand sets both bits and elemBits of its shape to the smaller of
 // the register width and the element width.
 type operandGReg struct { // General register
 	operandCommon
 	vecShape
 	elemBaseType scalarBaseType // base type of the value held in the register
 }

 // operandMask is a vector mask.
 //
 // Regardless of the actual mask representation, the [vecShape] of this operand
 // corresponds to the "bit for bit" type of mask. That is, elemBits gives the
 // element width covered by each mask element, and bits/elemBits gives the total
 // number of mask elements. (bits gives the total number of bits as if this were
 // a bit-for-bit mask, which may be meaningless on its own.)
 //
 // XED does not record mask sizes; the shape is filled in by inferMaskSizes.
 type operandMask struct {
 	operandCommon
 	vecShape
 	// The number of mask elements is bits/elemBits.

 	allMasks bool // If set, size cannot be inferred because all operands are masks.

 	// Mask can be omitted, in which case it defaults to K0/"no mask"
 	optional bool
 }

 // operandImm is an immediate operand.
 type operandImm struct {
 	operandCommon
 	bits int // Immediate size in bits
 }

 // operand is a decoded XED instruction operand. It is implemented by
 // operandMem, operandVReg, operandGReg, operandMask and operandImm.
 type operand interface {
 	// common returns the fields shared by all operand kinds.
 	common() operandCommon
 	// addToDef adds the fields describing this operand to b.
 	addToDef(b *unify.DefBuilder)
 }

 // strVal formats s with fmt.Sprint and wraps the resulting text in a
 // unify.Value holding exactly that string.
 func strVal(s any) *unify.Value {
 	text := fmt.Sprint(s)
 	exact := unify.NewStringExact(text)
 	return unify.NewValue(exact)
 }

 // common returns o itself. Because operandCommon is embedded in every
 // concrete operand type, this one method satisfies [operand] for all of them.
 func (o operandCommon) common() operandCommon {
 	return o
 }

 // addToDef describes a memory operand in b. The class is always "memory";
 // base, bits and (when it differs from bits) elemBits are only added when
 // the operand's kind is known.
 //
 // It panics if the base type's pattern is not a valid regexp.
 func (o operandMem) addToDef(b *unify.DefBuilder) {
 	b.Add("class", strVal("memory"))
 	if !o.unknown {
 		base, err := unify.NewStringRegex(o.elemBaseType.regex())
 		if err != nil {
 			panic("parsing baseRe: " + err.Error())
 		}
 		b.Add("base", unify.NewValue(base))
 		b.Add("bits", strVal(o.bits))
 		if o.bits != o.elemBits {
 			b.Add("elemBits", strVal(o.elemBits))
 		}
 	}
 }

 // addToDef describes a vector register operand in b: class "vreg", total
 // bits, base type, and optionally the element width and fixed register name.
 //
 // It panics if the base type's pattern is not a valid regexp.
 func (o operandVReg) addToDef(b *unify.DefBuilder) {
 	pattern := o.elemBaseType.regex()
 	base, err := unify.NewStringRegex(pattern)
 	if err != nil {
 		panic("parsing baseRe: " + err.Error())
 	}

 	b.Add("class", strVal("vreg"))
 	b.Add("bits", strVal(o.bits))
 	b.Add("base", unify.NewValue(base))

 	// An element as wide as the whole register means the vector can be ANY
 	// shape (this happens with, for example, logical ops), so elemBits is
 	// left out in that case.
 	if anyShape := o.elemBits == o.bits; !anyShape {
 		b.Add("elemBits", strVal(o.elemBits))
 	}
 	if name := o.fixedName; name != "" {
 		b.Add("fixedReg", strVal(name))
 	}
 }

 // addToDef describes a general register operand in b: class "greg", bits,
 // base type, and optionally the element width (when it differs from bits)
 // and the fixed register name.
 //
 // It panics if the base type's pattern is not a valid regexp.
 func (o operandGReg) addToDef(b *unify.DefBuilder) {
 	base, err := unify.NewStringRegex(o.elemBaseType.regex())
 	if err != nil {
 		panic("parsing baseRe: " + err.Error())
 	}

 	b.Add("class", strVal("greg"))
 	b.Add("bits", strVal(o.bits))
 	b.Add("base", unify.NewValue(base))

 	if o.bits != o.elemBits {
 		b.Add("elemBits", strVal(o.elemBits))
 	}
 	if len(o.fixedName) > 0 {
 		b.Add("fixedReg", strVal(o.fixedName))
 	}
 }

 // addToDef describes a mask operand in b. The class is always "mask". When
 // every operand of the instruction is a mask, the sizes are omitted so that
 // unification can determine them; otherwise elemBits, bits and (if set) the
 // fixed register name are added.
 func (o operandMask) addToDef(b *unify.DefBuilder) {
 	b.Add("class", strVal("mask"))
 	if !o.allMasks {
 		b.Add("elemBits", strVal(o.elemBits))
 		b.Add("bits", strVal(o.bits))
 		if name := o.fixedName; name != "" {
 			b.Add("fixedReg", strVal(name))
 		}
 	}
 }

 // addToDef describes an immediate operand in b: class "immediate" and its
 // size in bits.
 func (o operandImm) addToDef(b *unify.DefBuilder) {
 	b.Add("class", strVal("immediate"))
 	b.Add("bits", strVal(o.bits))
 }

 // actionEncoding maps an XED operand action string to its operandAction.
 // A leading "c" on the read or write part marks that access as conditional.
 // decodeOperand rejects any action string that is not a key of this map.
 var actionEncoding = map[string]operandAction{
 	"r":   {r: true},
 	"cr":  {r: true, cr: true},
 	"w":   {w: true},
 	"cw":  {w: true, cw: true},
 	"rw":  {r: true, w: true},
 	"crw": {r: true, w: true, cr: true},
 	"rcw": {r: true, w: true, cw: true},
 }

 // decodeOperand decodes a single XED operand description into an operand.
 //
 // It returns (nil, nil) for operands that should simply be ignored
 // (currently the EMX_BROADCAST* macros), and a non-nil error for operands it
 // does not understand. A description that xeddata itself cannot parse is
 // fatal.
 //
 // Mask operands are returned without a shape; see inferMaskSizes.
 func decodeOperand(db *xeddata.Database, operand string) (operand, error) {
 	op, err := xeddata.NewOperand(db, operand)
 	if err != nil {
 		log.Fatalf("parsing operand %q: %v", operand, err)
 	}
 	if *flagDebugXED {
 		fmt.Printf("  %+v\n", op)
 	}

 	if strings.HasPrefix(op.Name, "EMX_BROADCAST") {
 		// This refers to a set of macros defined in all-state.txt that set a
 		// BCAST operand to various fixed values. But the BCAST operand is
 		// itself suppressed and "internal", so I think we can just ignore this
 		// operand.
 		return nil, nil
 	}

 	// TODO: See xed_decoded_inst_operand_action. This might need to be more
 	// complicated.
 	action, ok := actionEncoding[op.Action]
 	if !ok {
 		return nil, fmt.Errorf("unknown action %q", op.Action)
 	}
 	common := operandCommon{action: action}

 	lhs := op.NameLHS()
 	switch {
 	case strings.HasPrefix(lhs, "MEM"):
 		// looks like XED data has an inconsistency on VPADDD, marking attribute
 		// VPBROADCASTD instead of the canonical BCASTSTR.
 		if op.Width == "vv" && (op.Attributes["TXT=BCASTSTR"] ||
 			op.Attributes["TXT=VPBROADCASTD"]) {
 			baseType, elemBits, ok := decodeType(op)
 			if !ok {
 				return nil, fmt.Errorf("failed to decode memory width %q", operand)
 			}
 			// This operand has two possible width([bits]):
 			// 1. the same as the other operands
 			// 2. the element width as the other operands (broadcasting)
 			// left it default to 2, later we will set a new field in the operation
 			// to indicate this dual-width property.
 			shape := vecShape{elemBits: elemBits, bits: elemBits}
 			return operandMem{
 				operandCommon: common,
 				vecShape:      shape,
 				elemBaseType:  baseType,
 				vbcst:         true,
 				unknown:       false,
 			}, nil
 		}
 		// TODO: parse op.Width better to handle all cases
 		// Right now this will at least miss VPBROADCAST.
 		return operandMem{
 			operandCommon: common,
 			unknown:       true,
 		}, nil

 	case strings.HasPrefix(lhs, "REG"):
 		if op.Width == "mskw" {
 			// The mask operand doesn't specify a width. We have to infer it.
 			//
 			// XED uses the marker ZEROSTR to indicate that a mask operand is
 			// optional and, if omitted, implies K0, aka "no mask".
 			return operandMask{
 				operandCommon: common,
 				optional:      op.Attributes["TXT=ZEROSTR"],
 			}, nil
 		}
 		class, regBits, fixedName := decodeReg(op)
 		if class == NOT_REG_CLASS {
 			return nil, fmt.Errorf("failed to decode register %q", operand)
 		}
 		baseType, elemBits, ok := decodeType(op)
 		if !ok {
 			return nil, fmt.Errorf("failed to decode register width %q", operand)
 		}
 		shape := vecShape{elemBits: elemBits, bits: regBits, fixedName: fixedName}
 		if class == VREG_CLASS {
 			return operandVReg{
 				operandCommon: common,
 				vecShape:      shape,
 				elemBaseType:  baseType,
 			}, nil
 		}
 		// general register: it holds a single scalar, so clamp both widths
 		// to the smaller of the register and element width.
 		m := min(shape.bits, shape.elemBits)
 		shape.bits, shape.elemBits = m, m
 		return operandGReg{
 			operandCommon: common,
 			vecShape:      shape,
 			elemBaseType:  baseType,
 		}, nil

 	case strings.HasPrefix(lhs, "IMM"):
 		_, bits, ok := decodeType(op)
 		if !ok {
 			return nil, fmt.Errorf("failed to decode immediate width %q", operand)
 		}
 		return operandImm{
 			operandCommon: common,
 			bits:          bits,
 		}, nil
 	}

 	// TODO: BASE and SEG
 	return nil, fmt.Errorf("unknown operand LHS %q in %q", lhs, operand)
 }

 // decodeOperands decodes every XED operand description in operands, dropping
 // the ones decodeOperand chooses to ignore, and then infers the sizes of any
 // mask operands from the remaining operands.
 //
 // It returns the first decoding error unchanged; a mask inference failure is
 // wrapped together with the offending operand list.
 func decodeOperands(db *xeddata.Database, operands []string) (ops []operand, err error) {
 	for _, desc := range operands {
 		decoded, decodeErr := decodeOperand(db, desc)
 		if decodeErr != nil {
 			return nil, decodeErr
 		}
 		if decoded == nil {
 			// Ignored operand.
 			continue
 		}
 		ops = append(ops, decoded)
 	}

 	// XED doesn't encode the size of mask operands, so derive them from the
 	// other operands where possible.
 	inferErr := inferMaskSizes(ops)
 	if inferErr != nil {
 		return nil, fmt.Errorf("%w in operands %+v", inferErr, operands)
 	}
 	return ops, nil
 }

 // inferMaskSizes fills in, in place, the vecShape of every mask operand in
 // ops based on the shapes of the vector register operands. Read masks are
 // handled first, then written masks.
 //
 // If no vector register is available but every operand is a mask, the masks
 // are flagged allMasks and left unsized. It returns an error when a mask is
 // both read and written, or when no single shape can be determined.
 func inferMaskSizes(ops []operand) error {
 	// This is a heuristic and it falls apart in some cases:
 	//
 	// - Mask operations like KAND[BWDQ] have *nothing* in the XED to indicate
 	// mask size.
 	//
 	// - VINSERT*, VPSLL*, VPSRA*, and VPSRL* and some others naturally have
 	// mixed input sizes and the XED doesn't indicate which operands the mask
 	// applies to.
 	//
 	// - VPDP* and VP4DP* have really complex mixed operand patterns.
 	//
 	// I think for these we may just have to hand-write a table of which
 	// operands each mask applies to.

 	// inferMask sizes the read masks (r=true, w=false) or the written masks
 	// (r=false, w=true). Exactly one of r and w is set by the callers below.
 	inferMask := func(r, w bool) error {
 		var masks []int // indices into ops of the masks being sized
 		var rSizes, wSizes, sizes []vecShape
 		allMasks := true
 		hasWMask := false
 		for i, op := range ops {
 			action := op.common().action
 			if _, ok := op.(operandMask); ok {
 				if action.r && action.w {
 					return fmt.Errorf("unexpected rw mask")
 				}
 				if action.r == r || action.w == w {
 					masks = append(masks, i)
 				}
 				if action.w {
 					hasWMask = true
 				}
 			} else {
 				allMasks = false
 				// Only vector registers contribute candidate shapes.
 				if reg, ok := op.(operandVReg); ok {
 					if action.r {
 						rSizes = append(rSizes, reg.vecShape)
 					}
 					if action.w {
 						wSizes = append(wSizes, reg.vecShape)
 					}
 				}
 			}
 		}
 		if len(masks) == 0 {
 			return nil
 		}

 		// Prefer shapes of registers with the same access direction as the
 		// mask; fall back to the other direction if there are none.
 		if r {
 			sizes = rSizes
 			if len(sizes) == 0 {
 				sizes = wSizes
 			}
 		}
 		if w {
 			sizes = wSizes
 			if len(sizes) == 0 {
 				sizes = rSizes
 			}
 		}

 		if len(sizes) == 0 {
 			// If all operands are masks, leave the mask inference to the users.
 			if allMasks {
 				for _, i := range masks {
 					m := ops[i].(operandMask)
 					m.allMasks = true
 					ops[i] = m
 				}
 				return nil
 			}
 			return fmt.Errorf("cannot infer mask size: no register operands")
 		}
 		shape, ok := singular(sizes)
 		if !ok {
 			if !hasWMask && len(wSizes) == 1 && len(masks) == 1 {
 				// This pattern looks like predicate mask, so its shape should align with the
 				// output. TODO: verify this is a safe assumption.
 				shape = wSizes[0]
 			} else {
 				return fmt.Errorf("cannot infer mask size: multiple register sizes %v", sizes)
 			}
 		}
 		for _, i := range masks {
 			m := ops[i].(operandMask)
 			m.vecShape = shape
 			ops[i] = m
 		}
 		return nil
 	}
 	if err := inferMask(true, false); err != nil {
 		return err
 	}
 	if err := inferMask(false, true); err != nil {
 		return err
 	}
 	return nil
 }

 // addOperandsToDef adds "in", "inVariant", and "out" to an instruction Def,
 // plus "memFeatures" when checkMem reports anything other than "noMem".
 //
 // Every operand records its position in the assembly syntax as "asmPos".
 // Optional mask inputs go into the inVariant tuple if
 // variant&instVariantMasked is set; otherwise they are left out entirely
 // and do not consume an assembly position.
 func addOperandsToDef(ops []operand, instDB *unify.DefBuilder, variant instVariant) {
 	var (
 		ins, variantIns, outs []*unify.Value
 		nextAsmPos            int
 	)
 	masked := variant&instVariantMasked != 0

 	for _, op := range ops {
 		var opDef unify.DefBuilder
 		op.addToDef(&opDef)
 		opDef.Add("asmPos", unify.NewValue(unify.NewStringExact(fmt.Sprint(nextAsmPos))))

 		act := op.common().action
 		inAsm := true // whether this operand shows up in the assembly
 		if act.r {
 			in := unify.NewValue(opDef.Build())
 			mask, isMask := op.(operandMask)
 			switch {
 			case !isMask || !mask.optional:
 				// Just a regular input operand.
 				ins = append(ins, in)
 			case masked:
 				// Optional mask of the masked variant.
 				variantIns = append(variantIns, in)
 			default:
 				// Optional mask of the unmasked variant: not in the
 				// assembly at all.
 				inAsm = false
 			}
 		}
 		if act.w {
 			outs = append(outs, unify.NewValue(opDef.Build()))
 		}

 		if inAsm {
 			nextAsmPos++
 		}
 	}

 	instDB.Add("in", unify.NewValue(unify.NewTuple(ins...)))
 	instDB.Add("inVariant", unify.NewValue(unify.NewTuple(variantIns...)))
 	instDB.Add("out", unify.NewValue(unify.NewTuple(outs...)))
 	if memFeatures := checkMem(ops); memFeatures != "noMem" {
 		instDB.Add("memFeatures", unify.NewValue(unify.NewStringExact(memFeatures)))
 	}
 }

 // checkMem classifies the memory operands of an operation. It returns:
 //
 //   - "noMem" if ops contains no memory operand;
 //   - "unknown" if the last memory operand is of unknown kind;
 //   - "tooManyMem" if there is more than one memory operand;
 //   - "vbcst" otherwise, i.e. a single broadcast-capable memory operand.
 //
 // Keep this function in sync with [decodeOperand].
 func checkMem(ops []operand) string {
 	var (
 		last  operandMem // the last memory operand seen, if any
 		count int
 	)
 	for _, op := range ops {
 		if m, isMem := op.(operandMem); isMem {
 			last = m
 			count++
 		}
 	}

 	switch {
 	case count == 0:
 		return "noMem"
 	case last.unknown:
 		return "unknown"
 	case count > 1:
 		return "tooManyMem"
 	default:
 		// We only have vbcst case as of now.
 		// For this shape the [bits] field has two possible values:
 		// 1. The element broadcast width, which is its peer vreg operand's [elemBits] (default val in the parsed XED data)
 		// 2. The full vector width, which is its peer vreg operand's [bits] (godefs should be aware of this)
 		return "vbcst"
 	}
 }

 // instToUVal converts one instruction form into its Def value(s). It returns
 // nil if the instruction's CPU feature is unknown or ignored. Otherwise it
 // returns the unmasked Def, followed by the masked Def when the instruction
 // has an optional mask operand.
 func instToUVal(inst *xeddata.Inst, ops []operand, addFields map[string]string) []*unify.Value {
 	feature, supported := decodeCPUFeature(inst)
 	if !supported {
 		return nil
 	}

 	variants := []instVariant{instVariantNone}
 	if hasOptionalMask(ops) {
 		variants = append(variants, instVariantMasked)
 	}

 	vals := make([]*unify.Value, 0, len(variants))
 	for _, variant := range variants {
 		vals = append(vals, instToUVal1(inst, ops, feature, variant, addFields))
 	}
 	return vals
 }

 // instToUVal1 builds the Def for a single variant of an instruction form.
 // The Def carries goarch, asm, the operand tuples, cpuFeature, every entry
 // of addFields, and zeroing=false when the pattern contains "ZEROING=0".
 // The returned value is positioned at the instruction's location in the XED
 // data.
 func instToUVal1(inst *xeddata.Inst, ops []operand, feature string, variant instVariant, addFields map[string]string) *unify.Value {
 	exact := func(s string) *unify.Value {
 		return unify.NewValue(unify.NewStringExact(s))
 	}

 	var def unify.DefBuilder
 	def.Add("goarch", exact("amd64"))
 	def.Add("asm", exact(inst.Opcode()))
 	addOperandsToDef(ops, &def, variant)
 	def.Add("cpuFeature", exact(feature))
 	for field, value := range addFields {
 		def.Add(field, exact(value))
 	}

 	// "ZEROING=0" marks an EVEX instruction for which the ".Z"
 	// (zero-merging) flag is NOT valid, i.e. EVEX.z must be zero.
 	//
 	// This can mean a few things:
 	//
 	// - The output of an instruction is a mask, so merging modes don't
 	// make any sense. E.g., VCMPPS.
 	//
 	// - There are no masks involved anywhere. (Maybe MASK=0 is also set
 	// in this case?) E.g., VINSERTPS.
 	//
 	// - The operation inherently performs merging. E.g., VCOMPRESSPS
 	// with a mem operand.
 	//
 	// There may be other reasons.
 	if zeroingInvalid := strings.Contains(inst.Pattern, "ZEROING=0"); zeroingInvalid {
 		def.Add("zeroing", exact("false"))
 	}

 	return unify.NewValuePos(def.Build(), unify.Pos{Path: inst.Pos.Path, Line: inst.Pos.Line})
 }

 // decodeCPUFeature returns the CPU feature name required by inst. These match
 // the names of the "Has*" feature checks in the simd package.
 //
 // The lookup key is the instruction's Extension plus its ISASet with any
 // vector-length suffix stripped. The second result is false when the key is
 // not in cpuFeatureMap (in which case the opcode is also recorded in
 // unknownFeatures) or when it maps to "ignore".
 func decodeCPUFeature(inst *xeddata.Inst) (string, bool) {
 	key := cpuFeatureKey{
 		Extension: inst.Extension,
 		ISASet:    isaSetStrip.ReplaceAllLiteralString(inst.ISASet, ""),
 	}

 	switch feat, known := cpuFeatureMap[key]; {
 	case !known:
 		// Remember which opcodes needed this unknown feature so it can be
 		// reported later.
 		opcodes := unknownFeatures[key]
 		if opcodes == nil {
 			opcodes = make(map[string]struct{})
 			unknownFeatures[key] = opcodes
 		}
 		opcodes[inst.Opcode()] = struct{}{}
 		return "", false
 	case feat == "ignore":
 		return "", false
 	default:
 		return feat, true
 	}
 }

 // isaSetStrip matches a trailing vector-length suffix (_128, _128N, _256,
 // _256N or _512) on an XED ISA_SET name. decodeCPUFeature removes it before
 // looking the ISA set up in cpuFeatureMap.
 var isaSetStrip = regexp.MustCompile("_(128N?|256N?|512)$")

 // cpuFeatureKey identifies a CPU feature by XED's "EXTENSION" and (suffix
 // stripped) "ISA_SET" fields. It is the key type of cpuFeatureMap and
 // unknownFeatures.
 type cpuFeatureKey struct {
 	Extension, ISASet string
 }

 // cpuFeatureMap maps from XED's "EXTENSION" and "ISA_SET" to a CPU feature name
 // that can be used in the SIMD API.
 //
 // The special value "ignore" makes decodeCPUFeature reject the instruction
 // without recording it as an unknown feature.
 var cpuFeatureMap = map[cpuFeatureKey]string{
 	{"SHA", "SHA"}: "SHA",

 	{"AVX", ""}:              "AVX",
 	{"AVX_VNNI", "AVX_VNNI"}: "AVXVNNI",
 	{"AVX2", ""}:             "AVX2",
 	{"AVXAES", ""}:           "AVX, AES",

 	// AVX-512 foundational features. We combine all of these into one "AVX512" feature.
 	{"AVX512EVEX", "AVX512F"}:  "AVX512",
 	{"AVX512EVEX", "AVX512CD"}: "AVX512",
 	{"AVX512EVEX", "AVX512BW"}: "AVX512",
 	{"AVX512EVEX", "AVX512DQ"}: "AVX512",
 	// AVX512VL doesn't appear explicitly in the ISASet. I guess it's implied by
 	// the vector length suffix.

 	// AVX-512 extension features
 	{"AVX512EVEX", "AVX512_BITALG"}:     "AVX512BITALG",
 	{"AVX512EVEX", "AVX512_GFNI"}:       "AVX512GFNI",
 	{"AVX512EVEX", "AVX512_VBMI2"}:      "AVX512VBMI2",
 	{"AVX512EVEX", "AVX512_VBMI"}:       "AVX512VBMI",
 	{"AVX512EVEX", "AVX512_VNNI"}:       "AVX512VNNI",
 	{"AVX512EVEX", "AVX512_VPOPCNTDQ"}:  "AVX512VPOPCNTDQ",
 	{"AVX512EVEX", "AVX512_VAES"}:       "AVX512VAES",
 	{"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",

 	// AVX 10.2 (not yet supported)
 	{"AVX512EVEX", "AVX10_2_RC"}: "ignore",
 }

 // unknownFeatures records, for each feature key missing from cpuFeatureMap,
 // the set of opcodes that required it. It is filled in by decodeCPUFeature
 // and reported by loadXED.
 var unknownFeatures = map[cpuFeatureKey]map[string]struct{}{}

 // hasOptionalMask reports whether ops contains at least one mask operand
 // that is marked optional.
 func hasOptionalMask(ops []operand) bool {
 	return slices.ContainsFunc(ops, func(op operand) bool {
 		mask, isMask := op.(operandMask)
 		return isMask && mask.optional
 	})
 }

 // singular reports whether all elements of xs are equal. If so, it returns
 // that common element and true. For an empty slice, or one holding at least
 // two distinct values, it returns the zero value of T and false.
 func singular[T comparable](xs []T) (T, bool) {
 	var zero T
 	if len(xs) == 0 {
 		return zero, false
 	}
 	first := xs[0]
 	for i := 1; i < len(xs); i++ {
 		if xs[i] != first {
 			return zero, false
 		}
 	}
 	return first, true
 }

 // fixedReg describes an operand that is hard-wired to one specific register.
 type fixedReg struct {
 	class int    // register class: VREG_CLASS or GREG_CLASS
 	name  string // register name reported as the operand's fixed register
 	width int    // register width in bits
 }

 // fixedRegMap maps the right-hand side of an XED register operand that names
 // a specific register (rather than a register-set macro) to its description.
 // decodeReg consults it for operands whose RHS does not end in "()".
 var fixedRegMap = map[string]fixedReg{
 	"XED_REG_XMM0": {VREG_CLASS, "x0", 128},
 }

 // decodeReg returns the register class (NOT_REG_CLASS, VREG_CLASS or
 // GREG_CLASS), the register width in bits, and the register name if the
 // operand is bound to one fixed register (empty otherwise).
 // If the operand cannot be identified as a register, the class is
 // NOT_REG_CLASS.
 func decodeReg(op *xeddata.Operand) (class, width int, name string) {
 	// op.Width tells us the total width, e.g.,:
 	//
 	//    dq => 128 bits (XMM)
 	//    qq => 256 bits (YMM)
 	//    mskw => K
 	//    z[iuf?](8|16|32|...) => 512 bits (ZMM)
 	//
 	// But the encoding is really weird and it's not clear if these *always*
 	// mean XMM/YMM/ZMM or if other irregular things can use these large widths.
 	// Hence, we dig into the register sets themselves.
 	if !strings.HasPrefix(op.NameLHS(), "REG") {
 		return NOT_REG_CLASS, 0, ""
 	}

 	// TODO: We shouldn't be relying on the macro naming conventions. We should
 	// use all-dec-patterns.txt, but xeddata doesn't support that table right now.
 	rhs := op.NameRHS()
 	if !strings.HasSuffix(rhs, "()") {
 		// Not a register-set macro; it may still name a fixed register.
 		if fixed, found := fixedRegMap[rhs]; found {
 			return fixed.class, fixed.width, fixed.name
 		}
 		return NOT_REG_CLASS, 0, ""
 	}

 	// Register-set macro families, identified by name prefix.
 	families := []struct {
 		prefix string
 		class  int
 		width  int
 	}{
 		{"XMM_", VREG_CLASS, 128},
 		{"YMM_", VREG_CLASS, 256},
 		{"ZMM_", VREG_CLASS, 512},
 		{"GPR64_", GREG_CLASS, 64},
 		{"VGPR64_", GREG_CLASS, 64},
 		{"GPR32_", GREG_CLASS, 32},
 		{"VGPR32_", GREG_CLASS, 32},
 	}
 	for _, family := range families {
 		if strings.HasPrefix(rhs, family.prefix) {
 			return family.class, family.width, ""
 		}
 	}
 	return NOT_REG_CLASS, 0, ""
 }

 // xtypeRe matches the regular XED xtype names: a kind letter (i, u or f)
 // followed by the element width in bits. decodeType uses it for every xtype
 // it does not special-case.
 var xtypeRe = regexp.MustCompile(`^([iuf])([0-9]+)$`)

 // scalarBaseType describes the base type of a scalar element. This is a Go
 // type, but without the bit width suffix (with the exception of
 // scalarBaseIntOrUint).
 type scalarBaseType int

 const (
 	scalarBaseInt scalarBaseType = iota
 	scalarBaseUint
 	scalarBaseIntOrUint // Signed or unsigned is unspecified
 	scalarBaseFloat
 	scalarBaseComplex
 	scalarBaseBFloat
 	scalarBaseHFloat
 )

 // regex returns the regular expression source that matches the base type
 // name(s) s stands for. It panics if s is not one of the declared
 // scalarBase* constants.
 func (s scalarBaseType) regex() string {
 	patterns := [...]string{
 		scalarBaseInt:       "int",
 		scalarBaseUint:      "uint",
 		scalarBaseIntOrUint: "int|uint",
 		scalarBaseFloat:     "float",
 		scalarBaseComplex:   "complex",
 		scalarBaseBFloat:    "BFloat",
 		scalarBaseHFloat:    "HFloat",
 	}
 	if s < 0 || int(s) >= len(patterns) {
 		panic(fmt.Sprintf("unknown scalar base type %d", s))
 	}
 	return patterns[s]
 }

 // decodeType derives the element base type and element width in bits from
 // the operand's xtype. ok is false when the xtype is not recognized.
 func decodeType(op *xeddata.Operand) (base scalarBaseType, bits int, ok bool) {
 	// The xtype tells you the element type. i8, i16, i32, i64, f32, etc.
 	//
 	// TODO: Things like AVX2 VPAND have an xtype of u256 because they're
 	// element-width agnostic. Do I map that to all widths, or just omit the
 	// element width and let unification flesh it out? There's no u512
 	// (presumably those are all masked, so elem width matters). These are all
 	// Category: LOGICAL, so maybe we could use that info?

 	// Irregular xtypes first, grouped by the result they decode to.
 	switch op.Xtype {
 	case "bf8":
 		// E5M2 float, one of the 8-bit float formats defined by the Open
 		// Compute Project "OCP 8-bit Floating Point Specification (OFP8)".
 		return scalarBaseBFloat, 8, true
 	case "hf8":
 		// E4M3 float, the other OFP8 format.
 		return scalarBaseHFloat, 8, true
 	case "bf16":
 		// bfloat16 float
 		return scalarBaseBFloat, 16, true
 	case "2f16":
 		// Complex consisting of 2 float16s. Doesn't exist in Go, but we can say
 		// what it would be.
 		return scalarBaseComplex, 32, true
 	case "2i8", "2I8", "4i8", "4I8":
 		// The 2i8 forms just use the lower INT8 in each 16 bit field (as far
 		// as I can tell, "2I8" is a typo). Some VPDP* have the 4i8 forms.
 		return scalarBaseInt, 8, true
 	case "2u16", "2U16":
 		// some VPDP* has it
 		// TODO: does "z" means it has zeroing?
 		return scalarBaseUint, 16, true
 	case "2i16", "2I16":
 		// some VPDP* has it
 		return scalarBaseInt, 16, true
 	case "4u8", "4U8":
 		// some VPDP* has it
 		return scalarBaseUint, 8, true
 	}

 	// The rest follow a simple pattern: kind letter plus width.
 	m := xtypeRe.FindStringSubmatch(op.Xtype)
 	if m == nil {
 		// TODO: Report unrecognized xtype
 		return 0, 0, false
 	}
 	kind, width := m[1], m[2]
 	// xtypeRe guarantees width is all decimal digits, so the error is not
 	// inspected here.
 	bits, _ = strconv.Atoi(width)
 	if kind == "f" {
 		return scalarBaseFloat, bits, true
 	}
 	// kind is "i" or "u" (the only other letters xtypeRe admits). XED is
 	// rather inconsistent about what's signed, unsigned, or doesn't matter,
 	// so merge them together and let the Go definitions narrow as
 	// appropriate. Maybe there's a better way to do this.
 	return scalarBaseIntOrUint, bits, true
 }