| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ignore |
| |
| package main |
| |
| import ( |
| "bytes" |
| "fmt" |
| "io" |
| "log" |
| "os" |
| "slices" |
| "strconv" |
| |
| "internal/runtime/gc" |
| "internal/runtime/gc/internal/gen" |
| ) |
| |
| const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n" |
| |
| func main() { |
| generate("expand_amd64.s", genExpanders) |
| } |
| |
| func generate(fileName string, genFunc func(*gen.File)) { |
| var buf bytes.Buffer |
| tee := io.MultiWriter(&buf, os.Stdout) |
| |
| file := gen.NewFile(tee) |
| |
| genFunc(file) |
| |
| fmt.Fprintf(tee, header) |
| file.Compile() |
| |
| f, err := os.Create(fileName) |
| if err != nil { |
| log.Fatal(err) |
| } |
| defer f.Close() |
| _, err = f.Write(buf.Bytes()) |
| if err != nil { |
| log.Fatal(err) |
| } |
| } |
| |
| func genExpanders(file *gen.File) { |
| gcExpandersAVX512 := make([]*gen.Func, len(gc.SizeClassToSize)) |
| for sc, ob := range gc.SizeClassToSize { |
| if gc.SizeClassToNPages[sc] != 1 { |
| // These functions all produce a bitmap that covers exactly one |
| // page. |
| continue |
| } |
| if ob > gc.MinSizeForMallocHeader { |
| // This size class is too big to have a packed pointer/scalar bitmap. |
| break |
| } |
| |
| xf := int(ob) / 8 |
| log.Printf("size class %d bytes, expansion %dx", ob, xf) |
| |
| fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf)) |
| ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn) |
| |
| if xf == 1 { |
| expandIdentity(ptrObjBits) |
| } else { |
| ok := gfExpander(xf, ptrObjBits) |
| if !ok { |
| log.Printf("failed to generate expander for size class %d", sc) |
| } |
| } |
| file.AddFunc(fn) |
| gcExpandersAVX512[sc] = fn |
| } |
| |
| // Generate table mapping size class to expander PC |
| file.AddConst("·gcExpandersAVX512", gcExpandersAVX512) |
| } |
| |
// mat8x8 is an 8x8 bit matrix, stored one byte per row.
type mat8x8 struct {
	mat [8]uint8
}

// matGroupToVec packs a group of eight 8x8 bit matrices into eight
// 64-bit lanes, one matrix per lane, in the row order the matrix
// constants below require.
func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
	var packed [8]uint64
	for lane := range mats {
		for r, row := range mats[lane].mat {
			// For some reason, Intel flips the rows.
			packed[lane] |= uint64(row) << (uint(7-r) * 8)
		}
	}
	return packed
}
| |
// expandIdentity implements 1x expansion (that is, no expansion).
// It loads the two 64-byte halves of the input bitmap and returns them
// unchanged. The two Derefs are emitted in this order deliberately so
// the loads line up with the (lo, hi) order of the Return.
func expandIdentity(ptrObjBits gen.Ptr[gen.Uint8x64]) {
	objBitsLo := gen.Deref(ptrObjBits)              // first 64 bytes (low 512 bits)
	objBitsHi := gen.Deref(ptrObjBits.AddConst(64)) // second 64 bytes (high 512 bits)
	gen.Return(objBitsLo, objBitsHi)
}
| |
// gfExpander produces a function that expands each bit in an input bitmap into
// f consecutive bits in an output bitmap. It does this by multiplying 8-byte
// chunks of the input by 8x8 GF(2) bit matrices (via GF2P8Affine) and then
// permuting the product bytes into their final positions.
//
// The input is
//
//	AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
//	Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
//	Z2 [64]uint8 = The top 512 bits of the expanded bitmap
//
// It returns false (after logging the reason) if it cannot construct the
// expansion for this f; the partially-emitted function must not be used.
//
// TODO(austin): This should Z0/Z1.
func gfExpander(f int, ptrObjBits gen.Ptr[gen.Uint8x64]) bool {
	// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.

	// TODO(austin): For f >= 8, I suspect there are better ways to do this.
	//
	// For example, we could use a mask expansion to get a full byte for each
	// input bit, and separately create the bytes that blend adjacent bits, then
	// shuffle those bytes together. Certainly for f >= 16 this makes sense
	// because each of those bytes will be used, possibly more than once.

	// Load the packed input bitmap.
	objBits := gen.Deref(ptrObjBits)

	// term records that output byte oByte is produced by multiplying input
	// byte iByte by the bit matrix mat.
	type term struct {
		iByte, oByte int
		mat          mat8x8
	}
	var terms []term

	// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
	// the output byte from the appropriate input byte. Gather all of these into
	// "terms".
	for oByte := 0; oByte < 1024/8; oByte++ {
		var byteMat mat8x8
		iByte := -1
		for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
			// Output bit oBit is a copy of input bit oBit/f.
			iBit := oBit / f
			if iByte == -1 {
				iByte = iBit / 8
			} else if iByte != iBit/8 {
				// A single matrix multiply can only read from one input
				// byte, so this scheme can't express the expansion.
				log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
				return false
			}
			// One way to view this is that the i'th row of the matrix will be
			// ANDed with the input byte, and the parity of the result will set
			// the i'th bit in the output. We use a simple 1 bit mask, so the
			// parity is irrelevant beyond selecting out that one bit.
			byteMat.mat[oBit%8] = 1 << (iBit % 8)
		}
		terms = append(terms, term{iByte, oByte, byteMat})
	}

	if false {
		// Debugging aid: print input byte -> output byte as a matrix, naming
		// each distinct 8x8 bit matrix with a letter.
		maxIByte, maxOByte := 0, 0
		for _, term := range terms {
			maxIByte = max(maxIByte, term.iByte)
			maxOByte = max(maxOByte, term.oByte)
		}
		iToO := make([][]rune, maxIByte+1)
		for i := range iToO {
			iToO[i] = make([]rune, maxOByte+1)
		}
		matMap := make(map[mat8x8]int)
		for _, term := range terms {
			i, ok := matMap[term.mat]
			if !ok {
				i = len(matMap)
				matMap[term.mat] = i
			}
			iToO[term.iByte][term.oByte] = 'A' + rune(i)
		}
		for o := range maxOByte + 1 {
			fmt.Printf("%d", o)
			for i := range maxIByte + 1 {
				fmt.Printf(",")
				if mat := iToO[i][o]; mat != 0 {
					fmt.Printf("%c", mat)
				}
			}
			fmt.Println()
		}
	}

	// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
	// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
	//
	//	abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
	//	  mat0     mat1     mat2     mat3     mat4     mat5     mat6     mat7

	// Group the terms by matrix, but limit each group to 8 terms.
	const termsPerGroup = 8       // Number of terms we can multiply by the same matrix.
	const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.

	matMap := make(map[mat8x8]int) // matrix -> index in termGroups of its open (non-full) group
	allMats := make(map[mat8x8]bool)
	var termGroups [][]term
	for _, term := range terms {
		allMats[term.mat] = true

		i, ok := matMap[term.mat]
		if ok && f > groupsPerSuperGroup {
			// The output is ultimately produced in two [64]uint8 registers.
			// Getting every byte in the right place of each of these requires a
			// final permutation that often requires more than one source.
			//
			// Up to 8x expansion, we can get a really nice grouping so we can use
			// the same 8 matrix vector several times, without producing
			// permutations that require more than two sources.
			//
			// Above 8x, however, we can't get nice matrixes anyway, so we
			// instead prefer reducing the complexity of the permutations we
			// need to produce the final outputs. To do this, avoid grouping
			// together terms that are split across the two registers.
			outRegister := termGroups[i][0].oByte / 64
			if term.oByte/64 != outRegister {
				ok = false
			}
		}
		if !ok {
			// Start a new term group.
			i = len(termGroups)
			matMap[term.mat] = i
			termGroups = append(termGroups, nil)
		}

		termGroups[i] = append(termGroups[i], term)

		if len(termGroups[i]) == termsPerGroup {
			// This term group is full. Drop it from matMap so the next term
			// with this matrix starts a fresh group.
			delete(matMap, term.mat)
		}
	}

	// Log the grouping for debugging the generator.
	for i, termGroup := range termGroups {
		log.Printf("term group %d:", i)
		for _, term := range termGroup {
			log.Printf("  %+v", term)
		}
	}

	// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
	// as many term groups as we can into each super-group to minimize the
	// number of matrix multiplies.
	//
	// Ideally, we use the same matrix in each super-group, which might mean
	// doing fewer than 8 multiplies at a time. That's fine because it never
	// increases the total number of matrix multiplies.
	//
	// TODO: Packing the matrixes less densely may let us use more broadcast
	// loads instead of general permutations, though. That replaces a load of
	// the permutation with a load of the matrix, but is probably still slightly
	// better.
	var sgSize, nSuperGroups int
	oneMatVec := f <= groupsPerSuperGroup
	if oneMatVec {
		// We can use the same matrix in each multiply by doing sgSize
		// multiplies at a time.
		sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
		nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
	} else {
		// We can't use the same matrix for each multiply. Just do as many at a
		// time as we can.
		//
		// TODO: This is going to produce several distinct matrixes, when we
		// probably only need two. Be smarter about how we create super-groups
		// in this case. Maybe we build up an array of super-groups and then the
		// loop below just turns them into ops? 
		sgSize = 8
		nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
	}

	// Construct each super-group: one input shuffle + one matrix multiply,
	// while accumulating the final output permutation in perm.
	var matGroup [8]mat8x8
	var matMuls []gen.Uint8x64
	var perm [128]int
	for sgi := range nSuperGroups {
		var iperm [64]uint8
		for i := range iperm {
			iperm[i] = 0xff // "Don't care"
		}
		// Pick off sgSize term groups.
		superGroup := termGroups[:min(len(termGroups), sgSize)]
		termGroups = termGroups[len(superGroup):]
		// Build the matrix and permutations for this super-group.
		var thisMatGroup [8]mat8x8
		for i, termGroup := range superGroup {
			// All terms in this group have the same matrix. Pick one.
			thisMatGroup[i] = termGroup[0].mat
			for j, term := range termGroup {
				// Build the input permutation.
				iperm[i*termsPerGroup+j] = uint8(term.iByte)
				// Build the output permutation.
				perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
			}
		}
		log.Printf("input permutation %d: %v", sgi, iperm)

		// Check that we're not making more distinct matrixes than expected.
		if oneMatVec {
			if sgi == 0 {
				matGroup = thisMatGroup
			} else if matGroup != thisMatGroup {
				log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
				return false
			}
		}

		// Emit matrix op.
		matConst := gen.ConstUint64x8(matGroupToVec(&thisMatGroup), fmt.Sprintf("*_mat%d<>", sgi))
		inOp := objBits.Shuffle(gen.ConstUint8x64(iperm, fmt.Sprintf("*_inShuf%d<>", sgi)))
		matMul := matConst.GF2P8Affine(inOp)
		matMuls = append(matMuls, matMul)
	}

	log.Printf("output permutation: %v", perm)

	// Gather the matrix-multiply outputs into the two result registers.
	outLo, ok := genShuffle("*_outShufLo", (*[64]int)(perm[:64]), matMuls...)
	if !ok {
		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
		return false
	}
	outHi, ok := genShuffle("*_outShufHi", (*[64]int)(perm[64:]), matMuls...)
	if !ok {
		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
		return false
	}
	gen.Return(outLo, outHi)

	return true
}
| |
// genShuffle emits ops that build a 64-byte vector whose byte i is byte
// perm[i] of the concatenation of args (each arg contributing 64 bytes).
// It returns false if the permutation draws from more than four distinct
// args, which the two-source shuffles used here cannot express.
func genShuffle(name string, perm *[64]int, args ...gen.Uint8x64) (gen.Uint8x64, bool) {
	// Construct flattened permutation.
	var vperm [64]byte

	// Get the inputs used by this permutation.
	var inputs []int
	for i, src := range perm {
		inputIdx := slices.Index(inputs, src/64)
		if inputIdx == -1 {
			inputIdx = len(inputs)
			inputs = append(inputs, src/64)
		}
		// Pack each index as: low 6 bits = byte within the input,
		// bits 6-7 = which of up to 4 distinct inputs.
		//
		// NOTE(review): inputIdx 3 with source byte 63 encodes as 0xff,
		// which collides with the "don't care" sentinel used below; that
		// case would be caught only by the disjointness panic. Confirm
		// callers never produce that combination.
		vperm[i] = byte(src%64 | (inputIdx << 6))
	}

	// Emit instructions for easy cases.
	switch len(inputs) {
	case 1:
		// Single source: one plain shuffle suffices.
		constOp := gen.ConstUint8x64(vperm, name)
		return args[inputs[0]].Shuffle(constOp), true
	case 2:
		// Two sources: one two-source shuffle; bit 6 of each index
		// presumably selects between the two sources.
		constOp := gen.ConstUint8x64(vperm, name)
		return args[inputs[0]].Shuffle2(args[inputs[1]], constOp), true
	}

	// Harder case, we need to shuffle in from up to 2 more tables.
	//
	// Perform two shuffles. One shuffle will get its data from the first
	// two inputs, the other shuffle will get its data from the other one
	// or two inputs. All values they don't care each don't care about will
	// be zeroed.
	var vperms [2][64]byte
	var masks [2]uint64
	for j, idx := range vperm {
		for i := range vperms {
			vperms[i][j] = 0xff // "Don't care"
		}
		if idx == 0xff {
			continue
		}
		// Bit 7 picks which of the two shuffles produces byte j; the low
		// 7 bits are a two-source index within that shuffle.
		vperms[idx/128][j] = idx % 128
		masks[idx/128] |= uint64(1) << j
	}

	// Validate that the masks are fully disjoint.
	// Every output byte must be produced by exactly one of the two shuffles,
	// so XORing the masks must set all 64 bits.
	if masks[0]^masks[1] != ^uint64(0) {
		panic("bad shuffle!")
	}

	// Generate constants.
	constOps := make([]gen.Uint8x64, len(vperms))
	for i, v := range vperms {
		constOps[i] = gen.ConstUint8x64(v, name+strconv.Itoa(i))
	}

	// Generate shuffles. Bytes outside each shuffle's mask are zeroed, so
	// ORing the two partial results yields the full permutation.
	switch len(inputs) {
	case 3:
		r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
		r1 := args[inputs[2]].ShuffleZeroed(constOps[1], gen.ConstMask64(masks[1]))
		return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
	case 4:
		r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
		r1 := args[inputs[2]].Shuffle2Zeroed(args[inputs[3]], constOps[1], gen.ConstMask64(masks[1]))
		return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
	}

	// Too many inputs. To support more, we'd need to separate tables much earlier.
	// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
	return args[0], false
}