src/math/big/internal/asmgen/pipe.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package asmgen

 import (
 	"fmt"
 	"math/bits"
 	"slices"
 )

 // Note: Exported fields and methods are expected to be used
 // by function generators (like the ones in add.go and so on).
 // Unexported fields and methods should not be.

 // A Pipe manages the input and output data pipelines for a function's
 // memory operations.
 //
 // The input is one or more equal-length slices of words, so collectively
 // it can be viewed as a matrix, in which each slice is a row and each column
 // is a set of corresponding words from the different slices.
 // The output can be viewed the same way, although it is often just one row.
 //
 // A pipe is created by [Func.Pipe], optionally configured with the Set*
 // methods, prepared with [Pipe.Start], run with [Pipe.Loop],
 // and released with [Pipe.Done].
 type Pipe struct {
 	f               *Func    // function being generated
 	label           string   // prefix for loop labels (default "loop")
 	backward        bool     // processing columns in reverse
 	started         bool     // Start has been called
 	loaded          bool     // LoadPtrs has been called
 	inPtr           []RegPtr // input slice pointers
 	hints           []Hint   // for each inPtr, a register hint to use for its data; may be shorter than inPtr (missing entries mean HintNone)
 	outPtr          []RegPtr // output slice pointers
 	index           Reg      // index register, if in use
 	useIndexCounter bool     // index counter requested
 	indexCounter    int      // index is also counter (386); 0 no, -1 negative counter, +1 positive counter
 	readOff         int      // read offset (in columns) not yet added to the pointers or index
 	writeOff        int      // write offset (in columns) not yet added to the pointers or index
 	factors         []int    // unrolling factors
 	counts          []Reg    // iterations for each factor
 	needWrite       bool     // set before each block call in unroll, cleared by StoreN
 	maxColumns      int      // maximum columns during unrolled loop
 	unrollStart     func()   // emit code at start of unrolled body
 	unrollEnd       func()   // emit code at end of unrolled body
 }

 // Pipe returns a newly created pipe that generates code for the function f.
 // The pipe starts with the label prefix "loop" and with the architecture's
 // column limit, or an effectively unlimited one if the architecture sets none.
 func (f *Func) Pipe() *Pipe {
 	limit := 10000000 // effectively unlimited
 	if archMax := f.Asm.Arch.maxColumns; archMax != 0 {
 		limit = archMax
 	}
 	return &Pipe{
 		f:          f,
 		label:      "loop",
 		maxColumns: limit,
 	}
 }

 // SetBackward makes the pipe walk the input and output columns from last to first.
 // Left shifts need this, because a forward pass could overwrite words
 // that still have to be read.
 // It is a fatal error to call SetBackward once the pointers have been loaded.
 func (p *Pipe) SetBackward() {
 	a := p.f.Asm
 	if p.loaded {
 		a.Fatalf("SetBackward after Start/LoadPtrs")
 	}
 	p.backward = true
 }

 // SetUseIndexCounter asks the pipe to use an index counter when the
 // architecture allows it: the loop counter doubles as the index used
 // to address the slice data.
 // The trick is slower on modern processors but still necessary on 386.
 // On architectures without indexed memory operands the call does nothing.
 func (p *Pipe) SetUseIndexCounter() {
 	// Indexed addressing requires memIndex, which only 386 provides.
 	if p.f.Asm.Arch.memIndex != nil {
 		p.useIndexCounter = true
 	}
 }

 // SetLabel replaces the prefix used for the loop labels the pipe emits.
 // Pipes start out with the prefix "loop".
 func (p *Pipe) SetLabel(label string) { p.label = label }

 // SetMaxColumns limits how many columns a single call
 // to the loop body may be asked to process to m.
 func (p *Pipe) SetMaxColumns(m int) { p.maxColumns = m }

 // SetHint records the register hint to use when allocating
 // registers for the words loaded from the input vector called name.
 //
 // A HintMemOK hint is silently ignored on architectures that do not
 // allow memory operands. An unknown input name is a fatal error.
 //
 // If the hint indicates a single register on the target architecture,
 // SetHint also calls SetMaxColumns(1), because that one register
 // can hold only one value at a time.
 func (p *Pipe) SetHint(name string, hint Hint) {
 	a := p.f.Asm
 	if hint == HintMemOK && !a.Arch.memOK {
 		return
 	}
 	pos := slices.Index(p.f.inputs, name)
 	if pos < 0 {
 		a.Fatalf("unknown input name %s", name)
 	}
 	if a.hint(hint) != "" {
 		p.SetMaxColumns(1)
 	}
 	// Pad the hint list with HintNone so that p.hints[pos] exists.
 	for missing := pos + 1 - len(p.hints); missing > 0; missing-- {
 		p.hints = append(p.hints, HintNone)
 	}
 	p.hints[pos] = hint
 }

 // LoadPtrs loads the slice pointer arguments into registers and chooses
 // the memory access strategy used by LoadN and StoreN.
 // The slice length must already be in the register n.
 //
 // Start calls LoadPtrs itself if it has not been called yet.
 // An explicit call is needed only when code wants to use LoadN
 // before Start, as the shift.go generators do to read an initial
 // word before the loop.
 // Calling LoadPtrs twice is a fatal error.
 func (p *Pipe) LoadPtrs(n Reg) {
 	a := p.f.Asm
 	if p.loaded {
 		a.Fatalf("pointers already loaded")
 	}
 	p.loaded = true

 	// Load the actual pointers.
 	for _, name := range p.f.inputs {
 		p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base")))
 	}
 	for _, name := range p.f.outputs {
 		p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base")))
 	}

 	// toEnd moves every pointer n words forward, to just past the end of its slice.
 	//
 	// If we had precise pointer information for assembly, such pointers
 	// would cause problems with the garbage collector, since they no longer
 	// point into the allocated slice. But the garbage collector ignores
 	// unexpected values in assembly stacks, and the actual slice pointers
 	// are still in the argument stack slots, so the slices won't be
 	// collected early. If we switched to the register ABI, we might have
 	// to rethink this. (Forward loops also end with pointers past the slice,
 	// but by then the loop is over and the slice is not accessed anymore.)
 	toEnd := func() {
 		for _, ptr := range p.inPtr {
 			a.AddWords(n, ptr, ptr)
 		}
 		for _, ptr := range p.outPtr {
 			a.AddWords(n, ptr, ptr)
 		}
 	}

 	if p.useIndexCounter {
 		// The generator wants an index counter: when the iteration counter
 		// is AX, the slice with pointer BX is accessed as (BX)(AX*WordBytes).
 		if p.backward {
 			// The loop and the counter both move backward, so not much to do.
 			a.Comment("run loop backward, using counter as positive index")
 			p.indexCounter = +1
 			p.index = n
 			return
 		}
 		// The loop moves forward. Negate the counter so that it counts up
 		// from -len(z) to -1, and add len(z)*WordBytes to each pointer
 		// so that the index still reaches the right words.
 		a.Comment("use counter as negative index")
 		p.indexCounter = -1
 		p.index = n
 		toEnd()
 		a.Neg(n, n)
 		return
 	}

 	if p.backward {
 		// The pointers are decremented before use,
 		// so position them at the very end of the slices.
 		a.Comment("run loop backward")
 		toEnd()
 	}
 	// A plain forward loop needs no preparation.
 }

 // LoadN returns the next n columns of input words as a slice of rows,
 // one row per input pointer.
 //
 // On architectures with load-and-advance instructions, every word is
 // loaded into a newly allocated register.
 // Otherwise words are addressed by offset: inputs whose hint (see SetHint)
 // is HintMemOK are returned as direct memory references, and all other
 // inputs are moved into newly allocated registers, which must be freed.
 func (p *Pipe) LoadN(n int) [][]Reg {
 	a := p.f.Asm
 	rows := make([][]Reg, len(p.inPtr))
 	for i, ptr := range p.inPtr {
 		row := make([]Reg, n)
 		rows[i] = row

 		if a.Arch.loadIncN != nil {
 			// Load from memory and advance pointers at the same time.
 			for j := range row {
 				row[j] = a.Reg()
 			}
 			if p.backward {
 				a.Arch.loadDecN(a, ptr, row)
 			} else {
 				a.Arch.loadIncN(a, ptr, row)
 			}
 			continue
 		}

 		// Load from memory using offsets.
 		// The pointers or the index counter are advanced later.
 		hint := HintNone
 		if i < len(p.hints) {
 			hint = p.hints[i]
 		}
 		for j := 0; j < n; j++ {
 			col := p.readOff + j
 			if p.backward {
 				// Backward loops address the words just below the pointer.
 				col = -(col + 1)
 			}
 			byteOff := col * a.Arch.WordBytes
 			var mem Reg
 			if p.indexCounter == 0 {
 				mem = ptr.mem(byteOff)
 			} else {
 				mem = a.Arch.memIndex(a, byteOff, p.index, ptr)
 			}
 			if hint == HintMemOK {
 				row[j] = mem
 				continue
 			}
 			r := a.RegHint(hint)
 			a.Mov(mem, r)
 			row[j] = r
 		}
 	}
 	p.readOff += n
 	return rows
 }

 // StoreN writes regs (a slice of rows) to the next n columns of output,
 // where n = len(regs[0]).
 //
 // regs must have exactly one row per output pointer; anything else is a
 // fatal error. For a pipe with no outputs, regs is empty and StoreN emits
 // no code. StoreN also records that the current block has done its write.
 func (p *Pipe) StoreN(regs [][]Reg) {
 	p.needWrite = false
 	a := p.f.Asm
 	if len(regs) != len(p.outPtr) {
 		a.Fatalf("wrong number of output rows")
 	}
 	// With no outputs there is no regs[0] to measure; treat it as zero columns
 	// instead of indexing an empty slice.
 	n := 0
 	if len(regs) > 0 {
 		n = len(regs[0])
 	}
 	for i, ptr := range p.outPtr {
 		switch {
 		case a.Arch.storeIncN != nil:
 			// Store to memory and advance pointers at the same time.
 			if p.backward {
 				a.Arch.storeDecN(a, ptr, regs[i])
 			} else {
 				a.Arch.storeIncN(a, ptr, regs[i])
 			}

 		default:
 			// Store to memory using offsets.
 			// We'll advance the pointers or the index counter later.
 			for j, r := range regs[i] {
 				off := p.writeOff + j
 				if p.backward {
 					// Backward loops address the words just below the pointer.
 					off = -(off + 1)
 				}
 				var mem Reg
 				if p.indexCounter != 0 {
 					mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr)
 				} else {
 					mem = ptr.mem(off * a.Arch.WordBytes)
 				}
 				a.Mov(r, mem)
 			}
 		}
 	}
 	p.writeOff += n
 }

 // advancePtrs accounts for step columns having been processed.
 // When the pipe addresses memory through plain pointers, it emits code
 // advancing each pointer by step words (backward if the pipe runs backward).
 // When an index counter is in use, it only updates the bookkeeping;
 // the caller is about to advance the index itself.
 func (p *Pipe) advancePtrs(step int) {
 	a := p.f.Asm
 	if a.Arch.loadIncN != nil {
 		// The load/store instructions advance the pointers themselves.
 		return
 	}

 	// The pending offsets are relative to the pointers (or index),
 	// which are about to move by step columns.
 	p.readOff -= step
 	p.writeOff -= step
 	if p.indexCounter != 0 {
 		return
 	}

 	words := step
 	if p.backward {
 		words = -step
 	}
 	for _, ptr := range p.inPtr {
 		a.Add(a.Imm(words*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
 	}
 	for _, ptr := range p.outPtr {
 		a.Add(a.Imm(words*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
 	}
 }

 // DropInput removes the named input from the pipe,
 // typically once that input has been exhausted.
 // It frees the input's pointer register and deletes the input from the
 // pipe's pointer and hint lists and from the function's input list.
 // An unknown name is a fatal error.
 // (This is not used yet but will be used in a future generator.)
 func (p *Pipe) DropInput(name string) {
 	a := p.f.Asm
 	idx := slices.Index(p.f.inputs, name)
 	if idx < 0 {
 		a.Fatalf("unknown input %s", name)
 	}
 	a.Free(Reg(p.inPtr[idx]))
 	p.inPtr = slices.Delete(p.inPtr, idx, idx+1)
 	p.f.inputs = slices.Delete(p.f.inputs, idx, idx+1)
 	if idx < len(p.hints) {
 		p.hints = slices.Delete(p.hints, idx, idx+1)
 	}
 }

 // Start prepares to loop over n columns.
 // The factors give a sequence of unrolling factors to use,
 // which must be either strictly increasing or strictly decreasing
 // and must include 1.
 // For example, 4, 1 means to process 4 elements at a time
 // and then 1 at a time for the final 0-3; specifying 1, 4 instead
 // handles 0-3 elements first and then 4 at a time.
 // Similarly, 32, 4, 1 means to process 32 at a time,
 // then 4 at a time, then 1 at a time.
 //
 // One benefit of using 1, 4 instead of 4, 1 is that the body
 // processing 4 at a time needs more registers, and if it is
 // the final body, the register holding the fragment count (0-3)
 // has been freed and is available for use.
 //
 // Start may modify the carry flag.
 //
 // Start must be followed by a call to [Pipe.Loop],
 // but it is permitted to emit other instructions first,
 // for example to set an initial carry flag.
 // Calling Start twice, or combining SetUseIndexCounter with more
 // than one factor, is a fatal error.
 func (p *Pipe) Start(n Reg, factors ...int) {
 	a := p.f.Asm
 	if p.started {
 		a.Fatalf("loop already started")
 	}
 	if p.useIndexCounter && len(factors) > 1 {
 		a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors)
 	}
 	p.started = true

 	if !p.loaded {
 		// A single-factor loop can use the index counter where available.
 		if len(factors) == 1 {
 			p.SetUseIndexCounter()
 		}
 		p.LoadPtrs(n)
 	}

 	// Columns already consumed by LoadN calls made between LoadPtrs and
 	// Start are excluded from the loop, on the assumption that the code
 	// either already called an equivalent StoreN or will do so after the loop.
 	if skipped := p.readOff; skipped != 0 {
 		if p.indexCounter >= 0 {
 			a.Sub(a.Imm(skipped), n, n, SmashCarry)
 		} else {
 			// The index is negated, so shrinking the count means adding.
 			a.Add(a.Imm(skipped), n, n, SmashCarry)
 		}
 		if p.indexCounter != 0 {
 			// n doubles as the index, so rebase readOff and writeOff to keep
 			// them pointing at the same positions as before n changed.
 			p.readOff -= skipped
 			p.writeOff -= skipped
 		}
 	}

 	p.Restart(n, factors...)
 }

 // Restart prepares to loop over an additional n columns,
 // beyond a previous loop run by p.Start/p.Loop.
 //
 // The factors have the same meaning and restrictions as in Start:
 // they must include 1, be strictly increasing or strictly decreasing,
 // and every factor other than 1 must be a power of two.
 // With no factors, Restart behaves as if the single factor 1 were given.
 // Restart emits the code that splits n into one iteration count per
 // factor; n itself is reused as the count for the last factor.
 // Calling Restart before Start is a fatal error.
 func (p *Pipe) Restart(n Reg, factors ...int) {
 	a := p.f.Asm
 	if !p.started {
 		a.Fatalf("pipe not started")
 	}
 	// Apply the default before recording the factors, so that p.factors is
 	// non-nil for Loop and p.counts has room for the final count.
 	if len(factors) == 0 {
 		factors = []int{1}
 	}
 	p.factors = factors
 	p.counts = make([]Reg, len(factors))

 	// Compute the loop lengths for each unrolled section into separate registers.
 	// We compute them all ahead of time in case the computation would smash
 	// a carry flag that the loop bodies need preserved.
 	if len(factors) > 1 {
 		a.Comment("compute unrolled loop lengths")
 	}
 	switch {
 	default:
 		a.Fatalf("invalid factors %v", factors)

 	case factors[0] == 1:
 		// Increasing loop factors: each smaller section gets the remainder
 		// of n modulo the ratio to the next factor, and n keeps the quotient.
 		div := 1
 		for i, f := range factors[1:] {
 			// factors[i] is the factor preceding f, since f ranges over factors[1:].
 			if f <= factors[i] {
 				a.Fatalf("non-increasing factors %v", factors)
 			}
 			if f&(f-1) != 0 {
 				a.Fatalf("non-power-of-two factors %v", factors)
 			}
 			t := a.Reg()
 			f /= div // n has already been divided by div
 			a.And(a.Imm(f-1), n, t)
 			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n)
 			div *= f
 			p.counts[i] = t
 		}
 		p.counts[len(p.counts)-1] = n

 	case factors[len(factors)-1] == 1:
 		// Decreasing loop factors: each larger section gets the quotient
 		// of n by its factor, and n keeps the remainder.
 		for i, f := range factors[:len(factors)-1] {
 			if f <= factors[i+1] {
 				a.Fatalf("non-decreasing factors %v", factors)
 			}
 			if f&(f-1) != 0 {
 				a.Fatalf("non-power-of-two factors %v", factors)
 			}
 			t := a.Reg()
 			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t)
 			a.And(a.Imm(f-1), n, n)
 			p.counts[i] = t
 		}
 		p.counts[len(p.counts)-1] = n
 	}
 }

 // Done releases the pipe's registers: it frees every input and output
 // pointer register, forgets the pointers, and clears the index register.
 func (p *Pipe) Done() {
 	a := p.f.Asm
 	for _, ptr := range p.inPtr {
 		a.Free(Reg(ptr))
 	}
 	for _, ptr := range p.outPtr {
 		a.Free(Reg(ptr))
 	}
 	p.inPtr, p.outPtr = nil, nil
 	p.index = Reg{}
 }

 // Loop emits the code for the loop. It calls block repeatedly, each time
 // to emit code handling a block of N input columns (for an arbitrary
 // N = len(in[0]) chosen by p).
 // block must call p.StoreN(out) to write N output columns.
 // The out slice is a pre-allocated matrix of uninitialized Reg values;
 // block is expected to set each entry to the Reg that should be written
 // before calling p.StoreN(out).
 //
 // For example, if the loop is to be unrolled 4x in blocks of 2 columns each,
 // the sequence of calls to emit the unrolled loop body is:
 //
 //	start()  // set by p.AtUnrollStart
 //	... reads for 2 columns ...
 //	block()
 //	... writes for 2 columns ...
 //	... reads for 2 columns ...
 //	block()
 //	... writes for 2 columns ...
 //	end()  // set by p.AtUnrollEnd
 //
 // Any registers allocated during block are freed automatically when block returns.
 // Calling Loop without a preceding Start (or Restart) is a fatal error.
 func (p *Pipe) Loop(block func(in, out [][]Reg)) {
 	a := p.f.Asm
 	if p.factors == nil {
 		a.Fatalf("Pipe.Start not called")
 	}
 	for i, factor := range p.factors {
 		count := p.counts[i]
 		p.unroll(count, factor, block)
 		// Every count except the last is freed once its loop has been emitted.
 		if i+1 < len(p.factors) {
 			a.Free(count)
 		}
 	}
 	p.factors = nil
 }

 // AtUnrollStart registers start to be called at the beginning
 // of each unrolled sequence, before any block is emitted.
 // See [Pipe.Loop] for details.
 func (p *Pipe) AtUnrollStart(start func()) { p.unrollStart = start }

 // AtUnrollEnd registers end to be called at the end
 // of each unrolled sequence, after the last block is emitted.
 // See [Pipe.Loop] for details.
 func (p *Pipe) AtUnrollEnd(end func()) { p.unrollEnd = end }

 // unroll emits a single unrolled loop for the given factor, iterating n times.
 //
 // The loop is labeled with the pipe's label prefix followed by factor.
 // Its body handles factor columns per iteration, emitted in batches of at
 // most p.maxColumns columns, each produced by one call to block.
 // It is a fatal error for block to return without calling StoreN
 // when the pipe has outputs.
 func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) {
 	a := p.f.Asm
 	label := fmt.Sprintf("%s%d", p.label, factor)

 	// Top of loop control flow: skip the loop entirely when n is zero.
 	a.Label(label)
 	if a.Arch.loopTop != "" {
 		a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done")
 	} else {
 		a.JmpZero(n, label+"done")
 	}
 	a.Label(label + "cont")

 	// Unrolled loop body.
 	if factor < p.maxColumns {
 		a.Comment("unroll %dX", factor)
 	} else {
 		a.Comment("unroll %dX in batches of %d", factor, p.maxColumns)
 	}
 	if p.unrollStart != nil {
 		p.unrollStart()
 	}
 	for done := 0; done < factor; {
 		batch := min(factor-done, p.maxColumns)
 		regs := a.RegsUsed()
 		out := make([][]Reg, len(p.outPtr))
 		for i := range out {
 			out[i] = make([]Reg, batch)
 		}
 		in := p.LoadN(batch)
 		p.needWrite = true // cleared by StoreN
 		block(in, out)
 		if p.needWrite && len(p.outPtr) > 0 {
 			a.Fatalf("missing p.Write1 or p.StoreN")
 		}
 		a.SetRegsUsed(regs) // free anything block allocated
 		done += batch
 	}
 	if p.unrollEnd != nil {
 		p.unrollEnd()
 	}
 	p.advancePtrs(factor)

 	// Bottom of loop control flow: step the counter toward zero
 	// and repeat the body while it is nonzero.
 	switch {
 	case p.indexCounter >= 0 && a.Arch.loopBottom != "":
 		a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont")

 	case p.indexCounter >= 0:
 		a.Sub(a.Imm(1), n, n, KeepCarry)
 		a.JmpNonZero(n, label+"cont")

 	case p.indexCounter < 0 && a.Arch.loopBottomNeg != "":
 		a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont")

 	case p.indexCounter < 0:
 		// A negative counter counts up to zero. Like the positive case,
 		// it must jump back to the body until the counter reaches zero.
 		a.Add(a.Imm(1), n, n, KeepCarry)
 		a.JmpNonZero(n, label+"cont")
 	}
 	a.Label(label + "done")
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package asmgen

	import (
	"fmt"
	"math/bits"
	"slices"
	)

	// Note: Exported fields and methods are expected to be used
	// by function generators (like the ones in add.go and so on).
	// Unexported fields and methods should not be.

	// A Pipe manages the input and output data pipelines for a function's
	// memory operations.
	//
	// The input is one or more equal-length slices of words, so collectively
	// it can be viewed as a matrix, in which each slice is a row and each column
	// is a set of corresponding words from the different slices.
	// The output can be viewed the same way, although it is often just one row.
	//
	// A pipe is created by [Func.Pipe], optionally configured with the Set*
	// methods, prepared with [Pipe.Start], run with [Pipe.Loop],
	// and released with [Pipe.Done].
	type Pipe struct {
	f *Func // function being generated
	label string // prefix for loop labels (default "loop")
	backward bool // processing columns in reverse
	started bool // Start has been called
	loaded bool // LoadPtrs has been called
	inPtr []RegPtr // input slice pointers
	hints []Hint // for each inPtr, a register hint to use for its data; may be shorter than inPtr (missing entries mean HintNone)
	outPtr []RegPtr // output slice pointers
	index Reg // index register, if in use
	useIndexCounter bool // index counter requested
	indexCounter int // index is also counter (386); 0 no, -1 negative counter, +1 positive counter
	readOff int // read offset (in columns) not yet added to the pointers or index
	writeOff int // write offset (in columns) not yet added to the pointers or index
	factors []int // unrolling factors
	counts []Reg // iterations for each factor
	needWrite bool // set before each block call in unroll, cleared by StoreN
	maxColumns int // maximum columns during unrolled loop
	unrollStart func() // emit code at start of unrolled body
	unrollEnd func() // emit code at end of unrolled body
	}

	// Pipe creates and returns a new pipe for use in the function f.
	// The pipe starts with the label prefix "loop" and with the architecture's
	// column limit, or an effectively unlimited one if the architecture sets none.
	//
	// The receiver and result are pointers: the Pipe struct stores f as a *Func
	// and every Pipe method has a pointer receiver.
	func (f *Func) Pipe() *Pipe {
		a := f.Asm
		p := &Pipe{
			f:          f,
			label:      "loop",
			maxColumns: 10000000, // effectively unlimited
		}
		if m := a.Arch.maxColumns; m != 0 {
			p.maxColumns = m
		}
		return p
	}

	// SetBackward makes the pipe walk the input and output columns from last to first.
	// Left shifts need this, because a forward pass could overwrite words
	// that still have to be read.
	// It is a fatal error to call SetBackward once the pointers have been loaded.
	func (p *Pipe) SetBackward() {
		a := p.f.Asm
		if p.loaded {
			a.Fatalf("SetBackward after Start/LoadPtrs")
		}
		p.backward = true
	}

	// SetUseIndexCounter asks the pipe to use an index counter when the
	// architecture allows it: the loop counter doubles as the index used
	// to address the slice data.
	// The trick is slower on modern processors but still necessary on 386.
	// On architectures without indexed memory operands the call does nothing.
	func (p *Pipe) SetUseIndexCounter() {
		// Indexed addressing requires memIndex, which only 386 provides.
		if p.f.Asm.Arch.memIndex != nil {
			p.useIndexCounter = true
		}
	}

	// SetLabel replaces the prefix used for the loop labels the pipe emits.
	// Pipes start out with the prefix "loop".
	func (p *Pipe) SetLabel(label string) { p.label = label }

	// SetMaxColumns limits how many columns a single call
	// to the loop body may be asked to process to m.
	func (p *Pipe) SetMaxColumns(m int) { p.maxColumns = m }

	// SetHint records the register hint to use when allocating
	// registers for the words loaded from the input vector called name.
	//
	// A HintMemOK hint is silently ignored on architectures that do not
	// allow memory operands. An unknown input name is a fatal error.
	//
	// If the hint indicates a single register on the target architecture,
	// SetHint also calls SetMaxColumns(1), because that one register
	// can hold only one value at a time.
	func (p *Pipe) SetHint(name string, hint Hint) {
		a := p.f.Asm
		if hint == HintMemOK && !a.Arch.memOK {
			return
		}
		pos := slices.Index(p.f.inputs, name)
		if pos < 0 {
			a.Fatalf("unknown input name %s", name)
		}
		if a.hint(hint) != "" {
			p.SetMaxColumns(1)
		}
		// Pad the hint list with HintNone so that p.hints[pos] exists.
		for missing := pos + 1 - len(p.hints); missing > 0; missing-- {
			p.hints = append(p.hints, HintNone)
		}
		p.hints[pos] = hint
	}

	// LoadPtrs loads the slice pointer arguments into registers and chooses
	// the memory access strategy used by LoadN and StoreN.
	// The slice length must already be in the register n.
	//
	// Start calls LoadPtrs itself if it has not been called yet.
	// An explicit call is needed only when code wants to use LoadN
	// before Start, as the shift.go generators do to read an initial
	// word before the loop.
	// Calling LoadPtrs twice is a fatal error.
	func (p *Pipe) LoadPtrs(n Reg) {
		a := p.f.Asm
		if p.loaded {
			a.Fatalf("pointers already loaded")
		}
		p.loaded = true

		// Load the actual pointers.
		for _, name := range p.f.inputs {
			p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base")))
		}
		for _, name := range p.f.outputs {
			p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base")))
		}

		// toEnd moves every pointer n words forward, to just past the end of its slice.
		//
		// If we had precise pointer information for assembly, such pointers
		// would cause problems with the garbage collector, since they no longer
		// point into the allocated slice. But the garbage collector ignores
		// unexpected values in assembly stacks, and the actual slice pointers
		// are still in the argument stack slots, so the slices won't be
		// collected early. If we switched to the register ABI, we might have
		// to rethink this. (Forward loops also end with pointers past the slice,
		// but by then the loop is over and the slice is not accessed anymore.)
		toEnd := func() {
			for _, ptr := range p.inPtr {
				a.AddWords(n, ptr, ptr)
			}
			for _, ptr := range p.outPtr {
				a.AddWords(n, ptr, ptr)
			}
		}

		if p.useIndexCounter {
			// The generator wants an index counter: when the iteration counter
			// is AX, the slice with pointer BX is accessed as (BX)(AX*WordBytes).
			if p.backward {
				// The loop and the counter both move backward, so not much to do.
				a.Comment("run loop backward, using counter as positive index")
				p.indexCounter = +1
				p.index = n
				return
			}
			// The loop moves forward. Negate the counter so that it counts up
			// from -len(z) to -1, and add len(z)*WordBytes to each pointer
			// so that the index still reaches the right words.
			a.Comment("use counter as negative index")
			p.indexCounter = -1
			p.index = n
			toEnd()
			a.Neg(n, n)
			return
		}

		if p.backward {
			// The pointers are decremented before use,
			// so position them at the very end of the slices.
			a.Comment("run loop backward")
			toEnd()
		}
		// A plain forward loop needs no preparation.
	}

	// LoadN returns the next n columns of input words as a slice of rows,
	// one row per input pointer.
	//
	// On architectures with load-and-advance instructions, every word is
	// loaded into a newly allocated register.
	// Otherwise words are addressed by offset: inputs whose hint (see SetHint)
	// is HintMemOK are returned as direct memory references, and all other
	// inputs are moved into newly allocated registers, which must be freed.
	func (p *Pipe) LoadN(n int) [][]Reg {
		a := p.f.Asm
		rows := make([][]Reg, len(p.inPtr))
		for i, ptr := range p.inPtr {
			row := make([]Reg, n)
			rows[i] = row

			if a.Arch.loadIncN != nil {
				// Load from memory and advance pointers at the same time.
				for j := range row {
					row[j] = a.Reg()
				}
				if p.backward {
					a.Arch.loadDecN(a, ptr, row)
				} else {
					a.Arch.loadIncN(a, ptr, row)
				}
				continue
			}

			// Load from memory using offsets.
			// The pointers or the index counter are advanced later.
			hint := HintNone
			if i < len(p.hints) {
				hint = p.hints[i]
			}
			for j := 0; j < n; j++ {
				col := p.readOff + j
				if p.backward {
					// Backward loops address the words just below the pointer.
					col = -(col + 1)
				}
				byteOff := col * a.Arch.WordBytes
				var mem Reg
				if p.indexCounter == 0 {
					mem = ptr.mem(byteOff)
				} else {
					mem = a.Arch.memIndex(a, byteOff, p.index, ptr)
				}
				if hint == HintMemOK {
					row[j] = mem
					continue
				}
				r := a.RegHint(hint)
				a.Mov(mem, r)
				row[j] = r
			}
		}
		p.readOff += n
		return rows
	}

	// StoreN writes regs (a slice of rows) to the next n columns of output,
	// where n = len(regs[0]).
	//
	// regs must have exactly one row per output pointer; anything else is a
	// fatal error. For a pipe with no outputs, regs is empty and StoreN emits
	// no code. StoreN also records that the current block has done its write.
	func (p *Pipe) StoreN(regs [][]Reg) {
		p.needWrite = false
		a := p.f.Asm
		if len(regs) != len(p.outPtr) {
			a.Fatalf("wrong number of output rows")
		}
		// With no outputs there is no regs[0] to measure; treat it as zero columns
		// instead of indexing an empty slice.
		n := 0
		if len(regs) > 0 {
			n = len(regs[0])
		}
		for i, ptr := range p.outPtr {
			switch {
			case a.Arch.storeIncN != nil:
				// Store to memory and advance pointers at the same time.
				if p.backward {
					a.Arch.storeDecN(a, ptr, regs[i])
				} else {
					a.Arch.storeIncN(a, ptr, regs[i])
				}

			default:
				// Store to memory using offsets.
				// We'll advance the pointers or the index counter later.
				for j, r := range regs[i] {
					off := p.writeOff + j
					if p.backward {
						// Backward loops address the words just below the pointer.
						off = -(off + 1)
					}
					var mem Reg
					if p.indexCounter != 0 {
						mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr)
					} else {
						mem = ptr.mem(off * a.Arch.WordBytes)
					}
					a.Mov(r, mem)
				}
			}
		}
		p.writeOff += n
	}

	// advancePtrs accounts for step columns having been processed.
	// When the pipe addresses memory through plain pointers, it emits code
	// advancing each pointer by step words (backward if the pipe runs backward).
	// When an index counter is in use, it only updates the bookkeeping;
	// the caller is about to advance the index itself.
	func (p *Pipe) advancePtrs(step int) {
		a := p.f.Asm
		if a.Arch.loadIncN != nil {
			// The load/store instructions advance the pointers themselves.
			return
		}

		// The pending offsets are relative to the pointers (or index),
		// which are about to move by step columns.
		p.readOff -= step
		p.writeOff -= step
		if p.indexCounter != 0 {
			return
		}

		words := step
		if p.backward {
			words = -step
		}
		for _, ptr := range p.inPtr {
			a.Add(a.Imm(words*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
		}
		for _, ptr := range p.outPtr {
			a.Add(a.Imm(words*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
		}
	}

	// DropInput removes the named input from the pipe,
	// typically once that input has been exhausted.
	// It frees the input's pointer register and deletes the input from the
	// pipe's pointer and hint lists and from the function's input list.
	// An unknown name is a fatal error.
	// (This is not used yet but will be used in a future generator.)
	func (p *Pipe) DropInput(name string) {
		a := p.f.Asm
		idx := slices.Index(p.f.inputs, name)
		if idx < 0 {
			a.Fatalf("unknown input %s", name)
		}
		a.Free(Reg(p.inPtr[idx]))
		p.inPtr = slices.Delete(p.inPtr, idx, idx+1)
		p.f.inputs = slices.Delete(p.f.inputs, idx, idx+1)
		if idx < len(p.hints) {
			p.hints = slices.Delete(p.hints, idx, idx+1)
		}
	}

	// Start prepares to loop over n columns.
	// The factors give a sequence of unrolling factors to use,
	// which must be either strictly increasing or strictly decreasing
	// and must include 1.
	// For example, 4, 1 means to process 4 elements at a time
	// and then 1 at a time for the final 0-3; specifying 1, 4 instead
	// handles 0-3 elements first and then 4 at a time.
	// Similarly, 32, 4, 1 means to process 32 at a time,
	// then 4 at a time, then 1 at a time.
	//
	// One benefit of using 1, 4 instead of 4, 1 is that the body
	// processing 4 at a time needs more registers, and if it is
	// the final body, the register holding the fragment count (0-3)
	// has been freed and is available for use.
	//
	// Start may modify the carry flag.
	//
	// Start must be followed by a call to [Pipe.Loop],
	// but it is permitted to emit other instructions first,
	// for example to set an initial carry flag.
	// Calling Start twice, or combining SetUseIndexCounter with more
	// than one factor, is a fatal error.
	func (p *Pipe) Start(n Reg, factors ...int) {
		a := p.f.Asm
		if p.started {
			a.Fatalf("loop already started")
		}
		if p.useIndexCounter && len(factors) > 1 {
			a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors)
		}
		p.started = true

		if !p.loaded {
			// A single-factor loop can use the index counter where available.
			if len(factors) == 1 {
				p.SetUseIndexCounter()
			}
			p.LoadPtrs(n)
		}

		// Columns already consumed by LoadN calls made between LoadPtrs and
		// Start are excluded from the loop, on the assumption that the code
		// either already called an equivalent StoreN or will do so after the loop.
		if skipped := p.readOff; skipped != 0 {
			if p.indexCounter >= 0 {
				a.Sub(a.Imm(skipped), n, n, SmashCarry)
			} else {
				// The index is negated, so shrinking the count means adding.
				a.Add(a.Imm(skipped), n, n, SmashCarry)
			}
			if p.indexCounter != 0 {
				// n doubles as the index, so rebase readOff and writeOff to keep
				// them pointing at the same positions as before n changed.
				p.readOff -= skipped
				p.writeOff -= skipped
			}
		}

		p.Restart(n, factors...)
	}

	// Restart prepares to loop over an additional n columns,
	// beyond a previous loop run by p.Start/p.Loop.
	//
	// The factors have the same meaning and restrictions as in Start:
	// they must include 1, be strictly increasing or strictly decreasing,
	// and every factor other than 1 must be a power of two.
	// With no factors, Restart behaves as if the single factor 1 were given.
	// Restart emits the code that splits n into one iteration count per
	// factor; n itself is reused as the count for the last factor.
	// Calling Restart before Start is a fatal error.
	func (p *Pipe) Restart(n Reg, factors ...int) {
		a := p.f.Asm
		if !p.started {
			a.Fatalf("pipe not started")
		}
		// Apply the default before recording the factors, so that p.factors is
		// non-nil for Loop and p.counts has room for the final count.
		if len(factors) == 0 {
			factors = []int{1}
		}
		p.factors = factors
		p.counts = make([]Reg, len(factors))

		// Compute the loop lengths for each unrolled section into separate registers.
		// We compute them all ahead of time in case the computation would smash
		// a carry flag that the loop bodies need preserved.
		if len(factors) > 1 {
			a.Comment("compute unrolled loop lengths")
		}
		switch {
		default:
			a.Fatalf("invalid factors %v", factors)

		case factors[0] == 1:
			// Increasing loop factors: each smaller section gets the remainder
			// of n modulo the ratio to the next factor, and n keeps the quotient.
			div := 1
			for i, f := range factors[1:] {
				// factors[i] is the factor preceding f, since f ranges over factors[1:].
				if f <= factors[i] {
					a.Fatalf("non-increasing factors %v", factors)
				}
				if f&(f-1) != 0 {
					a.Fatalf("non-power-of-two factors %v", factors)
				}
				t := a.Reg()
				f /= div // n has already been divided by div
				a.And(a.Imm(f-1), n, t)
				a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n)
				div *= f
				p.counts[i] = t
			}
			p.counts[len(p.counts)-1] = n

		case factors[len(factors)-1] == 1:
			// Decreasing loop factors: each larger section gets the quotient
			// of n by its factor, and n keeps the remainder.
			for i, f := range factors[:len(factors)-1] {
				if f <= factors[i+1] {
					a.Fatalf("non-decreasing factors %v", factors)
				}
				if f&(f-1) != 0 {
					a.Fatalf("non-power-of-two factors %v", factors)
				}
				t := a.Reg()
				a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t)
				a.And(a.Imm(f-1), n, n)
				p.counts[i] = t
			}
			p.counts[len(p.counts)-1] = n
		}
	}

	// Done releases the pipe's registers: it frees every input and output
	// pointer register, forgets the pointers, and clears the index register.
	func (p *Pipe) Done() {
		a := p.f.Asm
		for _, ptr := range p.inPtr {
			a.Free(Reg(ptr))
		}
		for _, ptr := range p.outPtr {
			a.Free(Reg(ptr))
		}
		p.inPtr, p.outPtr = nil, nil
		p.index = Reg{}
	}

	// Loop emits the code for the loop. It calls block repeatedly, each time
	// to emit code handling a block of N input columns (for an arbitrary
	// N = len(in[0]) chosen by p).
	// block must call p.StoreN(out) to write N output columns.
	// The out slice is a pre-allocated matrix of uninitialized Reg values;
	// block is expected to set each entry to the Reg that should be written
	// before calling p.StoreN(out).
	//
	// For example, if the loop is to be unrolled 4x in blocks of 2 columns each,
	// the sequence of calls to emit the unrolled loop body is:
	//
	//	start()  // set by p.AtUnrollStart
	//	... reads for 2 columns ...
	//	block()
	//	... writes for 2 columns ...
	//	... reads for 2 columns ...
	//	block()
	//	... writes for 2 columns ...
	//	end()  // set by p.AtUnrollEnd
	//
	// Any registers allocated during block are freed automatically when block returns.
	// Calling Loop without a preceding Start (or Restart) is a fatal error.
	func (p *Pipe) Loop(block func(in, out [][]Reg)) {
		a := p.f.Asm
		if p.factors == nil {
			a.Fatalf("Pipe.Start not called")
		}
		for i, factor := range p.factors {
			count := p.counts[i]
			p.unroll(count, factor, block)
			// Every count except the last is freed once its loop has been emitted.
			if i+1 < len(p.factors) {
				a.Free(count)
			}
		}
		p.factors = nil
	}

	// AtUnrollStart registers start to be called at the beginning
	// of each unrolled sequence, before any block is emitted.
	// See [Pipe.Loop] for details.
	func (p *Pipe) AtUnrollStart(start func()) { p.unrollStart = start }

	// AtUnrollEnd registers end to be called at the end
	// of each unrolled sequence, after the last block is emitted.
	// See [Pipe.Loop] for details.
	func (p *Pipe) AtUnrollEnd(end func()) { p.unrollEnd = end }

	// unroll emits a single unrolled loop for the given factor, iterating n times.
	//
	// The loop is labeled with the pipe's label prefix followed by factor.
	// Its body handles factor columns per iteration, emitted in batches of at
	// most p.maxColumns columns, each produced by one call to block.
	// It is a fatal error for block to return without calling StoreN
	// when the pipe has outputs.
	func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) {
		a := p.f.Asm
		label := fmt.Sprintf("%s%d", p.label, factor)

		// Top of loop control flow: skip the loop entirely when n is zero.
		a.Label(label)
		if a.Arch.loopTop != "" {
			a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done")
		} else {
			a.JmpZero(n, label+"done")
		}
		a.Label(label + "cont")

		// Unrolled loop body.
		if factor < p.maxColumns {
			a.Comment("unroll %dX", factor)
		} else {
			a.Comment("unroll %dX in batches of %d", factor, p.maxColumns)
		}
		if p.unrollStart != nil {
			p.unrollStart()
		}
		for done := 0; done < factor; {
			batch := min(factor-done, p.maxColumns)
			regs := a.RegsUsed()
			out := make([][]Reg, len(p.outPtr))
			for i := range out {
				out[i] = make([]Reg, batch)
			}
			in := p.LoadN(batch)
			p.needWrite = true // cleared by StoreN
			block(in, out)
			if p.needWrite && len(p.outPtr) > 0 {
				a.Fatalf("missing p.Write1 or p.StoreN")
			}
			a.SetRegsUsed(regs) // free anything block allocated
			done += batch
		}
		if p.unrollEnd != nil {
			p.unrollEnd()
		}
		p.advancePtrs(factor)

		// Bottom of loop control flow: step the counter toward zero
		// and repeat the body while it is nonzero.
		switch {
		case p.indexCounter >= 0 && a.Arch.loopBottom != "":
			a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont")

		case p.indexCounter >= 0:
			a.Sub(a.Imm(1), n, n, KeepCarry)
			a.JmpNonZero(n, label+"cont")

		case p.indexCounter < 0 && a.Arch.loopBottomNeg != "":
			a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont")

		case p.indexCounter < 0:
			// A negative counter counts up to zero. Like the positive case,
			// it must jump back to the body until the counter reaches zero.
			a.Add(a.Imm(1), n, n, KeepCarry)
			a.JmpNonZero(n, label+"cont")
		}
		a.Label(label + "done")
	}