src/cmd/compile/internal/ssa/pair.go - go.git - Git at Google

 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package ssa

 import (
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/types"
 	"cmd/internal/obj"
 	"slices"
 )

 // The pair pass finds memory operations that can be paired up
 // into single 2-register memory instructions.
 func pair(f *Func) {
 	// Only arm64 for now. This pass is fairly arch-specific.
 	switch f.Config.arch {
 	case "arm64":
 	default:
 		return
 	}
 	pairLoads(f)
 	pairStores(f)
 }

 type pairableLoadInfo struct {
 	width int64 // width of one element in the pair, in bytes
 	pair  Op
 }

 // All pairableLoad ops must take 2 arguments, a pointer and a memory.
 // They must also take an offset in Aux/AuxInt.
 var pairableLoads = map[Op]pairableLoadInfo{
 	OpARM64MOVDload:  {8, OpARM64LDP},
 	OpARM64MOVWUload: {4, OpARM64LDPW},
 	OpARM64MOVWload:  {4, OpARM64LDPSW},
 	// TODO: conceivably we could pair a signed and unsigned load
 	// if we knew the upper bits of one of them weren't being used.
 	OpARM64FMOVDload: {8, OpARM64FLDPD},
 	OpARM64FMOVSload: {4, OpARM64FLDPS},
 }

 type pairableStoreInfo struct {
 	width int64 // width of one element in the pair, in bytes
 	pair  Op
 }

 // All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
 // All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
 // They must also take an offset in Aux/AuxInt.
 var pairableStores = map[Op]pairableStoreInfo{
 	OpARM64MOVDstore:  {8, OpARM64STP},
 	OpARM64MOVWstore:  {4, OpARM64STPW},
 	OpARM64FMOVDstore: {8, OpARM64FSTPD},
 	OpARM64FMOVSstore: {4, OpARM64FSTPS},
 }

 // offsetOk returns true if a pair instruction should be used
 // for the offset Aux+off, when the data width (of the
 // unpaired instructions) is width.
 // This function is best-effort. The compiled function must
 // still work if offsetOk always returns true.
 // TODO: this is currently arm64-specific.
 func offsetOk(aux Aux, off, width int64) bool {
 	if true {
 		// Seems to generate slightly smaller code if we just
 		// always allow this rewrite.
 		//
 		// Without pairing, we have 2 load instructions, like:
 		//   LDR 88(R0), R1
 		//   LDR 96(R0), R2
 		// with pairing we have, best case:
 		//   LDP 88(R0), R1, R2
 		// but maybe we need an adjuster if out of range or unaligned:
 		//   ADD R0, $88, R27
 		//   LDP (R27), R1, R2
 		// Even with the adjuster, it is at least no worse.
 		//
 		// A similar situation occurs when accessing globals.
 		// Two loads from globals requires 4 instructions,
 		// two ADRP and two LDR. With pairing, we need
 		// ADRP+ADD+LDP, three instructions.
 		//
 		// With pairing, it looks like the critical path might
 		// be a little bit longer. But it should never be more
 		// instructions.
 		// TODO: see if that longer critical path causes any
 		// regressions.
 		return true
 	}
 	if aux != nil {
 		if _, ok := aux.(*ir.Name); !ok {
 			// Offset is probably too big (globals).
 			return false
 		}
 		// We let *ir.Names pass here, as
 		// they are probably small offsets from SP.
 		// There's no guarantee that we're in range
 		// in that case though (we don't know the
 		// stack frame size yet), so the assembler
 		// might need to issue fixup instructions.
 		// Assume some small frame size.
 		if off >= 0 {
 			off += 120
 		}
 		// TODO: figure out how often this helps vs. hurts.
 	}
 	switch width {
 	case 4:
 		if off >= -256 && off <= 252 && off%4 == 0 {
 			return true
 		}
 	case 8:
 		if off >= -512 && off <= 504 && off%8 == 0 {
 			return true
 		}
 	}
 	return false
 }

 func pairLoads(f *Func) {
 	var loads []*Value

 	// Registry of aux values for sorting.
 	auxIDs := map[Aux]int{}
 	auxID := func(aux Aux) int {
 		id, ok := auxIDs[aux]
 		if !ok {
 			id = len(auxIDs)
 			auxIDs[aux] = id
 		}
 		return id
 	}

 	for _, b := range f.Blocks {
 		// Find loads.
 		loads = loads[:0]
 		clear(auxIDs)
 		for _, v := range b.Values {
 			info := pairableLoads[v.Op]
 			if info.width == 0 {
 				continue // not pairable
 			}
 			if !offsetOk(v.Aux, v.AuxInt, info.width) {
 				continue // not advisable
 			}
 			loads = append(loads, v)
 		}
 		if len(loads) < 2 {
 			continue
 		}

 		// Sort to put pairable loads together.
 		slices.SortFunc(loads, func(x, y *Value) int {
 			// First sort by op, ptr, and memory arg.
 			if x.Op != y.Op {
 				return int(x.Op - y.Op)
 			}
 			if x.Args[0].ID != y.Args[0].ID {
 				return int(x.Args[0].ID - y.Args[0].ID)
 			}
 			if x.Args[1].ID != y.Args[1].ID {
 				return int(x.Args[1].ID - y.Args[1].ID)
 			}
 			// Then sort by aux. (nil first, then by aux ID)
 			if x.Aux != nil {
 				if y.Aux == nil {
 					return 1
 				}
 				a, b := auxID(x.Aux), auxID(y.Aux)
 				if a != b {
 					return a - b
 				}
 			} else if y.Aux != nil {
 				return -1
 			}
 			// Then sort by offset, low to high.
 			return int(x.AuxInt - y.AuxInt)
 		})

 		// Look for pairable loads.
 		for i := 0; i < len(loads)-1; i++ {
 			x := loads[i]
 			y := loads[i+1]
 			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
 				continue
 			}
 			if x.Aux != y.Aux {
 				continue
 			}
 			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
 				continue
 			}

 			// Commit point.

 			// Make the 2-register load.
 			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

 			// Modify x to be (Select0 load). Similar for y.
 			x.reset(OpSelect0)
 			x.SetArgs1(load)
 			y.reset(OpSelect1)
 			y.SetArgs1(load)

 			i++ // Skip y next time around the loop.
 		}
 	}

 	// Try to pair a load with a load from a subsequent block.
 	// Note that this is always safe to do if the memory arguments match.
 	// (But see the memory barrier case below.)
 	type nextBlockKey struct {
 		op     Op
 		ptr    ID
 		mem    ID
 		auxInt int64
 		aux    any
 	}
 	nextBlock := map[nextBlockKey]*Value{}
 	for _, b := range f.Blocks {
 		if memoryBarrierTest(b) {
 			// TODO: Do we really need to skip write barrier test blocks?
 			//     type T struct {
 			//         a *byte
 			//         b int
 			//     }
 			//     func f(t *T) int {
 			//         r := t.b
 			//         t.a = nil
 			//         return r
 			//     }
 			// This would issue a single LDP for both the t.a and t.b fields,
 			// *before* we check the write barrier flag. (We load the t.a field
 			// to put it in the write barrier buffer.) Not sure if that is ok.
 			continue
 		}
 		// Find loads in the next block(s) that we can move to this one.
 		// TODO: could maybe look further than just one successor hop.
 		clear(nextBlock)
 		for _, e := range b.Succs {
 			if len(e.b.Preds) > 1 {
 				continue
 			}
 			for _, v := range e.b.Values {
 				info := pairableLoads[v.Op]
 				if info.width == 0 {
 					continue
 				}
 				if !offsetOk(v.Aux, v.AuxInt, info.width) {
 					continue // not advisable
 				}
 				nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v
 			}
 		}
 		if len(nextBlock) == 0 {
 			continue
 		}
 		// don't move too many loads. Each requires a register across a basic block boundary.
 		const maxMoved = 4
 		nMoved := 0
 		for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- {
 			x := b.Values[i]
 			info := pairableLoads[x.Op]
 			if info.width == 0 {
 				continue
 			}
 			if !offsetOk(x.Aux, x.AuxInt, info.width) {
 				continue // not advisable
 			}
 			key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux}
 			if y := nextBlock[key]; y != nil {
 				delete(nextBlock, key)

 				// Make the 2-register load.
 				load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

 				// Modify x to be (Select0 load).
 				x.reset(OpSelect0)
 				x.SetArgs1(load)
 				// Modify y to be (Copy (Select1 load)).
 				// Note: the Select* needs to live in the load's block, not y's block.
 				y.reset(OpCopy)
 				y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load))
 				nMoved++
 				continue
 			}
 			key.auxInt = x.AuxInt - info.width
 			if y := nextBlock[key]; y != nil {
 				delete(nextBlock, key)

 				// Make the 2-register load.
 				load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1])

 				// Modify x to be (Select1 load).
 				x.reset(OpSelect1)
 				x.SetArgs1(load)
 				// Modify y to be (Copy (Select0 load)).
 				y.reset(OpCopy)
 				y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load))
 				nMoved++
 				continue
 			}
 		}
 	}
 }

 func memoryBarrierTest(b *Block) bool {
 	if b.Kind != BlockARM64NZW {
 		return false
 	}
 	c := b.Controls[0]
 	if c.Op != OpARM64MOVWUload {
 		return false
 	}
 	if globl, ok := c.Aux.(*obj.LSym); ok {
 		return globl.Name == "runtime.writeBarrier"
 	}
 	return false
 }

 // pairStores merges store instructions.
 // It collects stores into a buffer where they can be freely reordered.
 // When encountering an instruction that cannot be added to the buffer,
 // it pairs the accumulated stores, flushes the buffer, and continues processing.
 func pairStores(f *Func) {
 	last := f.Cache.allocBoolSlice(f.NumValues())
 	defer f.Cache.freeBoolSlice(last)

 	// memChain contains a list of stores with the same ptr/aux pair and
 	// nonoverlapping write ranges [AuxInt:AuxInt+writeSize]. All of the
 	// elements of memChain can be reordered with each other.
 	memChain := []*Value{}

 	// Limit of length of memChain array.
 	// This keeps us in O(n) territory.
 	limit := 100

 	// flushMemChain sorts the stores in memChain and merges them when possible.
 	// Then it flushes memChain.
 	flushMemChain := func() {
 		if len(memChain) < 2 {
 			memChain = memChain[:0]
 			return
 		}

 		// Sort in increasing AuxInt to put pairable stores together.
 		slices.SortFunc(memChain, func(x, y *Value) int {
 			return int(x.AuxInt - y.AuxInt)
 		})

 		lastIdx := len(memChain) - 1
 		for i := 0; i < lastIdx; i++ {
 			v := memChain[i]
 			w := memChain[i+1]
 			info := pairableStores[v.Op]

 			off := v.AuxInt
 			mem := v.MemoryArg()
 			aux := v.Aux
 			pos := v.Pos
 			wmem := w.MemoryArg()

 			if w.Op == v.Op && w.AuxInt == off+info.width {
 				// Arguments for the merged store: ptr, val1, val2, mem.
 				args := []*Value{v.Args[0], v.Args[1], w.Args[1], mem}

 				v.reset(info.pair)
 				v.AddArgs(args...)
 				v.Aux = aux
 				v.AuxInt = off
 				v.Pos = pos

 				// Make w just a memory copy.
 				w.reset(OpCopy)
 				w.SetArgs1(wmem)

 				// Skip merged store (w)
 				i++
 			}
 		}

 		memChain = memChain[:0]
 	}

 	// prevStore returns the previous store in the
 	// same block, or nil if there are none.
 	prevStore := func(v *Value) *Value {
 		if v.Op == OpInitMem || v.Op == OpPhi {
 			return nil
 		}
 		m := v.MemoryArg()
 		if m.Block != v.Block {
 			return nil
 		}
 		return m
 	}

 	// storeWidth returns the width of store,
 	// or 0 if it is not a store this pass understands.
 	storeWidth := func(op Op) int64 {
 		var width int64
 		switch op {
 		case OpARM64MOVDstore, OpARM64FMOVDstore:
 			width = 8
 		case OpARM64MOVWstore, OpARM64FMOVSstore:
 			width = 4
 		case OpARM64MOVHstore:
 			width = 2
 		case OpARM64MOVBstore:
 			width = 1
 		default:
 			width = 0
 		}
 		return width
 	}

 	for _, b := range f.Blocks {
 		memChain = memChain[:0]

 		// Find last store in block, so we can
 		// walk the stores last to first.
 		// Last to first helps ensure that the rewrites we
 		// perform do not get in the way of subsequent rewrites.
 		for _, v := range b.Values {
 			if v.Type.IsMemory() {
 				last[v.ID] = true
 			}
 		}
 		for _, v := range b.Values {
 			if v.Type.IsMemory() {
 				if m := prevStore(v); m != nil {
 					last[m.ID] = false
 				}
 			}
 		}
 		var lastMem *Value
 		for _, v := range b.Values {
 			if last[v.ID] {
 				lastMem = v
 				break
 			}
 		}

 		// Iterate over memory stores, accumulating them in memChain for potential merging.
 		// Flush the chain when reordering is unsafe or a conflict is detected.
 		for v := lastMem; v != nil; v = prevStore(v) {
 			writeSize := storeWidth(v.Op)

 			if writeSize == 0 {
 				// We can't reorder stores with calls or other instructions
 				// with writeSize == 0.
 				flushMemChain()
 				continue
 			}
 			if v.Uses != 1 && len(memChain) > 0 ||
 				len(memChain) > 0 && (v.Args[0] != memChain[0].Args[0] || v.Aux != memChain[0].Aux) ||
 				len(memChain) == limit {
 				// If v has multiple uses and it is not the latest store in the chain,
 				// we cannot merge it with other store instructions.
 				// If v has a different base pointer or Aux value from the current chain,
 				// we need to flush memChain and start a new one with v.
 				// If memChain length limit is exceeded, we also need to flush the chain
 				// and start a new one with v.
 				// Only look back so far.
 				// This keeps us in O(n) territory, and it
 				// also prevents us from keeping values
 				// in registers for too long (and thus
 				// needing to spill them).
 				flushMemChain()
 			}

 			for _, w := range memChain {
 				wWriteSize := storeWidth(w.Op)
 				if overlap(w.AuxInt, wWriteSize, v.AuxInt, writeSize) {
 					// Aliases with w's location.
 					// Flush the chain and start a new one with v.
 					flushMemChain()
 					break
 				}
 			}

 			memChain = append(memChain, v)
 		}
 		flushMemChain()
 	}
 }
	// Copyright 2016 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package ssa

	import (
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"slices"
	)

	// The pair pass finds memory operations that can be paired up
	// into single 2-register memory instructions.
	func pair(f *Func) {
	// Only arm64 for now. This pass is fairly arch-specific.
	switch f.Config.arch {
	case "arm64":
	default:
	return
	}
	pairLoads(f)
	pairStores(f)
	}

	type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair Op
	}

	// All pairableLoad ops must take 2 arguments, a pointer and a memory.
	// They must also take an offset in Aux/AuxInt.
	var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload: {8, OpARM64LDP},
	OpARM64MOVWUload: {4, OpARM64LDPW},
	OpARM64MOVWload: {4, OpARM64LDPSW},
	// TODO: conceivably we could pair a signed and unsigned load
	// if we knew the upper bits of one of them weren't being used.
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
	}

	type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair Op
	}

	// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
	// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
	// They must also take an offset in Aux/AuxInt.
	var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore: {8, OpARM64STP},
	OpARM64MOVWstore: {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	}

	// offsetOk returns true if a pair instruction should be used
	// for the offset Aux+off, when the data width (of the
	// unpaired instructions) is width.
	// This function is best-effort. The compiled function must
	// still work if offsetOk always returns true.
	// TODO: this is currently arm64-specific.
	func offsetOk(aux Aux, off, width int64) bool {
	if true {
	// Seems to generate slightly smaller code if we just
	// always allow this rewrite.
	//
	// Without pairing, we have 2 load instructions, like:
	// LDR 88(R0), R1
	// LDR 96(R0), R2
	// with pairing we have, best case:
	// LDP 88(R0), R1, R2
	// but maybe we need an adjuster if out of range or unaligned:
	// ADD R0, $88, R27
	// LDP (R27), R1, R2
	// Even with the adjuster, it is at least no worse.
	//
	// A similar situation occurs when accessing globals.
	// Two loads from globals requires 4 instructions,
	// two ADRP and two LDR. With pairing, we need
	// ADRP+ADD+LDP, three instructions.
	//
	// With pairing, it looks like the critical path might
	// be a little bit longer. But it should never be more
	// instructions.
	// TODO: see if that longer critical path causes any
	// regressions.
	return true
	}
	if aux != nil {
	if _, ok := aux.(*ir.Name); !ok {
	// Offset is probably too big (globals).
	return false
	}
	// We let *ir.Names pass here, as
	// they are probably small offsets from SP.
	// There's no guarantee that we're in range
	// in that case though (we don't know the
	// stack frame size yet), so the assembler
	// might need to issue fixup instructions.
	// Assume some small frame size.
	if off >= 0 {
	off += 120
	}
	// TODO: figure out how often this helps vs. hurts.
	}
	switch width {
	case 4:
	if off >= -256 && off <= 252 && off%4 == 0 {
	return true
	}
	case 8:
	if off >= -512 && off <= 504 && off%8 == 0 {
	return true
	}
	}
	return false
	}

	func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
	id, ok := auxIDs[aux]
	if !ok {
	id = len(auxIDs)
	auxIDs[aux] = id
	}
	return id
	}

	for _, b := range f.Blocks {
	// Find loads.
	loads = loads[:0]
	clear(auxIDs)
	for _, v := range b.Values {
	info := pairableLoads[v.Op]
	if info.width == 0 {
	continue // not pairable
	}
	if !offsetOk(v.Aux, v.AuxInt, info.width) {
	continue // not advisable
	}
	loads = append(loads, v)
	}
	if len(loads) < 2 {
	continue
	}

	// Sort to put pairable loads together.
	slices.SortFunc(loads, func(x, y *Value) int {
	// First sort by op, ptr, and memory arg.
	if x.Op != y.Op {
	return int(x.Op - y.Op)
	}
	if x.Args[0].ID != y.Args[0].ID {
	return int(x.Args[0].ID - y.Args[0].ID)
	}
	if x.Args[1].ID != y.Args[1].ID {
	return int(x.Args[1].ID - y.Args[1].ID)
	}
	// Then sort by aux. (nil first, then by aux ID)
	if x.Aux != nil {
	if y.Aux == nil {
	return 1
	}
	a, b := auxID(x.Aux), auxID(y.Aux)
	if a != b {
	return a - b
	}
	} else if y.Aux != nil {
	return -1
	}
	// Then sort by offset, low to high.
	return int(x.AuxInt - y.AuxInt)
	})

	// Look for pairable loads.
	for i := 0; i < len(loads)-1; i++ {
	x := loads[i]
	y := loads[i+1]
	if x.Op != y.Op \|\| x.Args[0] != y.Args[0] \|\| x.Args[1] != y.Args[1] {
	continue
	}
	if x.Aux != y.Aux {
	continue
	}
	if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
	continue
	}

	// Commit point.

	// Make the 2-register load.
	load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

	// Modify x to be (Select0 load). Similar for y.
	x.reset(OpSelect0)
	x.SetArgs1(load)
	y.reset(OpSelect1)
	y.SetArgs1(load)

	i++ // Skip y next time around the loop.
	}
	}

	// Try to pair a load with a load from a subsequent block.
	// Note that this is always safe to do if the memory arguments match.
	// (But see the memory barrier case below.)
	type nextBlockKey struct {
	op Op
	ptr ID
	mem ID
	auxInt int64
	aux any
	}
	nextBlock := map[nextBlockKey]*Value{}
	for _, b := range f.Blocks {
	if memoryBarrierTest(b) {
	// TODO: Do we really need to skip write barrier test blocks?
	// type T struct {
	// a *byte
	// b int
	// }
	// func f(t *T) int {
	// r := t.b
	// t.a = nil
	// return r
	// }
	// This would issue a single LDP for both the t.a and t.b fields,
	// before we check the write barrier flag. (We load the t.a field
	// to put it in the write barrier buffer.) Not sure if that is ok.
	continue
	}
	// Find loads in the next block(s) that we can move to this one.
	// TODO: could maybe look further than just one successor hop.
	clear(nextBlock)
	for _, e := range b.Succs {
	if len(e.b.Preds) > 1 {
	continue
	}
	for _, v := range e.b.Values {
	info := pairableLoads[v.Op]
	if info.width == 0 {
	continue
	}
	if !offsetOk(v.Aux, v.AuxInt, info.width) {
	continue // not advisable
	}
	nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v
	}
	}
	if len(nextBlock) == 0 {
	continue
	}
	// don't move too many loads. Each requires a register across a basic block boundary.
	const maxMoved = 4
	nMoved := 0
	for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- {
	x := b.Values[i]
	info := pairableLoads[x.Op]
	if info.width == 0 {
	continue
	}
	if !offsetOk(x.Aux, x.AuxInt, info.width) {
	continue // not advisable
	}
	key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux}
	if y := nextBlock[key]; y != nil {
	delete(nextBlock, key)

	// Make the 2-register load.
	load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

	// Modify x to be (Select0 load).
	x.reset(OpSelect0)
	x.SetArgs1(load)
	// Modify y to be (Copy (Select1 load)).
	// Note: the Select* needs to live in the load's block, not y's block.
	y.reset(OpCopy)
	y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load))
	nMoved++
	continue
	}
	key.auxInt = x.AuxInt - info.width
	if y := nextBlock[key]; y != nil {
	delete(nextBlock, key)

	// Make the 2-register load.
	load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1])

	// Modify x to be (Select1 load).
	x.reset(OpSelect1)
	x.SetArgs1(load)
	// Modify y to be (Copy (Select0 load)).
	y.reset(OpCopy)
	y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load))
	nMoved++
	continue
	}
	}
	}
	}

	func memoryBarrierTest(b *Block) bool {
	if b.Kind != BlockARM64NZW {
	return false
	}
	c := b.Controls[0]
	if c.Op != OpARM64MOVWUload {
	return false
	}
	if globl, ok := c.Aux.(*obj.LSym); ok {
	return globl.Name == "runtime.writeBarrier"
	}
	return false
	}

	// pairStores merges store instructions.
	// It collects stores into a buffer where they can be freely reordered.
	// When encountering an instruction that cannot be added to the buffer,
	// it pairs the accumulated stores, flushes the buffer, and continues processing.
	func pairStores(f *Func) {
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// memChain contains a list of stores with the same ptr/aux pair and
	// nonoverlapping write ranges [AuxInt:AuxInt+writeSize]. All of the
	// elements of memChain can be reordered with each other.
	memChain := []*Value{}

	// Limit of length of memChain array.
	// This keeps us in O(n) territory.
	limit := 100

	// flushMemChain sorts the stores in memChain and merges them when possible.
	// Then it flushes memChain.
	flushMemChain := func() {
	if len(memChain) < 2 {
	memChain = memChain[:0]
	return
	}

	// Sort in increasing AuxInt to put pairable stores together.
	slices.SortFunc(memChain, func(x, y *Value) int {
	return int(x.AuxInt - y.AuxInt)
	})

	lastIdx := len(memChain) - 1
	for i := 0; i < lastIdx; i++ {
	v := memChain[i]
	w := memChain[i+1]
	info := pairableStores[v.Op]

	off := v.AuxInt
	mem := v.MemoryArg()
	aux := v.Aux
	pos := v.Pos
	wmem := w.MemoryArg()

	if w.Op == v.Op && w.AuxInt == off+info.width {
	// Arguments for the merged store: ptr, val1, val2, mem.
	args := []*Value{v.Args[0], v.Args[1], w.Args[1], mem}

	v.reset(info.pair)
	v.AddArgs(args...)
	v.Aux = aux
	v.AuxInt = off
	v.Pos = pos

	// Make w just a memory copy.
	w.reset(OpCopy)
	w.SetArgs1(wmem)

	// Skip merged store (w)
	i++
	}
	}

	memChain = memChain[:0]
	}

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
	prevStore := func(v Value) Value {
	if v.Op == OpInitMem \|\| v.Op == OpPhi {
	return nil
	}
	m := v.MemoryArg()
	if m.Block != v.Block {
	return nil
	}
	return m
	}

	// storeWidth returns the width of store,
	// or 0 if it is not a store this pass understands.
	storeWidth := func(op Op) int64 {
	var width int64
	switch op {
	case OpARM64MOVDstore, OpARM64FMOVDstore:
	width = 8
	case OpARM64MOVWstore, OpARM64FMOVSstore:
	width = 4
	case OpARM64MOVHstore:
	width = 2
	case OpARM64MOVBstore:
	width = 1
	default:
	width = 0
	}
	return width
	}

	for _, b := range f.Blocks {
	memChain = memChain[:0]

	// Find last store in block, so we can
	// walk the stores last to first.
	// Last to first helps ensure that the rewrites we
	// perform do not get in the way of subsequent rewrites.
	for _, v := range b.Values {
	if v.Type.IsMemory() {
	last[v.ID] = true
	}
	}
	for _, v := range b.Values {
	if v.Type.IsMemory() {
	if m := prevStore(v); m != nil {
	last[m.ID] = false
	}
	}
	}
	var lastMem *Value
	for _, v := range b.Values {
	if last[v.ID] {
	lastMem = v
	break
	}
	}

	// Iterate over memory stores, accumulating them in memChain for potential merging.
	// Flush the chain when reordering is unsafe or a conflict is detected.
	for v := lastMem; v != nil; v = prevStore(v) {
	writeSize := storeWidth(v.Op)

	if writeSize == 0 {
	// We can't reorder stores with calls or other instructions
	// with writeSize == 0.
	flushMemChain()
	continue
	}
	if v.Uses != 1 && len(memChain) > 0 \|\|
	len(memChain) > 0 && (v.Args[0] != memChain[0].Args[0] \|\| v.Aux != memChain[0].Aux) \|\|
	len(memChain) == limit {
	// If v has multiple uses and it is not the latest store in the chain,
	// we cannot merge it with other store instructions.
	// If v has a different base pointer or Aux value from the current chain,
	// we need to flush memChain and start a new one with v.
	// If memChain length limit is exceeded, we also need to flush the chain
	// and start a new one with v.
	// Only look back so far.
	// This keeps us in O(n) territory, and it
	// also prevents us from keeping values
	// in registers for too long (and thus
	// needing to spill them).
	flushMemChain()
	}

	for _, w := range memChain {
	wWriteSize := storeWidth(w.Op)
	if overlap(w.AuxInt, wWriteSize, v.AuxInt, writeSize) {
	// Aliases with w's location.
	// Flush the chain and start a new one with v.
	flushMemChain()
	break
	}
	}

	memChain = append(memChain, v)
	}
	flushMemChain()
	}
	}