blob: be524dc0a6c5543c93da57580d0a1e8f11e8d6e0 [file] [edit]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"cmd/compile/internal/ir"
"cmd/compile/internal/types"
"cmd/internal/obj"
"slices"
)
// The pair pass finds memory operations that can be paired up
// into single 2-register memory instructions.
func pair(f *Func) {
// Only arm64 for now. This pass is fairly arch-specific.
switch f.Config.arch {
case "arm64":
default:
return
}
pairLoads(f)
pairStores(f)
}
type pairableLoadInfo struct {
width int64 // width of one element in the pair, in bytes
pair Op
}
// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
OpARM64MOVDload: {8, OpARM64LDP},
OpARM64MOVWUload: {4, OpARM64LDPW},
OpARM64MOVWload: {4, OpARM64LDPSW},
// TODO: conceivably we could pair a signed and unsigned load
// if we knew the upper bits of one of them weren't being used.
OpARM64FMOVDload: {8, OpARM64FLDPD},
OpARM64FMOVSload: {4, OpARM64FLDPS},
}
type pairableStoreInfo struct {
width int64 // width of one element in the pair, in bytes
pair Op
}
// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableStores = map[Op]pairableStoreInfo{
OpARM64MOVDstore: {8, OpARM64STP},
OpARM64MOVWstore: {4, OpARM64STPW},
OpARM64FMOVDstore: {8, OpARM64FSTPD},
OpARM64FMOVSstore: {4, OpARM64FSTPS},
}
// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
if true {
// Seems to generate slightly smaller code if we just
// always allow this rewrite.
//
// Without pairing, we have 2 load instructions, like:
// LDR 88(R0), R1
// LDR 96(R0), R2
// with pairing we have, best case:
// LDP 88(R0), R1, R2
// but maybe we need an adjuster if out of range or unaligned:
// ADD R0, $88, R27
// LDP (R27), R1, R2
// Even with the adjuster, it is at least no worse.
//
// A similar situation occurs when accessing globals.
// Two loads from globals requires 4 instructions,
// two ADRP and two LDR. With pairing, we need
// ADRP+ADD+LDP, three instructions.
//
// With pairing, it looks like the critical path might
// be a little bit longer. But it should never be more
// instructions.
// TODO: see if that longer critical path causes any
// regressions.
return true
}
if aux != nil {
if _, ok := aux.(*ir.Name); !ok {
// Offset is probably too big (globals).
return false
}
// We let *ir.Names pass here, as
// they are probably small offsets from SP.
// There's no guarantee that we're in range
// in that case though (we don't know the
// stack frame size yet), so the assembler
// might need to issue fixup instructions.
// Assume some small frame size.
if off >= 0 {
off += 120
}
// TODO: figure out how often this helps vs. hurts.
}
switch width {
case 4:
if off >= -256 && off <= 252 && off%4 == 0 {
return true
}
case 8:
if off >= -512 && off <= 504 && off%8 == 0 {
return true
}
}
return false
}
func pairLoads(f *Func) {
var loads []*Value
// Registry of aux values for sorting.
auxIDs := map[Aux]int{}
auxID := func(aux Aux) int {
id, ok := auxIDs[aux]
if !ok {
id = len(auxIDs)
auxIDs[aux] = id
}
return id
}
for _, b := range f.Blocks {
// Find loads.
loads = loads[:0]
clear(auxIDs)
for _, v := range b.Values {
info := pairableLoads[v.Op]
if info.width == 0 {
continue // not pairable
}
if !offsetOk(v.Aux, v.AuxInt, info.width) {
continue // not advisable
}
loads = append(loads, v)
}
if len(loads) < 2 {
continue
}
// Sort to put pairable loads together.
slices.SortFunc(loads, func(x, y *Value) int {
// First sort by op, ptr, and memory arg.
if x.Op != y.Op {
return int(x.Op - y.Op)
}
if x.Args[0].ID != y.Args[0].ID {
return int(x.Args[0].ID - y.Args[0].ID)
}
if x.Args[1].ID != y.Args[1].ID {
return int(x.Args[1].ID - y.Args[1].ID)
}
// Then sort by aux. (nil first, then by aux ID)
if x.Aux != nil {
if y.Aux == nil {
return 1
}
a, b := auxID(x.Aux), auxID(y.Aux)
if a != b {
return a - b
}
} else if y.Aux != nil {
return -1
}
// Then sort by offset, low to high.
return int(x.AuxInt - y.AuxInt)
})
// Look for pairable loads.
for i := 0; i < len(loads)-1; i++ {
x := loads[i]
y := loads[i+1]
if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
continue
}
if x.Aux != y.Aux {
continue
}
if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
continue
}
// Commit point.
// Make the 2-register load.
load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
// Modify x to be (Select0 load). Similar for y.
x.reset(OpSelect0)
x.SetArgs1(load)
y.reset(OpSelect1)
y.SetArgs1(load)
i++ // Skip y next time around the loop.
}
}
// Try to pair a load with a load from a subsequent block.
// Note that this is always safe to do if the memory arguments match.
// (But see the memory barrier case below.)
type nextBlockKey struct {
op Op
ptr ID
mem ID
auxInt int64
aux any
}
nextBlock := map[nextBlockKey]*Value{}
for _, b := range f.Blocks {
if memoryBarrierTest(b) {
// TODO: Do we really need to skip write barrier test blocks?
// type T struct {
// a *byte
// b int
// }
// func f(t *T) int {
// r := t.b
// t.a = nil
// return r
// }
// This would issue a single LDP for both the t.a and t.b fields,
// *before* we check the write barrier flag. (We load the t.a field
// to put it in the write barrier buffer.) Not sure if that is ok.
continue
}
// Find loads in the next block(s) that we can move to this one.
// TODO: could maybe look further than just one successor hop.
clear(nextBlock)
for _, e := range b.Succs {
if len(e.b.Preds) > 1 {
continue
}
for _, v := range e.b.Values {
info := pairableLoads[v.Op]
if info.width == 0 {
continue
}
if !offsetOk(v.Aux, v.AuxInt, info.width) {
continue // not advisable
}
nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v
}
}
if len(nextBlock) == 0 {
continue
}
// don't move too many loads. Each requires a register across a basic block boundary.
const maxMoved = 4
nMoved := 0
for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- {
x := b.Values[i]
info := pairableLoads[x.Op]
if info.width == 0 {
continue
}
if !offsetOk(x.Aux, x.AuxInt, info.width) {
continue // not advisable
}
key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux}
if y := nextBlock[key]; y != nil {
delete(nextBlock, key)
// Make the 2-register load.
load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
// Modify x to be (Select0 load).
x.reset(OpSelect0)
x.SetArgs1(load)
// Modify y to be (Copy (Select1 load)).
// Note: the Select* needs to live in the load's block, not y's block.
y.reset(OpCopy)
y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load))
nMoved++
continue
}
key.auxInt = x.AuxInt - info.width
if y := nextBlock[key]; y != nil {
delete(nextBlock, key)
// Make the 2-register load.
load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1])
// Modify x to be (Select1 load).
x.reset(OpSelect1)
x.SetArgs1(load)
// Modify y to be (Copy (Select0 load)).
y.reset(OpCopy)
y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load))
nMoved++
continue
}
}
}
}
func memoryBarrierTest(b *Block) bool {
if b.Kind != BlockARM64NZW {
return false
}
c := b.Controls[0]
if c.Op != OpARM64MOVWUload {
return false
}
if globl, ok := c.Aux.(*obj.LSym); ok {
return globl.Name == "runtime.writeBarrier"
}
return false
}
// pairStores merges store instructions.
// It collects stores into a buffer where they can be freely reordered.
// When encountering an instruction that cannot be added to the buffer,
// it pairs the accumulated stores, flushes the buffer, and continues processing.
func pairStores(f *Func) {
last := f.Cache.allocBoolSlice(f.NumValues())
defer f.Cache.freeBoolSlice(last)
// memChain contains a list of stores with the same ptr/aux pair and
// nonoverlapping write ranges [AuxInt:AuxInt+writeSize]. All of the
// elements of memChain can be reordered with each other.
memChain := []*Value{}
// Limit of length of memChain array.
// This keeps us in O(n) territory.
limit := 100
// flushMemChain sorts the stores in memChain and merges them when possible.
// Then it flushes memChain.
flushMemChain := func() {
if len(memChain) < 2 {
memChain = memChain[:0]
return
}
// Sort in increasing AuxInt to put pairable stores together.
slices.SortFunc(memChain, func(x, y *Value) int {
return int(x.AuxInt - y.AuxInt)
})
lastIdx := len(memChain) - 1
for i := 0; i < lastIdx; i++ {
v := memChain[i]
w := memChain[i+1]
info := pairableStores[v.Op]
off := v.AuxInt
mem := v.MemoryArg()
aux := v.Aux
pos := v.Pos
wmem := w.MemoryArg()
if w.Op == v.Op && w.AuxInt == off+info.width {
// Arguments for the merged store: ptr, val1, val2, mem.
args := []*Value{v.Args[0], v.Args[1], w.Args[1], mem}
v.reset(info.pair)
v.AddArgs(args...)
v.Aux = aux
v.AuxInt = off
v.Pos = pos
// Make w just a memory copy.
w.reset(OpCopy)
w.SetArgs1(wmem)
// Skip merged store (w)
i++
}
}
memChain = memChain[:0]
}
// prevStore returns the previous store in the
// same block, or nil if there are none.
prevStore := func(v *Value) *Value {
if v.Op == OpInitMem || v.Op == OpPhi {
return nil
}
m := v.MemoryArg()
if m.Block != v.Block {
return nil
}
return m
}
// storeWidth returns the width of store,
// or 0 if it is not a store this pass understands.
storeWidth := func(op Op) int64 {
var width int64
switch op {
case OpARM64MOVDstore, OpARM64FMOVDstore:
width = 8
case OpARM64MOVWstore, OpARM64FMOVSstore:
width = 4
case OpARM64MOVHstore:
width = 2
case OpARM64MOVBstore:
width = 1
default:
width = 0
}
return width
}
for _, b := range f.Blocks {
memChain = memChain[:0]
// Find last store in block, so we can
// walk the stores last to first.
// Last to first helps ensure that the rewrites we
// perform do not get in the way of subsequent rewrites.
for _, v := range b.Values {
if v.Type.IsMemory() {
last[v.ID] = true
}
}
for _, v := range b.Values {
if v.Type.IsMemory() {
if m := prevStore(v); m != nil {
last[m.ID] = false
}
}
}
var lastMem *Value
for _, v := range b.Values {
if last[v.ID] {
lastMem = v
break
}
}
// Iterate over memory stores, accumulating them in memChain for potential merging.
// Flush the chain when reordering is unsafe or a conflict is detected.
for v := lastMem; v != nil; v = prevStore(v) {
writeSize := storeWidth(v.Op)
if writeSize == 0 {
// We can't reorder stores with calls or other instructions
// with writeSize == 0.
flushMemChain()
continue
}
if v.Uses != 1 && len(memChain) > 0 ||
len(memChain) > 0 && (v.Args[0] != memChain[0].Args[0] || v.Aux != memChain[0].Aux) ||
len(memChain) == limit {
// If v has multiple uses and it is not the latest store in the chain,
// we cannot merge it with other store instructions.
// If v has a different base pointer or Aux value from the current chain,
// we need to flush memChain and start a new one with v.
// If memChain length limit is exceeded, we also need to flush the chain
// and start a new one with v.
// Only look back so far.
// This keeps us in O(n) territory, and it
// also prevents us from keeping values
// in registers for too long (and thus
// needing to spill them).
flushMemChain()
}
for _, w := range memChain {
wWriteSize := storeWidth(w.Op)
if overlap(w.AuxInt, wWriteSize, v.AuxInt, writeSize) {
// Aliases with w's location.
// Flush the chain and start a new one with v.
flushMemChain()
break
}
}
memChain = append(memChain, v)
}
flushMemChain()
}
}