// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
"cmd/compile/internal/base"
"cmd/compile/internal/types"
"cmd/internal/src"
"cmp"
"slices"
)

// memcombine combines smaller loads and stores into larger ones.
// We ensure this generates good code for encoding/binary operations.
// It may help other cases also.
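// For example (illustrative, little-endian case), a byte-wise decode such as
//
//	uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//
// (the pattern produced by, e.g., encoding/binary's LittleEndian.Uint32)
// can be replaced by a single 4-byte load, and the corresponding byte-wise
// encode by a single 4-byte store.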
func memcombine(f *Func) {
// This optimization requires that the architecture has
// unaligned loads and unaligned stores.
if !f.Config.unalignedOK {
return
}
memcombineLoads(f)
memcombineStores(f)
}
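
// memcombineLoads combines smaller loads into larger ones by looking for
// OR trees whose leaves are zero-extended (and possibly shifted) loads
// from consecutive memory locations.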
func memcombineLoads(f *Func) {
// Find "OR trees" to start with.
mark := f.newSparseSet(f.NumValues())
defer f.retSparseSet(mark)
var order []*Value
// Mark all values that are the argument of an OR.
for _, b := range f.Blocks {
for _, v := range b.Values {
if v.Op == OpOr16 || v.Op == OpOr32 || v.Op == OpOr64 {
mark.add(v.Args[0].ID)
mark.add(v.Args[1].ID)
}
}
}
for _, b := range f.Blocks {
order = order[:0]
for _, v := range b.Values {
if v.Op != OpOr16 && v.Op != OpOr32 && v.Op != OpOr64 {
continue
}
if mark.contains(v.ID) {
				// v is marked, so it is the argument of another OR and
				// therefore not the root of an OR tree.
continue
}
// Add the OR tree rooted at v to the order.
// We use BFS here, but any walk that puts roots before leaves would work.
i := len(order)
order = append(order, v)
for ; i < len(order); i++ {
x := order[i]
for j := 0; j < 2; j++ {
a := x.Args[j]
if a.Op == OpOr16 || a.Op == OpOr32 || a.Op == OpOr64 {
order = append(order, a)
}
}
}
}
for _, v := range order {
max := f.Config.RegSize
switch v.Op {
case OpOr64:
case OpOr32:
max = 4
case OpOr16:
max = 2
default:
continue
}
for n := max; n > 1; n /= 2 {
if combineLoads(v, n) {
break
}
}
}
}
}

// A BaseAddress represents the address ptr+idx, where
// ptr is a pointer type and idx is an integer type.
// idx may be nil, in which case it is treated as 0.
type BaseAddress struct {
ptr *Value
idx *Value
}

// splitPtr returns the base address of ptr and any
// constant offset from that base.
// BaseAddress{ptr,nil},0 is always a valid result, but splitPtr
// tries to peel away as many constants into off as possible.
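// For example, splitting (OffPtr [8] (AddPtr p (Add64 i (Const64 [16]))))
// yields BaseAddress{ptr: p, idx: i} and an offset of 24.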
func splitPtr(ptr *Value) (BaseAddress, int64) {
var idx *Value
var off int64
for {
if ptr.Op == OpOffPtr {
off += ptr.AuxInt
ptr = ptr.Args[0]
} else if ptr.Op == OpAddPtr {
if idx != nil {
// We have two or more indexing values.
// Pick the first one we found.
return BaseAddress{ptr: ptr, idx: idx}, off
}
idx = ptr.Args[1]
if idx.Op == OpAdd32 || idx.Op == OpAdd64 {
if idx.Args[0].Op == OpConst32 || idx.Args[0].Op == OpConst64 {
off += idx.Args[0].AuxInt
idx = idx.Args[1]
} else if idx.Args[1].Op == OpConst32 || idx.Args[1].Op == OpConst64 {
off += idx.Args[1].AuxInt
idx = idx.Args[0]
}
}
ptr = ptr.Args[0]
} else {
return BaseAddress{ptr: ptr, idx: idx}, off
}
}
}
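
// combineLoads tries to combine the OR tree rooted at root into a single
// load whose width is n times the width of the individual loads.
// It reports whether it succeeded.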
func combineLoads(root *Value, n int64) bool {
orOp := root.Op
var shiftOp Op
switch orOp {
case OpOr64:
shiftOp = OpLsh64x64
case OpOr32:
shiftOp = OpLsh32x64
case OpOr16:
shiftOp = OpLsh16x64
default:
return false
}
// Find n values that are ORed together with the above op.
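	// Flatten the OR tree rooted at root: whenever a[i] is itself an OR,
	// replace it with one argument, append the other, and revisit index i.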
a := make([]*Value, 0, 8)
a = append(a, root)
for i := 0; i < len(a) && int64(len(a)) < n; i++ {
v := a[i]
if v.Uses != 1 && v != root {
// Something in this subtree is used somewhere else.
return false
}
if v.Op == orOp {
a[i] = v.Args[0]
a = append(a, v.Args[1])
i--
}
}
if int64(len(a)) != n {
return false
}
	// Check the first entry to see what ops we're looking for.
// All the entries should be of the form shift(extend(load)), maybe with no shift.
v := a[0]
if v.Op == shiftOp {
v = v.Args[0]
}
var extOp Op
if orOp == OpOr64 && (v.Op == OpZeroExt8to64 || v.Op == OpZeroExt16to64 || v.Op == OpZeroExt32to64) ||
orOp == OpOr32 && (v.Op == OpZeroExt8to32 || v.Op == OpZeroExt16to32) ||
orOp == OpOr16 && v.Op == OpZeroExt8to16 {
extOp = v.Op
v = v.Args[0]
} else {
return false
}
if v.Op != OpLoad {
return false
}
base, _ := splitPtr(v.Args[0])
mem := v.Args[1]
size := v.Type.Size()
if root.Block.Func.Config.arch == "S390X" {
// s390x can't handle unaligned accesses to global variables.
if base.ptr.Op == OpAddr {
return false
}
}
// Check all the entries, extract useful info.
type LoadRecord struct {
load *Value
offset int64 // offset of load address from base
shift int64
}
r := make([]LoadRecord, n, 8)
for i := int64(0); i < n; i++ {
v := a[i]
if v.Uses != 1 {
return false
}
shift := int64(0)
if v.Op == shiftOp {
if v.Args[1].Op != OpConst64 {
return false
}
shift = v.Args[1].AuxInt
v = v.Args[0]
if v.Uses != 1 {
return false
}
}
if v.Op != extOp {
return false
}
load := v.Args[0]
if load.Op != OpLoad {
return false
}
if load.Uses != 1 {
return false
}
if load.Args[1] != mem {
return false
}
p, off := splitPtr(load.Args[0])
if p != base {
return false
}
r[i] = LoadRecord{load: load, offset: off, shift: shift}
}
// Sort in memory address order.
slices.SortFunc(r, func(a, b LoadRecord) int {
return cmp.Compare(a.offset, b.offset)
})
// Check that we have contiguous offsets.
for i := int64(0); i < n; i++ {
if r[i].offset != r[0].offset+i*size {
return false
}
}
// Check for reads in little-endian or big-endian order.
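	// E.g. combining four 1-byte loads in little-endian order, the loads at
	// increasing addresses carry shifts shift0, shift0+8, shift0+16, shift0+24;
	// big-endian order reverses the progression.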
shift0 := r[0].shift
isLittleEndian := true
for i := int64(0); i < n; i++ {
if r[i].shift != shift0+i*size*8 {
isLittleEndian = false
break
}
}
isBigEndian := true
for i := int64(0); i < n; i++ {
if r[i].shift != shift0-i*size*8 {
isBigEndian = false
break
}
}
if !isLittleEndian && !isBigEndian {
return false
}
// Find a place to put the new load.
// This is tricky, because it has to be at a point where
// its memory argument is live. We can't just put it in root.Block.
// We use the block of the latest load.
loads := make([]*Value, n, 8)
for i := int64(0); i < n; i++ {
loads[i] = r[i].load
}
loadBlock := mergePoint(root.Block, loads...)
if loadBlock == nil {
return false
}
// Find a source position to use.
pos := src.NoXPos
for _, load := range loads {
if load.Block == loadBlock {
pos = load.Pos
break
}
}
if pos == src.NoXPos {
return false
}
	// Check to see if we need a byte swap after loading.
needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
isBigEndian && !root.Block.Func.Config.BigEndian
if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
return false
}
// This is the commit point.
// First, issue load at lowest address.
v = loadBlock.NewValue2(pos, OpLoad, sizeType(n*size), r[0].load.Args[0], mem)
	// Byte swap if needed.
if needSwap {
v = byteSwap(loadBlock, pos, v)
}
// Extend if needed.
if n*size < root.Type.Size() {
v = zeroExtend(loadBlock, pos, v, n*size, root.Type.Size())
}
// Shift if needed.
if isLittleEndian && shift0 != 0 {
v = leftShift(loadBlock, pos, v, shift0)
}
if isBigEndian && shift0-(n-1)*size*8 != 0 {
v = leftShift(loadBlock, pos, v, shift0-(n-1)*size*8)
}
// Install with (Copy v).
root.reset(OpCopy)
root.AddArg(v)
// Clobber the loads, just to prevent additional work being done on
// subtrees (which are now unreachable).
for i := int64(0); i < n; i++ {
clobber(r[i].load)
}
return true
}
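
// memcombineStores combines smaller stores into larger ones by examining
// each chain of stores in a block, latest store first.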
func memcombineStores(f *Func) {
mark := f.newSparseSet(f.NumValues())
defer f.retSparseSet(mark)
var order []*Value
for _, b := range f.Blocks {
// Mark all stores which are not last in a store sequence.
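		// (A store is marked when its result memory is the memory argument of
		// another store in this block, i.e. it is not the last of its chain.)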
mark.clear()
for _, v := range b.Values {
if v.Op == OpStore {
mark.add(v.MemoryArg().ID)
}
}
		// Pick an order for visiting stores such that
		// later stores come earlier in the ordering.
order = order[:0]
for _, v := range b.Values {
if v.Op != OpStore {
continue
}
if mark.contains(v.ID) {
continue // not last in a chain of stores
}
for {
order = append(order, v)
v = v.Args[2]
if v.Block != b || v.Op != OpStore {
break
}
}
}
// Look for combining opportunities at each store in queue order.
for _, v := range order {
if v.Op != OpStore { // already rewritten
continue
}
size := v.Aux.(*types.Type).Size()
if size >= f.Config.RegSize || size == 0 {
continue
}
for n := f.Config.RegSize / size; n > 1; n /= 2 {
if combineStores(v, n) {
continue
}
}
}
}
}

// Try to combine the n stores ending in root.
// Returns true if successful.
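// combineStores recognizes three source patterns: stores of constants,
// stores of consecutive loads, and stores of shifted/truncated pieces of a
// single value.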
func combineStores(root *Value, n int64) bool {
// Helper functions.
type StoreRecord struct {
store *Value
offset int64
}
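	// getShiftBase inspects the values stored by the first two stores. If both
	// are truncations of (possibly right-shifted) copies of one common value,
	// it returns that value; otherwise it returns nil.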
getShiftBase := func(a []StoreRecord) *Value {
x := a[0].store.Args[1]
y := a[1].store.Args[1]
switch x.Op {
case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
x = x.Args[0]
default:
return nil
}
switch y.Op {
case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
y = y.Args[0]
default:
return nil
}
var x2 *Value
switch x.Op {
case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
x2 = x.Args[0]
default:
}
var y2 *Value
switch y.Op {
case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
y2 = y.Args[0]
default:
}
if y2 == x {
// a shift of x and x itself.
return x
}
if x2 == y {
// a shift of y and y itself.
return y
}
if x2 == y2 {
// 2 shifts both of the same argument.
return x2
}
return nil
}
isShiftBase := func(v, base *Value) bool {
val := v.Args[1]
switch val.Op {
case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
val = val.Args[0]
default:
return false
}
if val == base {
return true
}
switch val.Op {
case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
val = val.Args[0]
default:
return false
}
return val == base
}
shift := func(v, base *Value) int64 {
val := v.Args[1]
switch val.Op {
case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
val = val.Args[0]
default:
return -1
}
if val == base {
return 0
}
switch val.Op {
case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
val = val.Args[1]
default:
return -1
}
if val.Op != OpConst64 {
return -1
}
return val.AuxInt
}
// Element size of the individual stores.
size := root.Aux.(*types.Type).Size()
if size*n > root.Block.Func.Config.RegSize {
return false
}
// Gather n stores to look at. Check easy conditions we require.
a := make([]StoreRecord, 0, 8)
rbase, roff := splitPtr(root.Args[0])
if root.Block.Func.Config.arch == "S390X" {
// s390x can't handle unaligned accesses to global variables.
if rbase.ptr.Op == OpAddr {
return false
}
}
a = append(a, StoreRecord{root, roff})
for i, x := int64(1), root.Args[2]; i < n; i, x = i+1, x.Args[2] {
if x.Op != OpStore {
return false
}
if x.Block != root.Block {
return false
}
if x.Uses != 1 { // Note: root can have more than one use.
return false
}
if x.Aux.(*types.Type).Size() != size {
// TODO: the constant source and consecutive load source cases
// do not need all the stores to be the same size.
return false
}
base, off := splitPtr(x.Args[0])
if base != rbase {
return false
}
a = append(a, StoreRecord{x, off})
}
// Before we sort, grab the memory arg the result should have.
mem := a[n-1].store.Args[2]
// Also grab position of first store (last in array = first in memory order).
pos := a[n-1].store.Pos
// Sort stores in increasing address order.
slices.SortFunc(a, func(sr1, sr2 StoreRecord) int {
return cmp.Compare(sr1.offset, sr2.offset)
})
// Check that everything is written to sequential locations.
for i := int64(0); i < n; i++ {
if a[i].offset != a[0].offset+i*size {
return false
}
}
// Memory location we're going to write at (the lowest one).
ptr := a[0].store.Args[0]
	// Check for constant stores.
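	// E.g. four 1-byte stores of the constants 1, 2, 3, 4 to consecutive
	// addresses become a single 4-byte store of 0x04030201 (little-endian).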
isConst := true
for i := int64(0); i < n; i++ {
switch a[i].store.Args[1].Op {
case OpConst32, OpConst16, OpConst8, OpConstBool:
default:
isConst = false
		}
		if !isConst {
			break
		}
}
if isConst {
// Modify root to do all the stores.
var c int64
mask := int64(1)<<(8*size) - 1
for i := int64(0); i < n; i++ {
s := 8 * size * int64(i)
if root.Block.Func.Config.BigEndian {
s = 8*size*(n-1) - s
}
c |= (a[i].store.Args[1].AuxInt & mask) << s
}
var cv *Value
switch size * n {
case 2:
cv = root.Block.Func.ConstInt16(types.Types[types.TUINT16], int16(c))
case 4:
cv = root.Block.Func.ConstInt32(types.Types[types.TUINT32], int32(c))
case 8:
cv = root.Block.Func.ConstInt64(types.Types[types.TUINT64], c)
}
// Move all the stores to the root.
for i := int64(0); i < n; i++ {
v := a[i].store
if v == root {
v.Aux = cv.Type // widen store type
v.Pos = pos
v.SetArg(0, ptr)
v.SetArg(1, cv)
v.SetArg(2, mem)
} else {
clobber(v)
v.Type = types.Types[types.TBOOL] // erase memory type
}
}
return true
}
// Check for consecutive loads as the source of the stores.
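	// E.g. a byte-by-byte copy p[0]=q[0]; p[1]=q[1]; p[2]=q[2]; p[3]=q[3]
	// becomes a single 4-byte load feeding a single 4-byte store.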
var loadMem *Value
var loadBase BaseAddress
var loadIdx int64
for i := int64(0); i < n; i++ {
load := a[i].store.Args[1]
if load.Op != OpLoad {
loadMem = nil
break
}
if load.Uses != 1 {
loadMem = nil
break
}
if load.Type.IsPtr() {
// Don't combine stores containing a pointer, as we need
// a write barrier for those. This can't currently happen,
// but might in the future if we ever have another
// 8-byte-reg/4-byte-ptr architecture like amd64p32.
loadMem = nil
break
}
mem := load.Args[1]
base, idx := splitPtr(load.Args[0])
if loadMem == nil {
// First one we found
loadMem = mem
loadBase = base
loadIdx = idx
continue
}
if base != loadBase || mem != loadMem {
loadMem = nil
break
}
if idx != loadIdx+(a[i].offset-a[0].offset) {
loadMem = nil
break
}
}
if loadMem != nil {
// Modify the first load to do a larger load instead.
load := a[0].store.Args[1]
switch size * n {
case 2:
load.Type = types.Types[types.TUINT16]
case 4:
load.Type = types.Types[types.TUINT32]
case 8:
load.Type = types.Types[types.TUINT64]
}
// Modify root to do the store.
for i := int64(0); i < n; i++ {
v := a[i].store
if v == root {
v.Aux = load.Type // widen store type
v.Pos = pos
v.SetArg(0, ptr)
v.SetArg(1, load)
v.SetArg(2, mem)
} else {
clobber(v)
v.Type = types.Types[types.TBOOL] // erase memory type
}
}
return true
}
// Check that all the shift/trunc are of the same base value.
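	// E.g. for p[0] = byte(x); p[1] = byte(x>>8); p[2] = byte(x>>16);
	// p[3] = byte(x>>24), every stored value is a piece of x, so x is the
	// shift base.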
shiftBase := getShiftBase(a)
if shiftBase == nil {
return false
}
for i := int64(0); i < n; i++ {
if !isShiftBase(a[i].store, shiftBase) {
return false
}
}
// Check for writes in little-endian or big-endian order.
isLittleEndian := true
shift0 := shift(a[0].store, shiftBase)
for i := int64(1); i < n; i++ {
if shift(a[i].store, shiftBase) != shift0+i*size*8 {
isLittleEndian = false
break
}
}
isBigEndian := true
for i := int64(1); i < n; i++ {
if shift(a[i].store, shiftBase) != shift0-i*size*8 {
isBigEndian = false
break
}
}
if !isLittleEndian && !isBigEndian {
return false
}
// Check to see if we need byte swap before storing.
needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
isBigEndian && !root.Block.Func.Config.BigEndian
if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
return false
}
// This is the commit point.
// Modify root to do all the stores.
sv := shiftBase
if isLittleEndian && shift0 != 0 {
sv = rightShift(root.Block, root.Pos, sv, shift0)
}
if isBigEndian && shift0-(n-1)*size*8 != 0 {
sv = rightShift(root.Block, root.Pos, sv, shift0-(n-1)*size*8)
}
if sv.Type.Size() > size*n {
sv = truncate(root.Block, root.Pos, sv, sv.Type.Size(), size*n)
}
if needSwap {
sv = byteSwap(root.Block, root.Pos, sv)
}
// Move all the stores to the root.
for i := int64(0); i < n; i++ {
v := a[i].store
if v == root {
v.Aux = sv.Type // widen store type
v.Pos = pos
v.SetArg(0, ptr)
v.SetArg(1, sv)
v.SetArg(2, mem)
} else {
clobber(v)
v.Type = types.Types[types.TBOOL] // erase memory type
}
}
return true
}
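
// sizeType returns the unsigned integer type with the given size in bytes.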
func sizeType(size int64) *types.Type {
switch size {
case 8:
return types.Types[types.TUINT64]
case 4:
return types.Types[types.TUINT32]
case 2:
return types.Types[types.TUINT16]
default:
base.Fatalf("bad size %d\n", size)
return nil
}
}
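
// truncate returns v truncated from "from" bytes to "to" bytes.
// The switch keys encode the size pair as from*10+to (e.g. 84 is 8 -> 4 bytes).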
func truncate(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
switch from*10 + to {
case 82:
return b.NewValue1(pos, OpTrunc64to16, types.Types[types.TUINT16], v)
case 84:
return b.NewValue1(pos, OpTrunc64to32, types.Types[types.TUINT32], v)
case 42:
return b.NewValue1(pos, OpTrunc32to16, types.Types[types.TUINT16], v)
default:
base.Fatalf("bad sizes %d %d\n", from, to)
return nil
}
}
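
// zeroExtend returns v zero-extended from "from" bytes to "to" bytes,
// using the same from*10+to size encoding as truncate.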
func zeroExtend(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
switch from*10 + to {
case 24:
return b.NewValue1(pos, OpZeroExt16to32, types.Types[types.TUINT32], v)
case 28:
return b.NewValue1(pos, OpZeroExt16to64, types.Types[types.TUINT64], v)
case 48:
return b.NewValue1(pos, OpZeroExt32to64, types.Types[types.TUINT64], v)
default:
base.Fatalf("bad sizes %d %d\n", from, to)
return nil
}
}
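
// leftShift returns v << shift, using a shift op matching v's size.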
func leftShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
size := v.Type.Size()
switch size {
case 8:
return b.NewValue2(pos, OpLsh64x64, v.Type, v, s)
case 4:
return b.NewValue2(pos, OpLsh32x64, v.Type, v, s)
case 2:
return b.NewValue2(pos, OpLsh16x64, v.Type, v, s)
default:
base.Fatalf("bad size %d\n", size)
return nil
}
}
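
// rightShift returns v >> shift (unsigned), using a shift op matching v's size.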
func rightShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
size := v.Type.Size()
switch size {
case 8:
return b.NewValue2(pos, OpRsh64Ux64, v.Type, v, s)
case 4:
return b.NewValue2(pos, OpRsh32Ux64, v.Type, v, s)
case 2:
return b.NewValue2(pos, OpRsh16Ux64, v.Type, v, s)
default:
base.Fatalf("bad size %d\n", size)
return nil
}
}
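
// byteSwap returns v with its bytes reversed.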
func byteSwap(b *Block, pos src.XPos, v *Value) *Value {
switch v.Type.Size() {
case 8:
return b.NewValue1(pos, OpBswap64, v.Type, v)
case 4:
return b.NewValue1(pos, OpBswap32, v.Type, v)
case 2:
return b.NewValue1(pos, OpBswap16, v.Type, v)
default:
v.Fatalf("bad size %d\n", v.Type.Size())
return nil
}
}