blob: 5493e5f1092b08653c67ed04575938d4cf096592 [file] [log] [blame] [edit]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"fmt"
"internal/goarch"
"slices"
)
// truthTableValues[i] is the 8-row truth-table column for the i'th of
// three boolean inputs: bit k of entry i is the value that input i takes
// in row k. Input 0 varies slowest across the rows and input 2 varies
// fastest.
var truthTableValues = [...]uint8{
	0b1111_0000, // input 0
	0b1100_1100, // input 1
	0b1010_1010, // input 2
}
// String returns a short name for slop, for use in debugging output.
// A value that is exactly sloInterior prints as "leaf". Otherwise the
// result is the operation name ("and", "xor", "or", "andNot" or "not"),
// followed by "+interior" if the sloInterior bit is also set. Any other
// value prints as "wrong".
func (slop SIMDLogicalOP) String() string {
	if slop == sloInterior {
		return "leaf"
	}

	var name string
	switch slop &^ sloInterior {
	case sloAnd:
		name = "and"
	case sloXor:
		name = "xor"
	case sloOr:
		name = "or"
	case sloAndNot:
		name = "andNot"
	case sloNot:
		name = "not"
	default:
		return "wrong"
	}

	if slop&sloInterior != 0 {
		name += "+interior"
	}
	return name
}
// rewriteTern looks for expression trees built from the SIMD boolean
// operations recognized by classifyBooleanSIMD (and, or, xor, andNot, not)
// and replaces subtrees that depend on exactly three distinct inputs with
// the three-input truth-table operation chosen by ternOpForLogical. The
// truth table is stored in the rewritten value's AuxInt.
//
// Nothing is done when f.maxCPUFeatures is CPUNone or when the target
// architecture family is not AMD64. A value is only rewritten if its own
// block's CPUfeatures include CPUavx512.
func rewriteTern(f *Func) {
	if f.maxCPUFeatures == CPUNone {
		return
	}

	arch := f.Config.Ctxt().Arch.Family
	// TODO there are other SIMD architectures
	if arch != goarch.AMD64 {
		return
	}

	boolExprTrees := make(map[*Value]SIMDLogicalOP)

	// Find logical-expr expression trees, including leaves.
	// interior nodes will be marked sloInterior,
	// root nodes will not be marked sloInterior,
	// leaf nodes are only marked sloInterior.
	for _, b := range f.Blocks {
		for _, v := range b.Values {
			slo := classifyBooleanSIMD(v)
			switch slo {
			case sloOr,
				sloAndNot,
				sloXor,
				sloAnd:
				boolExprTrees[v.Args[1]] |= sloInterior
				fallthrough
			case sloNot:
				boolExprTrees[v.Args[0]] |= sloInterior
				boolExprTrees[v] |= slo
			}
		}
	}

	// get a canonical sorted set of roots
	var roots []*Value
	for v, slo := range boolExprTrees {
		if f.pass.debug > 1 {
			f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
		}

		if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
			roots = append(roots, v)
		}
	}
	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.

	// ternTables records every value that replace has already turned into
	// a truth-table operation, together with its 8-bit truth table.
	// boolExprTrees still holds the ORIGINAL classification of such a
	// value, which no longer describes its op or its arguments, so both
	// rewrite and computeTT must consult ternTables before trusting
	// boolExprTrees. This matters for shared subexpressions, which are
	// reached once per parent.
	ternTables := make(map[*Value]uint8)

	// This rewrite works by iterating over the root set.
	// For each boolean expression, it walks the expression
	// bottom up accumulating sets of variables mentioned in
	// subexpressions, lazy-greedily finding the largest subexpressions
	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.

	// rewrite recursively attempts to replace v and v's subexpressions with
	// ternary-logic truth-table operations, returning a set of not more than 3
	// subexpressions within v that may be combined into a parent's replacement.
	// V need not have the CPU features that allow a ternary-logic operation;
	// in that case, v will not be rewritten.  Replacements also require
	// exactly 3 different variable inputs to a boolean expression.
	//
	// Given the CPU feature and 3 inputs, v is replaced in the following
	// cases:
	//
	// 1) v is a root
	// 2) u = NOT(v) and u lacks the CPU feature
	// 3) u = OP(v, w) and u lacks the CPU feature
	// 4) u = OP(v, w) and u has more than 3 variable inputs.
	//
	// A value that has already been replaced is treated as a single
	// variable when it is reached again.
	var rewrite func(v *Value) [3]*Value

	// computeTT returns the truth table for a boolean expression
	// over the variables in vars, where vars[0] varies slowest in
	// the truth table and vars[2] varies fastest.
	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
	//
	//	x: 0 0 0 0 1 1 1 1
	//	y: 0 0 1 1 0 0 1 1
	//	z: 0 1 0 1 0 1 0 1
	var computeTT func(v *Value, vars [3]*Value) uint8

	// combine two sets of variables into one, returning ok/not
	// if the two sets contained 3 or fewer elements.  Combine
	// ensures that the sets of Values never contain duplicates.
	// (Duplicates would create less-efficient code, not incorrect code.)
	combine := func(a, b [3]*Value) ([3]*Value, bool) {
		var c [3]*Value
		i := 0
		for _, v := range a {
			if v == nil {
				break
			}
			c[i] = v
			i++
		}
	bloop:
		for _, v := range b {
			if v == nil {
				break
			}
			for _, u := range a {
				if v == u {
					continue bloop
				}
			}
			if i == 3 {
				return [3]*Value{}, false
			}
			c[i] = v
			i++
		}
		return c, true
	}

	computeTT = func(v *Value, vars [3]*Value) uint8 {
		// A variable's table is its fixed column. This check must come
		// first so that an already-replaced value that is itself one of
		// vars is treated as that variable.
		for i, x := range vars {
			if x == v {
				return truthTableValues[i]
			}
		}

		// v was replaced earlier and is not one of vars: it now computes
		// imm applied to its three current arguments, so compose imm with
		// the tables of those arguments. Row k of the result is bit
		// (t0[k]<<2 | t1[k]<<1 | t2[k]) of imm, matching the column
		// layout of truthTableValues.
		if imm, ok := ternTables[v]; ok {
			t0 := computeTT(v.Args[0], vars)
			t1 := computeTT(v.Args[1], vars)
			t2 := computeTT(v.Args[2], vars)
			var tt uint8
			for row := uint(0); row < 8; row++ {
				idx := ((t0>>row)&1)<<2 | ((t1>>row)&1)<<1 | ((t2>>row) & 1)
				tt |= ((imm >> idx) & 1) << row
			}
			return tt
		}

		// Classify before recursing, so that an unknown value reaches the
		// panic below instead of being walked through its arguments.
		switch boolExprTrees[v] &^ sloInterior {
		case sloNot:
			return ^computeTT(v.Args[0], vars)
		case sloAnd:
			return computeTT(v.Args[0], vars) & computeTT(v.Args[1], vars)
		case sloXor:
			return computeTT(v.Args[0], vars) ^ computeTT(v.Args[1], vars)
		case sloOr:
			return computeTT(v.Args[0], vars) | computeTT(v.Args[1], vars)
		case sloAndNot:
			return computeTT(v.Args[0], vars) &^ computeTT(v.Args[1], vars)
		}
		panic("switch should have covered all cases, or unknown var in logical expression")
	}

	// replace turns a0 into the truth-table operation over vars0 and
	// records it in ternTables. It is a no-op if a0 was already replaced,
	// which can happen when both operands of an operation are the same
	// value.
	replace := func(a0 *Value, vars0 [3]*Value) {
		if _, done := ternTables[a0]; done {
			return
		}
		imm := computeTT(a0, vars0)
		op := ternOpForLogical(a0.Op)
		if op == a0.Op {
			panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString()))
		}
		if f.pass.debug > 0 {
			f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
				vars0[0], vars0[1], vars0[2])
		}
		a0.reset(op)
		a0.SetArgs3(vars0[0], vars0[1], vars0[2])
		a0.AuxInt = int64(int8(imm))
		ternTables[a0] = imm
	}

	// addOne ensures the no-duplicates addition of a single value
	// to a set that is not full.  It seems possible that a shared
	// subexpression in tricky combination with blocks lacking the
	// AVX512 feature might permit this.
	addOne := func(vars [3]*Value, v *Value) [3]*Value {
		if vars[2] != nil {
			panic("rewriteTern.addOne, vars[2] should be nil")
		}
		if v == vars[0] || v == vars[1] {
			return vars
		}
		if vars[1] == nil {
			vars[1] = v
		} else {
			vars[2] = v
		}
		return vars
	}

	rewrite = func(v *Value) [3]*Value {
		slo := boolExprTrees[v]
		if slo == sloInterior { // leaf node, i.e., a "variable"
			return [3]*Value{v, nil, nil}
		}
		if _, done := ternTables[v]; done {
			// Already replaced via another parent. Its recorded
			// classification is stale, so treat it as a variable.
			return [3]*Value{v, nil, nil}
		}

		var vars [3]*Value
		hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
		if slo&sloNot == sloNot {
			a0 := v.Args[0]
			vars = rewrite(a0)
			if !hasFeature {
				if vars[2] != nil {
					// Only rewrite the operand if ITS block has the
					// feature, as the binary case below does. Either
					// way v becomes a new variable for its parent.
					if a0.Block.CPUfeatures.hasFeature(CPUavx512) {
						replace(a0, vars)
					}
					return [3]*Value{v, nil, nil}
				}
				return vars
			}
		} else {
			var ok bool
			a0, a1 := v.Args[0], v.Args[1]
			vars0 := rewrite(a0)
			vars1 := rewrite(a1)
			vars, ok = combine(vars0, vars1)

			if f.pass.debug > 1 {
				f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
			}

			if !(ok && hasFeature) {
				// too many variables, or cannot rewrite current values.
				// rewrite one or both subtrees if possible
				if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
					replace(a0, vars0)
				}
				if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
					replace(a1, vars1)
				}

				// 3-element var arrays are either rewritten, or unable to be rewritten
				// because of the features in effect in their block.  Either way, they
				// are treated as a "new var" if 3 elements are present.
				switch {
				case vars0[2] == nil && vars1[2] == nil:
					// both subtrees are 2-element and were not rewritten.
					//
					// TODO a clever person would look at subtrees of inputs,
					// e.g. rewrite
					//        ((a AND b) XOR b) XOR (d  XOR (c AND d))
					// to    (((a AND b) XOR b) XOR  d) XOR (c AND d)
					// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
					// and return the variable set {v, c, d}
					//
					// But for now, just restart with a0 and a1.
					return [3]*Value{a0, a1, nil}
				case vars0[2] == nil:
					// a1 (maybe) rewrote, a0 has room for another var
					vars = addOne(vars0, a1)
				case vars1[2] == nil:
					// a0 (maybe) rewrote, a1 has room for another var
					vars = addOne(vars1, a0)
				case !ok:
					// both (maybe) rewrote
					// a0 and a1 are different because otherwise their variable
					// sets would have combined "ok".
					return [3]*Value{a0, a1, nil}
				}
				// continue with either the vars from "ok" or the updated set of vars.
				// If a subtree was replaced but vars still names its inputs,
				// computeTT composes through the replaced value.
			}
		}

		// if root and 3 vars and hasFeature, rewrite.
		if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
			replace(v, vars)
			return [3]*Value{v, nil, nil}
		}
		return vars
	}

	for _, v := range roots {
		if f.pass.debug > 1 {
			f.Warnl(v.Pos, "SLO root %s", v.LongString())
		}
		rewrite(v)
	}
}