| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package ssa |
| |
| import ( |
| "fmt" |
| "internal/goarch" |
| "slices" |
| ) |
| |
| var truthTableValues [3]uint8 = [3]uint8{0b1111_0000, 0b1100_1100, 0b1010_1010} |
| |
| func (slop SIMDLogicalOP) String() string { |
| if slop == sloInterior { |
| return "leaf" |
| } |
| interior := "" |
| if slop&sloInterior != 0 { |
| interior = "+interior" |
| } |
| switch slop &^ sloInterior { |
| case sloAnd: |
| return "and" + interior |
| case sloXor: |
| return "xor" + interior |
| case sloOr: |
| return "or" + interior |
| case sloAndNot: |
| return "andNot" + interior |
| case sloNot: |
| return "not" + interior |
| } |
| return "wrong" |
| } |
| |
| func rewriteTern(f *Func) { |
| if f.maxCPUFeatures == CPUNone { |
| return |
| } |
| |
| arch := f.Config.Ctxt().Arch.Family |
| // TODO there are other SIMD architectures |
| if arch != goarch.AMD64 { |
| return |
| } |
| |
| boolExprTrees := make(map[*Value]SIMDLogicalOP) |
| |
| // Find logical-expr expression trees, including leaves. |
| // interior nodes will be marked sloInterior, |
| // root nodes will not be marked sloInterior, |
| // leaf nodes are only marked sloInterior. |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| slo := classifyBooleanSIMD(v) |
| switch slo { |
| case sloOr, |
| sloAndNot, |
| sloXor, |
| sloAnd: |
| boolExprTrees[v.Args[1]] |= sloInterior |
| fallthrough |
| case sloNot: |
| boolExprTrees[v.Args[0]] |= sloInterior |
| boolExprTrees[v] |= slo |
| } |
| } |
| } |
| |
| // get a canonical sorted set of roots |
| var roots []*Value |
| for v, slo := range boolExprTrees { |
| if f.pass.debug > 1 { |
| f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo) |
| } |
| |
| if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) { |
| roots = append(roots, v) |
| } |
| } |
| slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow. |
| |
| // This rewrite works by iterating over the root set. |
| // For each boolean expression, it walks the expression |
| // bottom up accumulating sets of variables mentioned in |
| // subexpressions, lazy-greedily finding the largest subexpressions |
| // of 3 inputs that can be rewritten to use ternary-truth-table instructions. |
| |
| // rewrite recursively attempts to replace v and v's subexpressions with |
| // ternary-logic truth-table operations, returning a set of not more than 3 |
| // subexpressions within v that may be combined into a parent's replacement. |
| // V need not have the CPU features that allow a ternary-logic operation; |
| // in that case, v will not be rewritten. Replacements also require |
| // exactly 3 different variable inputs to a boolean expression. |
| // |
| // Given the CPU feature and 3 inputs, v is replaced in the following |
| // cases: |
| // |
| // 1) v is a root |
| // 2) u = NOT(v) and u lacks the CPU feature |
| // 3) u = OP(v, w) and u lacks the CPU feature |
| // 4) u = OP(v, w) and u has more than 3 variable inputs. var rewrite func(v *Value) [3]*Value |
| var rewrite func(v *Value) [3]*Value |
| |
| // computeTT returns the truth table for a boolean expression |
| // over the variables in vars, where vars[0] varies slowest in |
| // the truth table and vars[2] varies fastest. |
| // e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns |
| // (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0 |
| // x: 0 0 0 0 1 1 1 1 |
| // y: 0 0 1 1 0 0 1 1 |
| // z: 0 1 0 1 0 1 0 1 |
| var computeTT func(v *Value, vars [3]*Value) uint8 |
| |
| // combine two sets of variables into one, returning ok/not |
| // if the two sets contained 3 or fewer elements. Combine |
| // ensures that the sets of Values never contain duplicates. |
| // (Duplicates would create less-efficient code, not incorrect code.) |
| combine := func(a, b [3]*Value) ([3]*Value, bool) { |
| var c [3]*Value |
| i := 0 |
| for _, v := range a { |
| if v == nil { |
| break |
| } |
| c[i] = v |
| i++ |
| } |
| bloop: |
| for _, v := range b { |
| if v == nil { |
| break |
| } |
| for _, u := range a { |
| if v == u { |
| continue bloop |
| } |
| } |
| if i == 3 { |
| return [3]*Value{}, false |
| } |
| c[i] = v |
| i++ |
| } |
| return c, true |
| } |
| |
| computeTT = func(v *Value, vars [3]*Value) uint8 { |
| i := 0 |
| for ; i < len(vars); i++ { |
| if vars[i] == v { |
| return truthTableValues[i] |
| } |
| } |
| slo := boolExprTrees[v] &^ sloInterior |
| a := computeTT(v.Args[0], vars) |
| switch slo { |
| case sloNot: |
| return ^a |
| case sloAnd: |
| return a & computeTT(v.Args[1], vars) |
| case sloXor: |
| return a ^ computeTT(v.Args[1], vars) |
| case sloOr: |
| return a | computeTT(v.Args[1], vars) |
| case sloAndNot: |
| return a & ^computeTT(v.Args[1], vars) |
| } |
| panic("switch should have covered all cases, or unknown var in logical expression") |
| } |
| |
| replace := func(a0 *Value, vars0 [3]*Value) { |
| imm := computeTT(a0, vars0) |
| op := ternOpForLogical(a0.Op) |
| if op == a0.Op { |
| panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString())) |
| } |
| if f.pass.debug > 0 { |
| f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm, |
| vars0[0], vars0[1], vars0[2]) |
| } |
| a0.reset(op) |
| a0.SetArgs3(vars0[0], vars0[1], vars0[2]) |
| a0.AuxInt = int64(int8(imm)) |
| } |
| |
| // addOne ensures the no-duplicates addition of a single value |
| // to a set that is not full. It seems possible that a shared |
| // subexpression in tricky combination with blocks lacking the |
| // AVX512 feature might permit this. |
| addOne := func(vars [3]*Value, v *Value) [3]*Value { |
| if vars[2] != nil { |
| panic("rewriteTern.addOne, vars[2] should be nil") |
| } |
| if v == vars[0] || v == vars[1] { |
| return vars |
| } |
| if vars[1] == nil { |
| vars[1] = v |
| } else { |
| vars[2] = v |
| } |
| return vars |
| } |
| |
| rewrite = func(v *Value) [3]*Value { |
| slo := boolExprTrees[v] |
| if slo == sloInterior { // leaf node, i.e., a "variable" |
| return [3]*Value{v, nil, nil} |
| } |
| var vars [3]*Value |
| hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512) |
| if slo&sloNot == sloNot { |
| vars = rewrite(v.Args[0]) |
| if !hasFeature { |
| if vars[2] != nil { |
| replace(v.Args[0], vars) |
| return [3]*Value{v, nil, nil} |
| } |
| return vars |
| } |
| } else { |
| var ok bool |
| a0, a1 := v.Args[0], v.Args[1] |
| vars0 := rewrite(a0) |
| vars1 := rewrite(a1) |
| vars, ok = combine(vars0, vars1) |
| |
| if f.pass.debug > 1 { |
| f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok) |
| } |
| |
| if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) { |
| // too many variables, or cannot rewrite current values. |
| // rewrite one or both subtrees if possible |
| if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) { |
| replace(a0, vars0) |
| } |
| if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) { |
| replace(a1, vars1) |
| } |
| |
| // 3-element var arrays are either rewritten, or unable to be rewritten |
| // because of the features in effect in their block. Either way, they |
| // are treated as a "new var" if 3 elements are present. |
| |
| if vars0[2] == nil { |
| if vars1[2] == nil { |
| // both subtrees are 2-element and were not rewritten. |
| // |
| // TODO a clever person would look at subtrees of inputs, |
| // e.g. rewrite |
| // ((a AND b) XOR b) XOR (d XOR (c AND d)) |
| // to (((a AND b) XOR b) XOR d) XOR (c AND d) |
| // to v = TERNLOG(truthtable, a, b, d) XOR (c AND d) |
| // and return the variable set {v, c, d} |
| // |
| // But for now, just restart with a0 and a1. |
| return [3]*Value{a0, a1, nil} |
| } else { |
| // a1 (maybe) rewrote, a0 has room for another var |
| vars = addOne(vars0, a1) |
| } |
| } else if vars1[2] == nil { |
| // a0 (maybe) rewrote, a1 has room for another var |
| vars = addOne(vars1, a0) |
| } else if !ok { |
| // both (maybe) rewrote |
| // a0 and a1 are different because otherwise their variable |
| // sets would have combined "ok". |
| return [3]*Value{a0, a1, nil} |
| } |
| // continue with either the vars from "ok" or the updated set of vars. |
| } |
| } |
| // if root and 3 vars and hasFeature, rewrite. |
| if slo&sloInterior == 0 && vars[2] != nil && hasFeature { |
| replace(v, vars) |
| return [3]*Value{v, nil, nil} |
| } |
| return vars |
| } |
| |
| for _, v := range roots { |
| if f.pass.debug > 1 { |
| f.Warnl(v.Pos, "SLO root %s", v.LongString()) |
| } |
| rewrite(v) |
| } |
| } |