src/cmd/compile/internal/ssa/rewritetern.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package ssa

 import (
 	"fmt"
 	"internal/goarch"
 	"slices"
 )

 // truthTableValues[i] is the 8-bit truth-table column for input i of a
 // three-input boolean function.  Bit k of an entry is the value of that
 // input in row k of the table, where k = 4*input0 + 2*input1 + input2,
 // so input 0 changes slowest and input 2 changes fastest.
 var truthTableValues = [3]uint8{
 	0b1111_0000, // input 0: set in rows 4-7
 	0b1100_1100, // input 1: set in rows 2, 3, 6, 7
 	0b1010_1010, // input 2: set in the odd rows
 }

 // String returns a short name for slop, used in debugging output.
 // A value that is exactly sloInterior prints as "leaf".  Otherwise the
 // name of the operation is printed, followed by "+interior" if the
 // sloInterior bit is also set.  An unrecognized operation prints as
 // "wrong" (with no suffix).
 func (slop SIMDLogicalOP) String() string {
 	if slop == sloInterior {
 		return "leaf"
 	}

 	var name string
 	switch slop &^ sloInterior {
 	case sloAnd:
 		name = "and"
 	case sloXor:
 		name = "xor"
 	case sloOr:
 		name = "or"
 	case sloAndNot:
 		name = "andNot"
 	case sloNot:
 		name = "not"
 	default:
 		return "wrong"
 	}

 	if slop&sloInterior != 0 {
 		name += "+interior"
 	}
 	return name
 }

 // rewriteTern finds trees of SIMD boolean operations in f (and, or, xor,
 // andNot and not, as reported by classifyBooleanSIMD) and replaces
 // subtrees that depend on exactly three distinct inputs with a single
 // three-argument operation (the op chosen by ternOpForLogical) whose
 // AuxInt carries the 8-bit truth table of the replaced subtree.
 //
 // The pass does nothing when f.maxCPUFeatures is CPUNone or when the
 // target architecture family is not AMD64.  Only roots whose block has
 // the CPUavx512 feature are used as starting points.
 func rewriteTern(f *Func) {
 	if f.maxCPUFeatures == CPUNone {
 		return
 	}

 	arch := f.Config.Ctxt().Arch.Family
 	// TODO there are other SIMD architectures
 	if arch != goarch.AMD64 {
 		return
 	}

 	// boolExprTrees records, for every value that takes part in a boolean
 	// expression, its operation (if it is one) and whether it is used as
 	// an operand of another boolean operation (sloInterior).
 	boolExprTrees := make(map[*Value]SIMDLogicalOP)

 	// Find logical-expr expression trees, including leaves.
 	// interior nodes will be marked sloInterior,
 	// root nodes will not be marked sloInterior,
 	// leaf nodes are only marked sloInterior.
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
 			slo := classifyBooleanSIMD(v)
 			switch slo {
 			case sloOr,
 				sloAndNot,
 				sloXor,
 				sloAnd:
 				// Binary operation: the second operand is an interior
 				// node (or leaf); fall through to handle the first operand
 				// and v itself in the same way as for NOT.
 				boolExprTrees[v.Args[1]] |= sloInterior
 				fallthrough
 			case sloNot:
 				boolExprTrees[v.Args[0]] |= sloInterior
 				boolExprTrees[v] |= slo
 			}
 		}
 	}

 	// get a canonical sorted set of roots
 	// (map iteration order is unspecified, so the roots are collected and
 	// then sorted by value ID below.)
 	var roots []*Value
 	for v, slo := range boolExprTrees {
 		if f.pass.debug > 1 {
 			f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
 		}

 		// A root is an operation that is not an operand of another boolean
 		// operation; only roots in blocks with AVX512 are worth visiting.
 		if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
 			roots = append(roots, v)
 		}
 	}
 	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.

 	// This rewrite works by iterating over the root set.
 	// For each boolean expression, it walks the expression
 	// bottom up accumulating sets of variables mentioned in
 	// subexpressions, lazy-greedily finding the largest subexpressions
 	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.

 	// rewrite recursively attempts to replace v and v's subexpressions with
 	// ternary-logic truth-table operations, returning a set of not more than 3
 	// subexpressions within v that may be combined into a parent's replacement.
 	// v need not have the CPU features that allow a ternary-logic operation;
 	// in that case, v will not be rewritten.  Replacements also require
 	// exactly 3 different variable inputs to a boolean expression.
 	//
 	// Given the CPU feature and 3 inputs, v is replaced in the following
 	// cases:
 	//
 	// 1) v is a root
 	// 2) u = NOT(v) and u lacks the CPU feature
 	// 3) u = OP(v, w) and u lacks the CPU feature
 	// 4) u = OP(v, w) and u has more than 3 variable inputs.
 	var rewrite func(v *Value) [3]*Value

 	// computeTT returns the truth table for a boolean expression
 	// over the variables in vars, where vars[0] varies slowest in
 	// the truth table and vars[2] varies fastest.
 	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
 	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
 	//            x: 0 0 0 0 1 1 1 1
 	//            y: 0 0 1 1 0 0 1 1
 	//            z: 0 1 0 1 0 1 0 1
 	var computeTT func(v *Value, vars [3]*Value) uint8

 	// combine two sets of variables into one, returning ok/not
 	// if the two sets contained 3 or fewer elements.  Combine
 	// ensures that the sets of Values never contain duplicates.
 	// (Duplicates would create less-efficient code, not incorrect code.)
 	// Sets are stored front-packed: the first nil entry ends the set.
 	// When the union does not fit, the zero array and false are returned.
 	combine := func(a, b [3]*Value) ([3]*Value, bool) {
 		var c [3]*Value
 		i := 0
 		// Copy all of a.
 		for _, v := range a {
 			if v == nil {
 				break
 			}
 			c[i] = v
 			i++
 		}
 		// Append each element of b that is not already in a.
 	bloop:
 		for _, v := range b {
 			if v == nil {
 				break
 			}
 			for _, u := range a {
 				if v == u {
 					continue bloop
 				}
 			}
 			if i == 3 {
 				return [3]*Value{}, false
 			}
 			c[i] = v
 			i++
 		}
 		return c, true
 	}

 	computeTT = func(v *Value, vars [3]*Value) uint8 {
 		// A variable's truth table is simply its column pattern.
 		i := 0
 		for ; i < len(vars); i++ {
 			if vars[i] == v {
 				return truthTableValues[i]
 			}
 		}
 		// Otherwise v must be an operation; combine the tables of its
 		// operands with the matching bitwise operator.
 		slo := boolExprTrees[v] &^ sloInterior
 		a := computeTT(v.Args[0], vars)
 		switch slo {
 		case sloNot:
 			return ^a
 		case sloAnd:
 			return a & computeTT(v.Args[1], vars)
 		case sloXor:
 			return a ^ computeTT(v.Args[1], vars)
 		case sloOr:
 			return a | computeTT(v.Args[1], vars)
 		case sloAndNot:
 			return a & ^computeTT(v.Args[1], vars)
 		}
 		panic("switch should have covered all cases, or unknown var in logical expression")
 	}

 	// replace rewrites a0 in place into the three-argument operation
 	// returned by ternOpForLogical(a0.Op), with arguments vars0 and with
 	// the truth table of the old expression stored in AuxInt.
 	// It panics if ternOpForLogical returns a0's own op.
 	replace := func(a0 *Value, vars0 [3]*Value) {
 		// The truth table must be computed before a0 is reset, because
 		// computeTT walks a0's current arguments.
 		imm := computeTT(a0, vars0)
 		op := ternOpForLogical(a0.Op)
 		if op == a0.Op {
 			panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString()))
 		}
 		if f.pass.debug > 0 {
 			f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
 				vars0[0], vars0[1], vars0[2])
 		}
 		a0.reset(op)
 		a0.SetArgs3(vars0[0], vars0[1], vars0[2])
 		// The 8-bit table is stored sign-extended (via int8) in AuxInt.
 		a0.AuxInt = int64(int8(imm))
 	}

 	// addOne ensures the no-duplicates addition of a single value
 	// to a set that is not full.  It seems possible that a shared
 	// subexpression in tricky combination with blocks lacking the
 	// AVX512 feature might permit this.
 	// It panics if vars already holds three elements.
 	addOne := func(vars [3]*Value, v *Value) [3]*Value {
 		if vars[2] != nil {
 			panic("rewriteTern.addOne, vars[2] should be nil")
 		}
 		if v == vars[0] || v == vars[1] {
 			return vars
 		}
 		if vars[1] == nil {
 			vars[1] = v
 		} else {
 			vars[2] = v
 		}
 		return vars
 	}

 	rewrite = func(v *Value) [3]*Value {
 		slo := boolExprTrees[v]
 		if slo == sloInterior { // leaf node, i.e., a "variable"
 			return [3]*Value{v, nil, nil}
 		}
 		var vars [3]*Value
 		hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
 		if slo&sloNot == sloNot {
 			vars = rewrite(v.Args[0])
 			if !hasFeature {
 				// v itself cannot become a ternary operation.  If its
 				// operand already has three inputs, rewrite the operand
 				// now and present v to the parent as a single variable.
 				// NOTE(review): unlike the binary-operator case below, this
 				// does not check that v.Args[0]'s own block has CPUavx512
 				// before replacing it -- confirm that this is intended.
 				if vars[2] != nil {
 					replace(v.Args[0], vars)
 					return [3]*Value{v, nil, nil}
 				}
 				return vars
 			}
 		} else {
 			var ok bool
 			a0, a1 := v.Args[0], v.Args[1]
 			vars0 := rewrite(a0)
 			vars1 := rewrite(a1)
 			vars, ok = combine(vars0, vars1)

 			if f.pass.debug > 1 {
 				f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
 			}

 			// (The feature test here is the same query whose result is
 			// already held in hasFeature.)
 			if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) {
 				// too many variables, or cannot rewrite current values.
 				// rewrite one or both subtrees if possible
 				if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
 					replace(a0, vars0)
 				}
 				if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
 					replace(a1, vars1)
 				}

 				// 3-element var arrays are either rewritten, or unable to be rewritten
 				// because of the features in effect in their block.  Either way, they
 				// are treated as a "new var" if 3 elements are present.

 				if vars0[2] == nil {
 					if vars1[2] == nil {
 						// both subtrees are 2-element and were not rewritten.
 						//
 						// TODO a clever person would look at subtrees of inputs,
 						// e.g. rewrite
 						//        ((a AND b) XOR b) XOR (d  XOR (c AND d))
 						// to    (((a AND b) XOR b) XOR  d) XOR (c AND d)
 						// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
 						// and return the variable set {v, c, d}
 						//
 						// But for now, just restart with a0 and a1.
 						return [3]*Value{a0, a1, nil}
 					} else {
 						// a1 (maybe) rewrote, a0 has room for another var
 						vars = addOne(vars0, a1)
 					}
 				} else if vars1[2] == nil {
 					// a0 (maybe) rewrote, a1 has room for another var
 					vars = addOne(vars1, a0)
 				} else if !ok {
 					// both (maybe) rewrote
 					// a0 and a1 are different because otherwise their variable
 					// sets would have combined "ok".
 					return [3]*Value{a0, a1, nil}
 				}
 				// continue with either the vars from "ok" or the updated set of vars.
 			}
 		}
 		// if root and 3 vars and hasFeature, rewrite.
 		if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
 			replace(v, vars)
 			return [3]*Value{v, nil, nil}
 		}
 		return vars
 	}

 	// Process the roots in ID order; the returned variable set of a root
 	// has no parent to use it and is discarded.
 	for _, v := range roots {
 		if f.pass.debug > 1 {
 			f.Warnl(v.Pos, "SLO root %s", v.LongString())
 		}
 		rewrite(v)
 	}
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package ssa

	import (
	"fmt"
	"internal/goarch"
	"slices"
	)

	// truthTableValues[i] is the 8-bit truth-table column for input i of a
	// three-input boolean function.  Bit k of an entry is the value of that
	// input in row k of the table, where k = 4*input0 + 2*input1 + input2,
	// so input 0 changes slowest and input 2 changes fastest.
	var truthTableValues = [3]uint8{
		0b1111_0000, // input 0: set in rows 4-7
		0b1100_1100, // input 1: set in rows 2, 3, 6, 7
		0b1010_1010, // input 2: set in the odd rows
	}

	// String returns a short name for slop, used in debugging output.
	// A value that is exactly sloInterior prints as "leaf".  Otherwise the
	// name of the operation is printed, followed by "+interior" if the
	// sloInterior bit is also set.  An unrecognized operation prints as
	// "wrong" (with no suffix).
	func (slop SIMDLogicalOP) String() string {
		if slop == sloInterior {
			return "leaf"
		}

		var name string
		switch slop &^ sloInterior {
		case sloAnd:
			name = "and"
		case sloXor:
			name = "xor"
		case sloOr:
			name = "or"
		case sloAndNot:
			name = "andNot"
		case sloNot:
			name = "not"
		default:
			return "wrong"
		}

		if slop&sloInterior != 0 {
			name += "+interior"
		}
		return name
	}

	func rewriteTern(f *Func) {
	if f.maxCPUFeatures == CPUNone {
	return
	}

	arch := f.Config.Ctxt().Arch.Family
	// TODO there are other SIMD architectures
	if arch != goarch.AMD64 {
	return
	}

	boolExprTrees := make(map[*Value]SIMDLogicalOP)

	// Find logical-expr expression trees, including leaves.
	// interior nodes will be marked sloInterior,
	// root nodes will not be marked sloInterior,
	// leaf nodes are only marked sloInterior.
	for _, b := range f.Blocks {
	for _, v := range b.Values {
	slo := classifyBooleanSIMD(v)
	switch slo {
	case sloOr,
	sloAndNot,
	sloXor,
	sloAnd:
	boolExprTrees[v.Args[1]] \|= sloInterior
	fallthrough
	case sloNot:
	boolExprTrees[v.Args[0]] \|= sloInterior
	boolExprTrees[v] \|= slo
	}
	}
	}

	// get a canonical sorted set of roots
	var roots []*Value
	for v, slo := range boolExprTrees {
	if f.pass.debug > 1 {
	f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
	}

	if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
	roots = append(roots, v)
	}
	}
	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.

	// This rewrite works by iterating over the root set.
	// For each boolean expression, it walks the expression
	// bottom up accumulating sets of variables mentioned in
	// subexpressions, lazy-greedily finding the largest subexpressions
	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.

	// rewrite recursively attempts to replace v and v's subexpressions with
	// ternary-logic truth-table operations, returning a set of not more than 3
	// subexpressions within v that may be combined into a parent's replacement.
	// V need not have the CPU features that allow a ternary-logic operation;
	// in that case, v will not be rewritten. Replacements also require
	// exactly 3 different variable inputs to a boolean expression.
	//
	// Given the CPU feature and 3 inputs, v is replaced in the following
	// cases:
	//
	// 1) v is a root
	// 2) u = NOT(v) and u lacks the CPU feature
	// 3) u = OP(v, w) and u lacks the CPU feature
	// 4) u = OP(v, w) and u has more than 3 variable inputs. var rewrite func(v Value) [3]Value
	var rewrite func(v Value) [3]Value

	// computeTT returns the truth table for a boolean expression
	// over the variables in vars, where vars[0] varies slowest in
	// the truth table and vars[2] varies fastest.
	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
	// x: 0 0 0 0 1 1 1 1
	// y: 0 0 1 1 0 0 1 1
	// z: 0 1 0 1 0 1 0 1
	var computeTT func(v Value, vars [3]Value) uint8

	// combine two sets of variables into one, returning ok/not
	// if the two sets contained 3 or fewer elements. Combine
	// ensures that the sets of Values never contain duplicates.
	// (Duplicates would create less-efficient code, not incorrect code.)
	combine := func(a, b [3]Value) ([3]Value, bool) {
	var c [3]*Value
	i := 0
	for _, v := range a {
	if v == nil {
	break
	}
	c[i] = v
	i++
	}
	bloop:
	for _, v := range b {
	if v == nil {
	break
	}
	for _, u := range a {
	if v == u {
	continue bloop
	}
	}
	if i == 3 {
	return [3]*Value{}, false
	}
	c[i] = v
	i++
	}
	return c, true
	}

	computeTT = func(v Value, vars [3]Value) uint8 {
	i := 0
	for ; i < len(vars); i++ {
	if vars[i] == v {
	return truthTableValues[i]
	}
	}
	slo := boolExprTrees[v] &^ sloInterior
	a := computeTT(v.Args[0], vars)
	switch slo {
	case sloNot:
	return ^a
	case sloAnd:
	return a & computeTT(v.Args[1], vars)
	case sloXor:
	return a ^ computeTT(v.Args[1], vars)
	case sloOr:
	return a \| computeTT(v.Args[1], vars)
	case sloAndNot:
	return a & ^computeTT(v.Args[1], vars)
	}
	panic("switch should have covered all cases, or unknown var in logical expression")
	}

	replace := func(a0 Value, vars0 [3]Value) {
	imm := computeTT(a0, vars0)
	op := ternOpForLogical(a0.Op)
	if op == a0.Op {
	panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString()))
	}
	if f.pass.debug > 0 {
	f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
	vars0[0], vars0[1], vars0[2])
	}
	a0.reset(op)
	a0.SetArgs3(vars0[0], vars0[1], vars0[2])
	a0.AuxInt = int64(int8(imm))
	}

	// addOne ensures the no-duplicates addition of a single value
	// to a set that is not full. It seems possible that a shared
	// subexpression in tricky combination with blocks lacking the
	// AVX512 feature might permit this.
	addOne := func(vars [3]Value, v Value) [3]*Value {
	if vars[2] != nil {
	panic("rewriteTern.addOne, vars[2] should be nil")
	}
	if v == vars[0] \|\| v == vars[1] {
	return vars
	}
	if vars[1] == nil {
	vars[1] = v
	} else {
	vars[2] = v
	}
	return vars
	}

	rewrite = func(v Value) [3]Value {
	slo := boolExprTrees[v]
	if slo == sloInterior { // leaf node, i.e., a "variable"
	return [3]*Value{v, nil, nil}
	}
	var vars [3]*Value
	hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
	if slo&sloNot == sloNot {
	vars = rewrite(v.Args[0])
	if !hasFeature {
	if vars[2] != nil {
	replace(v.Args[0], vars)
	return [3]*Value{v, nil, nil}
	}
	return vars
	}
	} else {
	var ok bool
	a0, a1 := v.Args[0], v.Args[1]
	vars0 := rewrite(a0)
	vars1 := rewrite(a1)
	vars, ok = combine(vars0, vars1)

	if f.pass.debug > 1 {
	f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
	}

	if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) {
	// too many variables, or cannot rewrite current values.
	// rewrite one or both subtrees if possible
	if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
	replace(a0, vars0)
	}
	if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
	replace(a1, vars1)
	}

	// 3-element var arrays are either rewritten, or unable to be rewritten
	// because of the features in effect in their block. Either way, they
	// are treated as a "new var" if 3 elements are present.

	if vars0[2] == nil {
	if vars1[2] == nil {
	// both subtrees are 2-element and were not rewritten.
	//
	// TODO a clever person would look at subtrees of inputs,
	// e.g. rewrite
	// ((a AND b) XOR b) XOR (d XOR (c AND d))
	// to (((a AND b) XOR b) XOR d) XOR (c AND d)
	// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
	// and return the variable set {v, c, d}
	//
	// But for now, just restart with a0 and a1.
	return [3]*Value{a0, a1, nil}
	} else {
	// a1 (maybe) rewrote, a0 has room for another var
	vars = addOne(vars0, a1)
	}
	} else if vars1[2] == nil {
	// a0 (maybe) rewrote, a1 has room for another var
	vars = addOne(vars1, a0)
	} else if !ok {
	// both (maybe) rewrote
	// a0 and a1 are different because otherwise their variable
	// sets would have combined "ok".
	return [3]*Value{a0, a1, nil}
	}
	// continue with either the vars from "ok" or the updated set of vars.
	}
	}
	// if root and 3 vars and hasFeature, rewrite.
	if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
	replace(v, vars)
	return [3]*Value{v, nil, nil}
	}
	return vars
	}

	for _, v := range roots {
	if f.pass.debug > 1 {
	f.Warnl(v.Pos, "SLO root %s", v.LongString())
	}
	rewrite(v)
	}
	}