blob: 1a08d792f430d12ca4b460e394029df8f13d5e5c [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unify
import (
"fmt"
"iter"
"reflect"
"strings"
)
// An envSet is an immutable set of environments, where each environment is a
// mapping from [ident]s to [Value]s.
//
// To keep this compact, we use an algebraic representation similar to
// relational algebra. The atoms are zero, unit, or a singular binding:
//
// - A singular binding {x: v} is an environment set consisting of a single
// environment that binds a single ident x to a single value v.
//
// - Zero (0) is the empty set.
//
// - Unit (1) is an environment set consisting of a single, empty environment
// (no bindings).
//
// From these, we build up more complex sets of environments using sums and
// cross products:
//
// - A sum, E + F, is simply the union of the two environment sets: E ∪ F
//
// - A cross product, E ⨯ F, is the Cartesian product of the two environment
// sets, followed by joining each pair of environments: {e ⊕ f | (e, f) ∊ E ⨯ F}
//
// The join of two environments, e ⊕ f, is an environment that contains all of
// the bindings in either e or f. To detect bugs, it is an error if an
// identifier is bound in both e and f (however, see below for what we could do
// differently).
//
// Environment sets form a commutative semiring and thus obey the usual
// commutative semiring rules:
//
// e + 0 = e
// e ⨯ 0 = 0
// e ⨯ 1 = e
// e + f = f + e
// e ⨯ f = f ⨯ e
//
// Furthermore, environments sets are additively and multiplicatively idempotent
// because + and ⨯ are themselves defined in terms of sets:
//
// e + e = e
// e ⨯ e = e
//
// # Examples
//
// To represent {{x: 1, y: 1}, {x: 2, y: 2}}, we build the two environments and
// sum them:
//
// ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})
//
// If we add a third variable z that can be 1 or 2, independent of x and y, we
// get four logical environments:
//
// {x: 1, y: 1, z: 1}
// {x: 2, y: 2, z: 1}
// {x: 1, y: 1, z: 2}
// {x: 2, y: 2, z: 2}
//
// This could be represented as a sum of all four environments, but because z is
// independent, we can use a more compact representation:
//
// (({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2})
//
// # Generalized cross product
//
// While cross-product is currently restricted to disjoint environments, we
// could generalize the definition of joining two environments to:
//
// {xₖ: vₖ} ⊕ {xₖ: wₖ} = {xₖ: vₖ ∩ wₖ} (where unbound idents are bound to the [Top] value, ⟙)
//
// where v ∩ w is the unification of v and w. This itself could be coarsened to
//
// v ∩ w = v if w = ⟙
// = w if v = ⟙
// = v if v = w
// = 0 otherwise
//
// We could use this rule to implement substitution. For example, E ⨯ {x: 1}
// narrows environment set E to only environments in which x is bound to 1. But
// we currently don't do this.
type envSet struct {
root *envExpr
}
type envExpr struct {
// TODO: A tree-based data structure for this may not be ideal, since it
// involves a lot of walking to find things and we often have to do deep
// rewrites anyway for partitioning. Would some flattened array-style
// representation be better, possibly combined with an index of ident uses?
// We could even combine that with an immutable array abstraction (ala
// Clojure) that could enable more efficient construction operations.
kind envExprKind
// For envBinding
id *ident
val *Value
// For sum or product. Len must be >= 2 and none of the elements can have
// the same kind as this node.
operands []*envExpr
}
type envExprKind byte
const (
envZero envExprKind = iota
envUnit
envProduct
envSum
envBinding
)
var (
// topEnv is the unit value (multiplicative identity) of a [envSet].
topEnv = envSet{envExprUnit}
// bottomEnv is the zero value (additive identity) of a [envSet].
bottomEnv = envSet{envExprZero}
envExprZero = &envExpr{kind: envZero}
envExprUnit = &envExpr{kind: envUnit}
)
// bind binds id to each of vals in e.
//
// Its panics if id is already bound in e.
//
// Environments are typically initially constructed by starting with [topEnv]
// and calling bind one or more times.
func (e envSet) bind(id *ident, vals ...*Value) envSet {
if e.isEmpty() {
return bottomEnv
}
// TODO: If any of vals are _, should we just drop that val? We're kind of
// inconsistent about whether an id missing from e means id is invalid or
// means id is _.
// Check that id isn't present in e.
for range e.root.bindings(id) {
panic("id " + id.name + " already present in environment")
}
// Create a sum of all the values.
bindings := make([]*envExpr, 0, 1)
for _, val := range vals {
bindings = append(bindings, &envExpr{kind: envBinding, id: id, val: val})
}
// Multiply it in.
return envSet{newEnvExprProduct(e.root, newEnvExprSum(bindings...))}
}
func (e envSet) isEmpty() bool {
return e.root.kind == envZero
}
// bindings yields all [envBinding] nodes in e with the given id. If id is nil,
// it yields all binding nodes.
func (e *envExpr) bindings(id *ident) iter.Seq[*envExpr] {
// This is just a pre-order walk and it happens this is the only thing we
// need a pre-order walk for.
return func(yield func(*envExpr) bool) {
var rec func(e *envExpr) bool
rec = func(e *envExpr) bool {
if e.kind == envBinding && (id == nil || e.id == id) {
if !yield(e) {
return false
}
}
for _, o := range e.operands {
if !rec(o) {
return false
}
}
return true
}
rec(e)
}
}
// newEnvExprProduct constructs a product node from exprs, performing
// simplifications. It does NOT check that bindings are disjoint.
func newEnvExprProduct(exprs ...*envExpr) *envExpr {
factors := make([]*envExpr, 0, 2)
for _, expr := range exprs {
switch expr.kind {
case envZero:
return envExprZero
case envUnit:
// No effect on product
case envProduct:
factors = append(factors, expr.operands...)
default:
factors = append(factors, expr)
}
}
if len(factors) == 0 {
return envExprUnit
} else if len(factors) == 1 {
return factors[0]
}
return &envExpr{kind: envProduct, operands: factors}
}
// newEnvExprSum constructs a sum node from exprs, performing simplifications.
func newEnvExprSum(exprs ...*envExpr) *envExpr {
// TODO: If all of envs are products (or bindings), factor any common terms.
// E.g., x * y + x * z ==> x * (y + z). This is easy to do for binding
// terms, but harder to do for more general terms.
var have smallSet[*envExpr]
terms := make([]*envExpr, 0, 2)
for _, expr := range exprs {
switch expr.kind {
case envZero:
// No effect on sum
case envSum:
for _, expr1 := range expr.operands {
if have.Add(expr1) {
terms = append(terms, expr1)
}
}
default:
if have.Add(expr) {
terms = append(terms, expr)
}
}
}
if len(terms) == 0 {
return envExprZero
} else if len(terms) == 1 {
return terms[0]
}
return &envExpr{kind: envSum, operands: terms}
}
func crossEnvs(env1, env2 envSet) envSet {
// Confirm that envs have disjoint idents.
var ids1 smallSet[*ident]
for e := range env1.root.bindings(nil) {
ids1.Add(e.id)
}
for e := range env2.root.bindings(nil) {
if ids1.Has(e.id) {
panic(fmt.Sprintf("%s bound on both sides of cross-product", e.id.name))
}
}
return envSet{newEnvExprProduct(env1.root, env2.root)}
}
func unionEnvs(envs ...envSet) envSet {
exprs := make([]*envExpr, len(envs))
for i := range envs {
exprs[i] = envs[i].root
}
return envSet{newEnvExprSum(exprs...)}
}
// envPartition is a subset of an env where id is bound to value in all
// deterministic environments.
type envPartition struct {
id *ident
value *Value
env envSet
}
// partitionBy splits e by distinct bindings of id and removes id from each
// partition.
//
// If there are environments in e where id is not bound, they will not be
// reflected in any partition.
//
// It panics if e is bottom, since attempting to partition an empty environment
// set almost certainly indicates a bug.
func (e envSet) partitionBy(id *ident) []envPartition {
if e.isEmpty() {
// We could return zero partitions, but getting here at all almost
// certainly indicates a bug.
panic("cannot partition empty environment set")
}
// Emit a partition for each value of id.
var seen smallSet[*Value]
var parts []envPartition
for n := range e.root.bindings(id) {
if !seen.Add(n.val) {
// Already emitted a partition for this value.
continue
}
parts = append(parts, envPartition{
id: id,
value: n.val,
env: envSet{e.root.substitute(id, n.val)},
})
}
return parts
}
// substitute replaces bindings of id to val with 1 and bindings of id to any
// other value with 0 and simplifies the result.
func (e *envExpr) substitute(id *ident, val *Value) *envExpr {
switch e.kind {
default:
panic("bad kind")
case envZero, envUnit:
return e
case envBinding:
if e.id != id {
return e
} else if e.val != val {
return envExprZero
} else {
return envExprUnit
}
case envProduct, envSum:
// Substitute each operand. Sometimes, this won't change anything, so we
// build the new operands list lazily.
var nOperands []*envExpr
for i, op := range e.operands {
nOp := op.substitute(id, val)
if nOperands == nil && op != nOp {
// Operand diverged; initialize nOperands.
nOperands = make([]*envExpr, 0, len(e.operands))
nOperands = append(nOperands, e.operands[:i]...)
}
if nOperands != nil {
nOperands = append(nOperands, nOp)
}
}
if nOperands == nil {
// Nothing changed.
return e
}
if e.kind == envProduct {
return newEnvExprProduct(nOperands...)
} else {
return newEnvExprSum(nOperands...)
}
}
}
// A smallSet is a set optimized for stack allocation when small.
type smallSet[T comparable] struct {
array [32]T
n int
m map[T]struct{}
}
// Has returns whether val is in set.
func (s *smallSet[T]) Has(val T) bool {
arr := s.array[:s.n]
for i := range arr {
if arr[i] == val {
return true
}
}
_, ok := s.m[val]
return ok
}
// Add adds val to the set and returns true if it was added (not already
// present).
func (s *smallSet[T]) Add(val T) bool {
// Test for presence.
if s.Has(val) {
return false
}
// Add it
if s.n < len(s.array) {
s.array[s.n] = val
s.n++
} else {
if s.m == nil {
s.m = make(map[T]struct{})
}
s.m[val] = struct{}{}
}
return true
}
type ident struct {
_ [0]func() // Not comparable (only compare *ident)
name string
}
type Var struct {
id *ident
}
func (d Var) Exact() bool {
// These can't appear in concrete Values.
panic("Exact called on non-concrete Value")
}
func (d Var) WhyNotExact() string {
// These can't appear in concrete Values.
return "WhyNotExact called on non-concrete Value"
}
func (d Var) decode(rv reflect.Value) error {
return &inexactError{"var", rv.Type().String()}
}
func (d Var) unify(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
// TODO: Vars from !sums in the input can have a huge number of values.
// Unifying these could be way more efficient with some indexes over any
// exact values we can pull out, like Def fields that are exact Strings.
// Maybe we try to produce an array of yes/no/maybe matches and then we only
// have to do deeper evaluation of the maybes. We could probably cache this
// on an envTerm. It may also help to special-case Var/Var unification to
// pick which one to index versus enumerate.
if vd, ok := w.Domain.(Var); ok && d.id == vd.id {
// Unifying $x with $x results in $x. If we descend into this we'll have
// problems because we strip $x out of the environment to keep ourselves
// honest and then can't find it on the other side.
//
// TODO: I'm not positive this is the right fix.
return vd, e, nil
}
// We need to unify w with the value of d in each possible environment. We
// can save some work by grouping environments by the value of d, since
// there will be a lot of redundancy here.
var nEnvs []envSet
envParts := e.partitionBy(d.id)
for i, envPart := range envParts {
exit := uf.enterVar(d.id, i)
// Each branch logically gets its own copy of the initial environment
// (narrowed down to just this binding of the variable), and each branch
// may result in different changes to that starting environment.
res, e2, err := w.unify(envPart.value, envPart.env, swap, uf)
exit.exit()
if err != nil {
return nil, envSet{}, err
}
if res.Domain == nil {
// This branch entirely failed to unify, so it's gone.
continue
}
nEnv := e2.bind(d.id, res)
nEnvs = append(nEnvs, nEnv)
}
if len(nEnvs) == 0 {
// All branches failed
return nil, bottomEnv, nil
}
// The effect of this is entirely captured in the environment. We can return
// back the same Bind node.
return d, unionEnvs(nEnvs...), nil
}
// An identPrinter maps [ident]s to unique string names.
type identPrinter struct {
ids map[*ident]string
idGen map[string]int
}
func (p *identPrinter) unique(id *ident) string {
if p.ids == nil {
p.ids = make(map[*ident]string)
p.idGen = make(map[string]int)
}
name, ok := p.ids[id]
if !ok {
gen := p.idGen[id.name]
p.idGen[id.name]++
if gen == 0 {
name = id.name
} else {
name = fmt.Sprintf("%s#%d", id.name, gen)
}
p.ids[id] = name
}
return name
}
func (p *identPrinter) slice(ids []*ident) string {
var strs []string
for _, id := range ids {
strs = append(strs, p.unique(id))
}
return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
}