| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package ssa |
| |
| import ( |
| "cmd/compile/internal/types" |
| "cmd/internal/src" |
| "cmp" |
| "fmt" |
| "slices" |
| ) |
| |
| // cse does common-subexpression elimination on the Function. |
| // Values are just relinked, nothing is deleted. A subsequent deadcode |
| // pass is required to actually remove duplicate expressions. |
| func cse(f *Func) { |
| // Two values are equivalent if they satisfy the following definition: |
| // equivalent(v, w): |
| // v.op == w.op |
| // v.type == w.type |
| // v.aux == w.aux |
| // v.auxint == w.auxint |
| // len(v.args) == len(w.args) |
| // v.block == w.block if v.op == OpPhi |
| // equivalent(v.args[i], w.args[i]) for i in 0..len(v.args)-1 |
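| // |
| // For example (hypothetical values), v = Add <int> x y and w = Add <int> x y |
| // are equivalent even across blocks, while two phis must additionally live in |
| // the same block, since a phi's meaning depends on its block's predecessors. |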
| |
| // The algorithm searches for a partition of f's values into |
| // equivalence classes using the above definition. |
| // It starts with a coarse partition and iteratively refines it |
| // until it reaches a fixed point. |
| |
| // Make initial coarse partitions by using a subset of the conditions above. |
| a := f.Cache.allocValueSlice(f.NumValues()) |
| defer func() { f.Cache.freeValueSlice(a) }() // inside closure to use final value of a |
| a = a[:0] |
| o := f.Cache.allocInt32Slice(f.NumValues()) // the ordering score for stores |
| defer func() { f.Cache.freeInt32Slice(o) }() |
| if f.auxmap == nil { |
| f.auxmap = auxmap{} |
| } |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| if v.Type.IsMemory() { |
| continue // memory values can never cse |
| } |
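| // Assign each distinct aux value a small dense ID so that cmpVal can order auxes cheaply. |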
| if f.auxmap[v.Aux] == 0 { |
| f.auxmap[v.Aux] = int32(len(f.auxmap)) + 1 |
| } |
| a = append(a, v) |
| } |
| } |
| partition := partitionValues(a, f.auxmap) |
| |
| // map from value id back to eqclass id |
| valueEqClass := f.Cache.allocIDSlice(f.NumValues()) |
| defer f.Cache.freeIDSlice(valueEqClass) |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| // Use negative equivalence class #s for unique values. |
| valueEqClass[v.ID] = -v.ID |
| } |
| } |
| var pNum ID = 1 |
| for _, e := range partition { |
| if f.pass.debug > 1 && len(e) > 500 { |
| fmt.Printf("CSE.large partition (%d): ", len(e)) |
| for j := 0; j < 3; j++ { |
| fmt.Printf("%s ", e[j].LongString()) |
| } |
| fmt.Println() |
| } |
| |
| for _, v := range e { |
| valueEqClass[v.ID] = pNum |
| } |
| if f.pass.debug > 2 && len(e) > 1 { |
| fmt.Printf("CSE.partition #%d:", pNum) |
| for _, v := range e { |
| fmt.Printf(" %s", v.String()) |
| } |
| fmt.Printf("\n") |
| } |
| pNum++ |
| } |
| |
| // Keep a table to remap the memory operand of any memory user that does not have a memory result |
| // (such as a regular load) to some dominating memory operation, skipping the memory defs that do not alias with it. |
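| // This lets, for example, two identical loads match even when a store to a provably disjoint address sits between them. |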
| memTable := f.Cache.allocInt32Slice(f.NumValues()) |
| defer f.Cache.freeInt32Slice(memTable) |
| |
| // Split equivalence classes at points where they have |
| // non-equivalent arguments. Repeat until we can't find any |
| // more splits. |
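| // For example, a class {v1, v2, v3} in which only v3's first argument is in a different class splits into {v1, v2} and {v3}. |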
| var splitPoints []int |
| for { |
| changed := false |
| |
| // partition can grow in the loop. By not using a range loop here, |
| // we process new additions as they arrive, avoiding O(n^2) behavior. |
| for i := 0; i < len(partition); i++ { |
| e := partition[i] |
| |
| if opcodeTable[e[0].Op].commutative { |
| // Order the first two args before comparison. |
| for _, v := range e { |
| if valueEqClass[v.Args[0].ID] > valueEqClass[v.Args[1].ID] { |
| v.Args[0], v.Args[1] = v.Args[1], v.Args[0] |
| } |
| } |
| } |
| |
| // Sort by eq class of arguments. |
| slices.SortFunc(e, func(v, w *Value) int { |
| _, idxMem, _, _ := isMemUser(v) |
| for i, a := range v.Args { |
| var aId, bId ID |
| if i != idxMem { |
| b := w.Args[i] |
| aId = a.ID |
| bId = b.ID |
| } else { |
| // A memory user's mem argument may be remapped to allow matching |
| // identical load-like instructions across disjoint stores. |
| aId, _ = getEffectiveMemoryArg(memTable, v) |
| bId, _ = getEffectiveMemoryArg(memTable, w) |
| } |
| if valueEqClass[aId] < valueEqClass[bId] { |
| return -1 |
| } |
| if valueEqClass[aId] > valueEqClass[bId] { |
| return +1 |
| } |
| } |
| return 0 |
| }) |
| |
| // Find split points. |
| splitPoints = append(splitPoints[:0], 0) |
| for j := 1; j < len(e); j++ { |
| v, w := e[j-1], e[j] |
| // Note: commutative args were already ordered correctly by the sort above. |
| eqArgs := true |
| _, idxMem, _, _ := isMemUser(v) |
| for k, a := range v.Args { |
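| // An OpLocalAddr's second argument is its memory operand, which only tracks |
| // aliasing and does not change the address produced, so ignore it here; the |
| // choice among equivalent OpLocalAddrs is made when sorting for substitution below. |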
| if v.Op == OpLocalAddr && k == 1 { |
| continue |
| } |
| var aId, bId ID |
| if k != idxMem { |
| b := w.Args[k] |
| aId = a.ID |
| bId = b.ID |
| } else { |
| // A memory user's mem argument may be remapped to allow matching |
| // identical load-like instructions across disjoint stores. |
| aId, _ = getEffectiveMemoryArg(memTable, v) |
| bId, _ = getEffectiveMemoryArg(memTable, w) |
| } |
| if valueEqClass[aId] != valueEqClass[bId] { |
| eqArgs = false |
| break |
| } |
| } |
| if !eqArgs { |
| splitPoints = append(splitPoints, j) |
| } |
| } |
| if len(splitPoints) == 1 { |
| continue // no splits, leave equivalence class alone. |
| } |
| |
| // Move another equivalence class down in place of e. |
| partition[i] = partition[len(partition)-1] |
| partition = partition[:len(partition)-1] |
| i-- |
| |
| // Add new equivalence classes for the parts of e we found. |
| splitPoints = append(splitPoints, len(e)) |
| for j := 0; j < len(splitPoints)-1; j++ { |
| f := e[splitPoints[j]:splitPoints[j+1]] |
| if len(f) == 1 { |
| // Don't add singletons. |
| valueEqClass[f[0].ID] = -f[0].ID |
| continue |
| } |
| for _, v := range f { |
| valueEqClass[v.ID] = pNum |
| } |
| pNum++ |
| partition = append(partition, f) |
| } |
| changed = true |
| } |
| |
| if !changed { |
| break |
| } |
| } |
| |
| sdom := f.Sdom() |
| |
| // Compute substitutions we would like to do. We substitute v for w |
| // if v and w are in the same equivalence class and v dominates w. |
| rewrite := f.Cache.allocValueSlice(f.NumValues()) |
| defer f.Cache.freeValueSlice(rewrite) |
| for _, e := range partition { |
| slices.SortFunc(e, func(v, w *Value) int { |
| if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 { |
| return c |
| } |
| if _, _, _, ok := isMemUser(v); ok { |
| // Additional ordering among the memory users within one block: prefer the earliest |
| // possible value among the set of equivalent values, that is, the one with the lowest |
| // skip count (the fewest memory defs skipped before reaching their common def). |
| _, vSkips := getEffectiveMemoryArg(memTable, v) |
| _, wSkips := getEffectiveMemoryArg(memTable, w) |
| if c := cmp.Compare(vSkips, wSkips); c != 0 { |
| return c |
| } |
| } |
| if v.Op == OpLocalAddr { |
| // compare the memory args for OpLocalAddrs in the same block |
| vm := v.Args[1] |
| wm := w.Args[1] |
| if vm == wm { |
| return 0 |
| } |
| // If the two OpLocalAddrs are in the same block and one's memory |
| // arg is also in that block but the other's is not, the latter's |
| // memory arg must be in an ancestor block, so it sorts first. |
| if vm.Block != v.Block { |
| return -1 |
| } |
| if wm.Block != w.Block { |
| return +1 |
| } |
| // use store order if the memory args are in the same block |
| vs := storeOrdering(vm, o) |
| ws := storeOrdering(wm, o) |
| if vs <= 0 { |
| f.Fatalf("unable to determine the order of %s", vm.LongString()) |
| } |
| if ws <= 0 { |
| f.Fatalf("unable to determine the order of %s", wm.LongString()) |
| } |
| return cmp.Compare(vs, ws) |
| } |
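| // Prefer values that are statement boundaries, so that the surviving value keeps the statement marker. |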
| vStmt := v.Pos.IsStmt() == src.PosIsStmt |
| wStmt := w.Pos.IsStmt() == src.PosIsStmt |
| if vStmt != wStmt { |
| if vStmt { |
| return -1 |
| } |
| return +1 |
| } |
| return 0 |
| }) |
| |
| for i := 0; i < len(e)-1; i++ { |
| // e is sorted by domorder, so a maximally dominant element is first in the slice |
| v := e[i] |
| if v == nil { |
| continue |
| } |
| |
| e[i] = nil |
| // Replace all elements of e which v dominates |
| for j := i + 1; j < len(e); j++ { |
| w := e[j] |
| if w == nil { |
| continue |
| } |
| if sdom.IsAncestorEq(v.Block, w.Block) { |
| rewrite[w.ID] = v |
| e[j] = nil |
| } else { |
| // e is sorted by domorder, so v.Block doesn't dominate any subsequent blocks in e |
| break |
| } |
| } |
| } |
| } |
| |
| rewrites := int64(0) |
| |
| // Apply substitutions |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| for i, w := range v.Args { |
| if x := rewrite[w.ID]; x != nil { |
| if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck { |
| // We are about to lose w's statement marker. |
| // w is an input to v; if they're in the same block |
| // and the same line, v is a good-enough new statement boundary. |
| if w.Block == v.Block && w.Pos.Line() == v.Pos.Line() { |
| v.Pos = v.Pos.WithIsStmt() |
| w.Pos = w.Pos.WithNotStmt() |
| } // TODO and if this fails? |
| } |
| v.SetArg(i, x) |
| rewrites++ |
| } |
| } |
| } |
| for i, v := range b.ControlValues() { |
| if x := rewrite[v.ID]; x != nil { |
| if v.Op == OpNilCheck { |
| // nilcheck pass will remove the nil checks and log |
| // them appropriately, so don't mess with them here. |
| continue |
| } |
| b.ReplaceControl(i, x) |
| } |
| } |
| } |
| |
| if f.pass.stats > 0 { |
| f.LogStat("CSE REWRITES", rewrites) |
| } |
| } |
| |
| // storeOrdering computes an order for stores by iterating over the store |
| // chain and assigning a score to each store. The scores only make sense for |
| // stores within the same block, and the first store in store order has |
| // the lowest score. The cache ensures each score is computed only once. |
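| // For example, for stores s1 -> s2 -> s3 in a block whose incoming memory is defined in another block, s1, s2 and s3 score 1, 2 and 3 respectively. |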
| func storeOrdering(v *Value, cache []int32) int32 { |
| const minScore int32 = 1 |
| score := minScore |
| w := v |
| for { |
| if s := cache[w.ID]; s >= minScore { |
| score += s |
| break |
| } |
| if w.Op == OpPhi || w.Op == OpInitMem { |
| break |
| } |
| a := w.MemoryArg() |
| if a.Block != w.Block { |
| break |
| } |
| w = a |
| score++ |
| } |
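| // Walk the chain again, recording the computed score for every store visited above. |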
| w = v |
| for cache[w.ID] == 0 { |
| cache[w.ID] = score |
| if score == minScore { |
| break |
| } |
| w = w.MemoryArg() |
| score-- |
| } |
| return cache[v.ID] |
| } |
| |
| // An eqclass approximates an equivalence class. During the |
| // algorithm it may represent the union of several of the |
| // final equivalence classes. |
| type eqclass []*Value |
| |
| // partitionValues partitions the values into equivalence classes |
| // based on having all the following features match: |
| // - opcode |
| // - type |
| // - auxint |
| // - aux |
| // - nargs |
| // - block # if a phi op |
| // |
| // Arguments are not compared here; the initial classes are refined by |
| // argument equivalence in cse itself. |
| // |
| // partitionValues returns a list of equivalence classes, each |
| // being a list of *Values sorted by ID. The eqclass slices are |
| // backed by the same storage as the input slice. |
| // Equivalence classes of size 1 are ignored. |
| func partitionValues(a []*Value, auxIDs auxmap) []eqclass { |
| slices.SortFunc(a, func(v, w *Value) int { |
| switch cmpVal(v, w, auxIDs) { |
| case types.CMPlt: |
| return -1 |
| case types.CMPgt: |
| return +1 |
| default: |
| // Sort by value ID last to keep the sort result deterministic. |
| return cmp.Compare(v.ID, w.ID) |
| } |
| }) |
| |
| var partition []eqclass |
| for len(a) > 0 { |
| v := a[0] |
| j := 1 |
| for ; j < len(a); j++ { |
| w := a[j] |
| if cmpVal(v, w, auxIDs) != types.CMPeq { |
| break |
| } |
| } |
| if j > 1 { |
| partition = append(partition, a[:j]) |
| } |
| a = a[j:] |
| } |
| |
| return partition |
| } |
| |
| func lt2Cmp(isLt bool) types.Cmp { |
| if isLt { |
| return types.CMPlt |
| } |
| return types.CMPgt |
| } |
| |
| type auxmap map[Aux]int32 |
| |
| func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp { |
| // Try to order these comparisons by cost (cheapest first). |
| if v.Op != w.Op { |
| return lt2Cmp(v.Op < w.Op) |
| } |
| if v.AuxInt != w.AuxInt { |
| return lt2Cmp(v.AuxInt < w.AuxInt) |
| } |
| if len(v.Args) != len(w.Args) { |
| return lt2Cmp(len(v.Args) < len(w.Args)) |
| } |
| if v.Op == OpPhi && v.Block != w.Block { |
| return lt2Cmp(v.Block.ID < w.Block.ID) |
| } |
| if v.Type.IsMemory() { |
| // We will never be able to CSE two values |
| // that generate memory. |
| return lt2Cmp(v.ID < w.ID) |
| } |
| // OpSelect is a pseudo-op. We need to be more aggressive |
| // regarding CSE to keep multiple OpSelect's of the same |
| // argument from existing. |
| if v.Op != OpSelect0 && v.Op != OpSelect1 && v.Op != OpSelectN { |
| if tc := v.Type.Compare(w.Type); tc != types.CMPeq { |
| return tc |
| } |
| } |
| |
| if v.Aux != w.Aux { |
| if v.Aux == nil { |
| return types.CMPlt |
| } |
| if w.Aux == nil { |
| return types.CMPgt |
| } |
| return lt2Cmp(auxIDs[v.Aux] < auxIDs[w.Aux]) |
| } |
| |
| return types.CMPeq |
| } |
| |
| // isMemUser reports whether the given instruction only uses its "memory" argument, meaning |
| // we may try to skip memory "defs" that do not alias with its address. |
| // It returns the index of the pointer argument, the index of the "memory" argument, the |
| // access width, and true for such instructions; otherwise it returns (-1, -1, 0, false). |
| func isMemUser(v *Value) (int, int, int64, bool) { |
| switch v.Op { |
| case OpLoad: |
| return 0, 1, v.Type.Size(), true |
| case OpNilCheck: |
| return 0, 1, 0, true |
| default: |
| return -1, -1, 0, false |
| } |
| } |
| |
| // Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with a memory "user" instructions. |
| // Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false). |
| // If the access width is 0, the pointer index may be -1 (no pointer operand is needed). |
| func isMemDef(v *Value) (int, int, int64, bool) { |
| switch v.Op { |
| case OpStore: |
| return 0, 2, auxToType(v.Aux).Size(), true |
| case OpVarDef: |
| return -1, 0, 0, true |
| case OpZero: |
| return 0, 1, v.AuxInt, true |
| default: |
| return -1, -1, 0, false |
| } |
| } |
| |
| // The mem table uses the low memTableSkipBits bits of each entry to store the number of skipped |
| // "memory" operands, and the remaining bits to store the ID of the resulting "memory"-producing instruction. |
| const memTableSkipBits = 8 |
| |
| // maxId is the maximum ID value we are able to store in the memTable; for larger IDs we fall back to the unremapped memory arg. |
| const maxId = ID(1<<(31-memTableSkipBits)) - 1 |
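| // For example, a remapped memory def with ID 5 reached after 2 skips is encoded as 5<<memTableSkipBits|2. |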
| |
| // getEffectiveMemoryArg returns the ID of the first possibly-aliasing store along the memory chain starting at v's memory argument, together with the number of non-aliasing stores skipped. |
| func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) { |
| if code := uint32(memTable[v.ID]); code != 0 { |
| return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1) |
| } |
| if idxPtr, idxMem, width, ok := isMemUser(v); ok { |
| // TODO: We could early return some predefined value if width==0 |
| memId := v.Args[idxMem].ID |
| if memId > maxId { |
| return memId, 0 |
| } |
| mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width) |
| if mem.ID <= maxId { |
| memId = mem.ID |
| } else { |
| skips = 0 // avoid the skip |
| } |
| memTable[v.ID] = int32(memId<<memTableSkipBits) | int32(skips) |
| return memId, skips |
| } else { |
| v.Block.Func.Fatalf("expected memory user instruction: %v", v.LongString()) |
| } |
| return 0, 0 |
| } |
| |
| // skipDisjointMemDefs finds a memory def that is not trivially disjoint with the user instruction, |
| // counting the number of "skips" along the path. It returns that memory def's value and the number of skips. |
| func skipDisjointMemDefs(user *Value, idxUserPtr, idxUserMem int, useWidth int64) (*Value, uint32) { |
| usePtr, mem := user.Args[idxUserPtr], user.Args[idxUserMem] |
| const maxSkips = (1 << memTableSkipBits) - 1 |
| var skips uint32 |
| for skips = 0; skips < maxSkips; skips++ { |
| if idxPtr, idxMem, width, ok := isMemDef(mem); ok { |
| if mem.Args[idxMem].Uses > 50 { |
| // Skipping a memory def with a lot of uses may potentially increase register pressure. |
| break |
| } |
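| // A zero-width def (e.g. OpVarDef) writes no bytes, so it can never alias the use. |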
| if width == 0 { |
| mem = mem.Args[idxMem] |
| continue |
| } |
| defPtr := mem.Args[idxPtr] |
| if disjoint(defPtr, width, usePtr, useWidth) { |
| mem = mem.Args[idxMem] |
| continue |
| } |
| } |
| break |
| } |
| return mem, skips |
| } |