| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package ssa |
| |
| import ( |
| "cmd/compile/internal/types" |
| "cmd/internal/src" |
| "cmp" |
| "fmt" |
| "slices" |
| ) |
| |
| // cse does common-subexpression elimination on the Function. |
| // Values are just relinked, nothing is deleted. A subsequent deadcode |
| // pass is required to actually remove duplicate expressions. |
| func cse(f *Func) { |
| // Two values are equivalent if they satisfy the following definition: |
| // equivalent(v, w): |
| // v.op == w.op |
| // v.type == w.type |
| // v.aux == w.aux |
| // v.auxint == w.auxint |
| // len(v.args) == len(w.args) |
| // v.block == w.block if v.op == OpPhi |
| // equivalent(v.args[i], w.args[i]) for i in 0..len(v.args)-1 |
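| // |
| // For example (hypothetical values), v = Add <int> x y and w = Add <int> x y |
| // are equivalent even across blocks, while two phis must additionally live in |
| // the same block, since a phi's meaning depends on its block's predecessors. |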
| |
| // The algorithm searches for a partition of f's values into |
| // equivalence classes using the above definition. |
| // It starts with a coarse partition and iteratively refines it |
| // until it reaches a fixed point. |
| |
| // Make initial coarse partitions by using a subset of the conditions above. |
| a := f.Cache.allocValueSlice(f.NumValues()) |
| defer func() { f.Cache.freeValueSlice(a) }() // inside closure to use final value of a |
| a = a[:0] |
| o := f.Cache.allocInt32Slice(f.NumValues()) // the ordering score for stores |
| defer func() { f.Cache.freeInt32Slice(o) }() |
| if f.auxmap == nil { |
| f.auxmap = auxmap{} |
| } |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| if v.Type.IsMemory() { |
| continue // memory values can never cse |
| } |
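| // Assign each distinct aux value a small dense ID so that cmpVal can order auxes cheaply. |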
| if f.auxmap[v.Aux] == 0 { |
| f.auxmap[v.Aux] = int32(len(f.auxmap)) + 1 |
| } |
| a = append(a, v) |
| } |
| } |
| partition := partitionValues(a, f.auxmap) |
| |
| // map from value id back to eqclass id |
| valueEqClass := f.Cache.allocIDSlice(f.NumValues()) |
| defer f.Cache.freeIDSlice(valueEqClass) |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| // Use negative equivalence class #s for unique values. |
| valueEqClass[v.ID] = -v.ID |
| } |
| } |
| var pNum ID = 1 |
| for _, e := range partition { |
| if f.pass.debug > 1 && len(e) > 500 { |
| fmt.Printf("CSE.large partition (%d): ", len(e)) |
| for j := 0; j < 3; j++ { |
| fmt.Printf("%s ", e[j].LongString()) |
| } |
| fmt.Println() |
| } |
| |
| for _, v := range e { |
| valueEqClass[v.ID] = pNum |
| } |
| if f.pass.debug > 2 && len(e) > 1 { |
| fmt.Printf("CSE.partition #%d:", pNum) |
| for _, v := range e { |
| fmt.Printf(" %s", v.String()) |
| } |
| fmt.Printf("\n") |
| } |
| pNum++ |
| } |
| |
| // Keep a table to remap the memory operand of any memory user that does not have a memory result |
| // (such as a regular load) to some dominating memory operation, skipping the memory defs that do not alias with it. |
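| // This lets, for example, two identical loads match even when a store to a provably disjoint address sits between them. |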
| memTable := f.Cache.allocInt32Slice(f.NumValues()) |
| defer f.Cache.freeInt32Slice(memTable) |
| |
| // Split equivalence classes at points where they have |
| // non-equivalent arguments. Repeat until we can't find any |
| // more splits. |
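| // For example, a class {v1, v2, v3} in which only v3's first argument is in a different class splits into {v1, v2} and {v3}. |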
| var splitPoints []int |
| for { |
| changed := false |
| |
| // partition can grow in the loop. By not using a range loop here, |
| // we process new additions as they arrive, avoiding O(n^2) behavior. |
| for i := 0; i < len(partition); i++ { |
| e := partition[i] |
| |
| if opcodeTable[e[0].Op].commutative { |
| // Order the first two args before comparison. |
| for _, v := range e { |
| if valueEqClass[v.Args[0].ID] > valueEqClass[v.Args[1].ID] { |
| v.Args[0], v.Args[1] = v.Args[1], v.Args[0] |
| } |
| } |
| } |
| |
| // Sort by eq class of arguments. |
| slices.SortFunc(e, func(v, w *Value) int { |
| _, idxMem, _, _ := isMemUser(v) |
| for i, a := range v.Args { |
| var aId, bId ID |
| if i != idxMem { |
| b := w.Args[i] |
| aId = a.ID |
| bId = b.ID |
| } else { |
| // A memory user's mem argument may be remapped to allow matching |
| // identical load-like instructions across disjoint stores. |
| aId, _ = getEffectiveMemoryArg(memTable, v) |
| bId, _ = getEffectiveMemoryArg(memTable, w) |
| } |
| if valueEqClass[aId] < valueEqClass[bId] { |
| return -1 |
| } |
| if valueEqClass[aId] > valueEqClass[bId] { |
| return +1 |
| } |
| } |
| return 0 |
| }) |
| |
| // Find split points. |
| splitPoints = append(splitPoints[:0], 0) |
| for j := 1; j < len(e); j++ { |
| v, w := e[j-1], e[j] |
| // Note: commutative args were already ordered correctly by the sort above. |
| eqArgs := true |
| _, idxMem, _, _ := isMemUser(v) |
| for k, a := range v.Args { |
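| // An OpLocalAddr's second argument is its memory operand, which only tracks |
| // aliasing and does not change the address produced, so ignore it here; the |
| // choice among equivalent OpLocalAddrs is made when sorting for substitution below. |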
| if v.Op == OpLocalAddr && k == 1 { |
| continue |
| } |
| var aId, bId ID |
| if k != idxMem { |
| b := w.Args[k] |
| aId = a.ID |
| bId = b.ID |
| } else { |
| // A memory user's mem argument may be remapped to allow matching |
| // identical load-like instructions across disjoint stores. |
| aId, _ = getEffectiveMemoryArg(memTable, v) |
| bId, _ = getEffectiveMemoryArg(memTable, w) |
| } |
| if valueEqClass[aId] != valueEqClass[bId] { |
| eqArgs = false |
| break |
| } |
| } |
| if !eqArgs { |
| splitPoints = append(splitPoints, j) |
| } |
| } |
| if len(splitPoints) == 1 { |
| continue // no splits, leave equivalence class alone. |
| } |
| |
| // Move another equivalence class down in place of e. |
| partition[i] = partition[len(partition)-1] |
| partition = partition[:len(partition)-1] |
| i-- |
| |
| // Add new equivalence classes for the parts of e we found. |
| splitPoints = append(splitPoints, len(e)) |
| for j := 0; j < len(splitPoints)-1; j++ { |
| f := e[splitPoints[j]:splitPoints[j+1]] |
| if len(f) == 1 { |
| // Don't add singletons. |
| valueEqClass[f[0].ID] = -f[0].ID |
| continue |
| } |
| for _, v := range f { |
| valueEqClass[v.ID] = pNum |
| } |
| pNum++ |
| partition = append(partition, f) |
| } |
| changed = true |
| } |
| |
| if !changed { |
| break |
| } |
| } |
| |
| sdom := f.Sdom() |
| |
| // Compute substitutions we would like to do. We substitute v for w |
| // if v and w are in the same equivalence class and v dominates w. |
| rewrite := f.Cache.allocValueSlice(f.NumValues()) |
| defer f.Cache.freeValueSlice(rewrite) |
| for _, e := range partition { |
| slices.SortFunc(e, func(v, w *Value) int { |
| if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 { |
| return c |
| } |
| if _, _, _, ok := isMemUser(v); ok { |
| // Additional ordering among the memory users within one block: prefer the earliest |
| // possible value among the set of equivalent values, that is, the one with the lowest |
| // skip count (the fewest memory defs skipped before reaching their common def). |
| _, vSkips := getEffectiveMemoryArg(memTable, v) |
| _, wSkips := getEffectiveMemoryArg(memTable, w) |
| if c := cmp.Compare(vSkips, wSkips); c != 0 { |
| return c |
| } |
| } |
| if v.Op == OpLocalAddr { |
| // compare the memory args for OpLocalAddrs in the same block |
| vm := v.Args[1] |
| wm := w.Args[1] |
| if vm == wm { |
| return 0 |
| } |
| // If the two OpLocalAddrs are in the same block and one's memory |
| // arg is also in that block but the other's is not, the latter's |
| // memory arg must be in an ancestor block, so it sorts first. |
| if vm.Block != v.Block { |
| return -1 |
| } |
| if wm.Block != w.Block { |
| return +1 |
| } |
| // use store order if the memory args are in the same block |
| vs := storeOrdering(vm, o) |
| ws := storeOrdering(wm, o) |
| if vs <= 0 { |
| f.Fatalf("unable to determine the order of %s", vm.LongString()) |
| } |
| if ws <= 0 { |
| f.Fatalf("unable to determine the order of %s", wm.LongString()) |
| } |
| return cmp.Compare(vs, ws) |
| } |
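| // Prefer values that are statement boundaries, so that the surviving value keeps the statement marker. |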
| vStmt := v.Pos.IsStmt() == src.PosIsStmt |
| wStmt := w.Pos.IsStmt() == src.PosIsStmt |
| if vStmt != wStmt { |
| if vStmt { |
| return -1 |
| } |
| return +1 |
| } |
| return 0 |
| }) |
| |
| for i := 0; i < len(e)-1; i++ { |
| // e is sorted by domorder, so a maximally dominant element is first in the slice |
| v := e[i] |
| if v == nil { |
| continue |
| } |
| |
| e[i] = nil |
| // Replace all elements of e which v dominates |
| for j := i + 1; j < len(e); j++ { |
| w := e[j] |
| if w == nil { |
| continue |
| } |
| if sdom.IsAncestorEq(v.Block, w.Block) { |
| rewrite[w.ID] = v |
| e[j] = nil |
| } else { |
| // e is sorted by domorder, so v.Block doesn't dominate any subsequent blocks in e |
| break |
| } |
| } |
| } |
| } |
| |
| rewrites := int64(0) |
| |
| // Apply substitutions |
| for _, b := range f.Blocks { |
| for _, v := range b.Values { |
| for i, w := range v.Args { |
| if x := rewrite[w.ID]; x != nil { |
| if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck { |
| // We are about to lose w's statement marker. |
| // w is an input to v; if they're in the same block |
| // and the same line, v is a good-enough new statement boundary. |
| if w.Block == v.Block && w.Pos.Line() == v.Pos.Line() { |
| v.Pos = v.Pos.WithIsStmt() |
| w.Pos = w.Pos.WithNotStmt() |
| } // TODO and if this fails? |
| } |
| v.SetArg(i, x) |
| rewrites++ |
| } |
| } |
| } |
| for i, v := range b.ControlValues() { |
| if x := rewrite[v.ID]; x != nil { |
| if v.Op == OpNilCheck { |
| // nilcheck pass will remove the nil checks and log |
| // them appropriately, so don't mess with them here. |
| continue |
| } |
| b.ReplaceControl(i, x) |
| } |
| } |
| } |
| |
| if f.pass.stats > 0 { |
| f.LogStat("CSE REWRITES", rewrites) |
| } |
| } |
| |
| // storeOrdering computes an order for stores by iterating over the store |
| // chain and assigning a score to each store. The scores only make sense for |
| // stores within the same block, and the first store in store order has |
| // the lowest score. The cache ensures each score is computed only once. |
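| // For example, for stores s1 -> s2 -> s3 in a block whose incoming memory is defined in another block, s1, s2 and s3 score 1, 2 and 3 respectively. |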
| func storeOrdering(v *Value, cache []int32) int32 { |
| const minScore int32 = 1 |
| score := minScore |
| w := v |
| for { |
| if s := cache[w.ID]; s >= minScore { |
| score += s |
| break |
| } |
| if w.Op == OpPhi || w.Op == OpInitMem { |
| break |
| } |
| a := w.MemoryArg() |
| if a.Block != w.Block { |
| break |
| } |
| w = a |
| score++ |
| } |
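| // Walk the chain again, recording the computed score for every store visited above. |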
| w = v |
| for cache[w.ID] == 0 { |
| cache[w.ID] = score |
| if score == minScore { |
| break |
| } |
| w = w.MemoryArg() |
| score-- |
| } |
| return cache[v.ID] |
| } |
| |
| // An eqclass approximates an equivalence class. During the |
| // algorithm it may represent the union of several of the |
| // final equivalence classes. |
| type eqclass []*Value |
| |
| // partitionValues partitions the values into equivalence classes |
| // based on having all the following features match: |
| // - opcode |
| // - type |
| // - auxint |
| // - aux |
| // - nargs |
| // - block # if a phi op |
| // |
| // Arguments are not compared here; the initial classes are refined by |
| // argument equivalence in cse itself. |
| // |
| // partitionValues returns a list of equivalence classes, each |
| // being a list of *Values sorted by ID. The eqclass slices are |
| // backed by the same storage as the input slice. |
| // Equivalence classes of size 1 are ignored. |
| func partitionValues(a []*Value, auxIDs auxmap) []eqclass { |
| slices.SortFunc(a, func(v, w *Value) int { |
| switch cmpVal(v, w, auxIDs) { |
| case types.CMPlt: |
| return -1 |
| case types.CMPgt: |
| return +1 |
| default: |
| // Sort by value ID last to keep the sort result deterministic. |
| return cmp.Compare(v.ID, w.ID) |
| } |
| }) |
| |
| var partition []eqclass |
| for len(a) > 0 { |
| v := a[0] |
| j := 1 |
| for ; j < len(a); j++ { |
| w := a[j] |
| if cmpVal(v, w, auxIDs) != types.CMPeq { |
| break |
| } |
| } |
| if j > 1 { |
| partition = append(partition, a[:j]) |
| } |
| a = a[j:] |
| } |
| |
| return partition |
| } |
| |
| func lt2Cmp(isLt bool) types.Cmp { |
| if isLt { |
| return types.CMPlt |
| } |
| return types.CMPgt |
| } |
| |
| type auxmap map[Aux]int32 |
| |
| func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp { |
| // Try to order these comparisons by cost (cheapest first). |
| if v.Op != w.Op { |
| return lt2Cmp(v.Op < w.Op) |
| } |
| if v.AuxInt != w.AuxInt { |
| return lt2Cmp(v.AuxInt < w.AuxInt) |
| } |
| if len(v.Args) != len(w.Args) { |
| return lt2Cmp(len(v.Args) < len(w.Args)) |
| } |
| if v.Op == OpPhi && v.Block != w.Block { |
| return lt2Cmp(v.Block.ID < w.Block.ID) |
| } |
| if v.Type.IsMemory() { |
| // We will never be able to CSE two values |
| // that generate memory. |
| return lt2Cmp(v.ID < w.ID) |
| } |
| // OpSelect is a pseudo-op. We need to be more aggressive |
| // regarding CSE to keep multiple OpSelect's of the same |
| // argument from existing. |
| if v.Op != OpSelect0 && v.Op != OpSelect1 && v.Op != OpSelectN { |
| if tc := v.Type.Compare(w.Type); tc != types.CMPeq { |
| return tc |
| } |
| } |
| |
| if v.Aux != w.Aux { |
| if v.Aux == nil { |
| return types.CMPlt |
| } |
| if w.Aux == nil { |
| return types.CMPgt |
| } |
| return lt2Cmp(auxIDs[v.Aux] < auxIDs[w.Aux]) |
| } |
| |
| return types.CMPeq |
| } |
| |
| // isMemUser reports whether the given instruction only uses its "memory" argument, meaning |
| // we may try to skip memory "defs" that do not alias with its address. |
| // It returns the index of the pointer argument, the index of the "memory" argument, the |
| // access width, and true for such instructions; otherwise it returns (-1, -1, 0, false). |
| func isMemUser(v *Value) (int, int, int64, bool) { |
| switch v.Op { |
| case OpLoad: |
| return 0, 1, v.Type.Size(), true |
| case OpNilCheck: |
| return 0, 1, 0, true |
| default: |
| return -1, -1, 0, false |
| } |
| } |
| |
| // Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with a memory "user" instructions. |
| // Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false). |
| // If the access width is 0, the pointer index may be -1 (no pointer operand is needed). |
| func isMemDef(v *Value) (int, int, int64, bool) { |
| switch v.Op { |
| case OpStore: |
| return 0, 2, auxToType(v.Aux).Size(), true |
| case OpVarDef: |
| return -1, 0, 0, true |
| case OpZero: |
| return 0, 1, v.AuxInt, true |
| default: |
| return -1, -1, 0, false |
| } |
| } |
| |
| // The mem table uses the low memTableSkipBits bits of each entry to store the number of skipped |
| // "memory" operands, and the remaining bits to store the ID of the resulting "memory"-producing instruction. |
| const memTableSkipBits = 8 |
| |
| // maxId is the maximum ID value we are able to store in the memTable; for larger IDs we fall back to the unremapped memory arg. |
| const maxId = ID(1<<(31-memTableSkipBits)) - 1 |
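| // For example, a remapped memory def with ID 5 reached after 2 skips is encoded as 5<<memTableSkipBits|2. |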
| |
| // getEffectiveMemoryArg returns the ID of the first possibly-aliasing store along the memory chain starting at v's memory argument, together with the number of non-aliasing stores skipped. |
| func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) { |
| if code := uint32(memTable[v.ID]); code != 0 { |
| return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1) |
| } |
| if idxPtr, idxMem, width, ok := isMemUser(v); ok { |
| // TODO: We could early return some predefined value if width==0 |
| memId := v.Args[idxMem].ID |
| if memId > maxId { |
| return memId, 0 |
| } |
| mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width) |
| if mem.ID <= maxId { |
| memId = mem.ID |
| } else { |
| skips = 0 // avoid the skip |
| } |
| memTable[v.ID] = int32(memId<<memTableSkipBits) | int32(skips) |
| return memId, skips |
| } else { |
| v.Block.Func.Fatalf("expected memory user instruction: %v", v.LongString()) |
| } |
| return 0, 0 |
| } |
| |
| // skipDisjointMemDefs finds a memory def that is not trivially disjoint with the user instruction, |
| // counting the number of "skips" along the path. It returns that memory def's value and the number of skips. |
| func skipDisjointMemDefs(user *Value, idxUserPtr, idxUserMem int, useWidth int64) (*Value, uint32) { |
| usePtr, mem := user.Args[idxUserPtr], user.Args[idxUserMem] |
| const maxSkips = (1 << memTableSkipBits) - 1 |
| var skips uint32 |
| for skips = 0; skips < maxSkips; skips++ { |
| if idxPtr, idxMem, width, ok := isMemDef(mem); ok { |
| if mem.Args[idxMem].Uses > 50 { |
| // Skipping a memory def with a lot of uses may potentially increase register pressure. |
| break |
| } |
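| // A zero-width def (e.g. OpVarDef) writes no bytes, so it can never alias the use. |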
| if width == 0 { |
| mem = mem.Args[idxMem] |
| continue |
| } |
| defPtr := mem.Args[idxPtr] |
| if disjoint(defPtr, width, usePtr, useWidth) { |
| mem = mem.Args[idxMem] |
| continue |
| } |
| } |
| break |
| } |
| return mem, skips |
| } |