cmd/compile/internal: stack slot merging region formation enhancements

This patch revises the algorithm/strategy used for overlapping the
stack slots of disjointly accessed local variables. The main change
here is to allow merging the stack slot of B into the slot for A if
B's size is less then A (prior to this they had to be identical), and
to also allow merging a non-pointer variables into pointer-variable
slots.

The new algorithm sorts the candidate list first by pointerness
(pointer variables first), then by alignment, then by size, and
finally by name. We no longer check that two variables have the same
GC shape before merging: since it should never be the case that we
have two vars X and Y both live across a given callsite where X and Y
share a stack slot, their gc shape doesn't matter.

Doing things this new way increases the total number of bytes saved
(across all functions) from 91256 to 124336 for the sweet benchmarks.

Updates #62737.
Updates #65532.
Updates #65495.

Change-Id: I1daaac1b1240aa47a6975e98ccd24e03304ab602
Reviewed-on: https://go-review.googlesource.com/c/go/+/577615
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/liveness/mergelocals.go b/src/cmd/compile/internal/liveness/mergelocals.go
index aae57cb..1e65d6c 100644
--- a/src/cmd/compile/internal/liveness/mergelocals.go
+++ b/src/cmd/compile/internal/liveness/mergelocals.go
@@ -8,9 +8,7 @@
 	"cmd/compile/internal/base"
 	"cmd/compile/internal/bitvec"
 	"cmd/compile/internal/ir"
-	"cmd/compile/internal/reflectdata"
 	"cmd/compile/internal/ssa"
-	"cmd/internal/obj"
 	"cmd/internal/src"
 	"fmt"
 	"os"
@@ -23,12 +21,14 @@
 // (stack-allocated) variables within a function can be safely
 // merged/overlapped, e.g. share a stack slot with some other auto).
 // An instance of MergeLocalsState is produced by MergeLocals() below
-// and then consumed in ssagen.AllocFrame. The map 'partition' contains
-// entries of the form <N,SL> where N is an *ir.Name and SL is a slice
-// holding the indices (within 'vars') of other variables that share the
-// same slot. For example, if a function contains five variables where
-// v1/v2/v3 are safe to overlap and v4/v5 are safe to overlap, the
-// MergeLocalsState content might look like
+// and then consumed in ssagen.AllocFrame. The map 'partition'
+// contains entries of the form <N,SL> where N is an *ir.Name and SL
+// is a slice holding the indices (within 'vars') of other variables
+// that share the same slot, specifically the slot of the first
+// element in the partition, which we'll call the "leader". For
+// example, if a function contains five variables where v1/v2/v3 are
+// safe to overlap and v4/v5 are safe to overlap, the MergeLocalsState
+// content might look like
 //
 //	vars: [v1, v2, v3, v4, v5]
 //	partition: v1 -> [1, 0, 2], v2 -> [1, 0, 2], v3 -> [1, 0, 2]
@@ -49,6 +49,22 @@
 	st, en int
 }
 
+// cstate holds state information we'll need during the analysis
+// phase of stack slot merging but can be discarded when the analysis
+// is done.
+type cstate struct {
+	fn             *ir.Func
+	f              *ssa.Func
+	lv             *liveness
+	cands          []*ir.Name
+	nameToSlot     map[*ir.Name]int32
+	regions        []candRegion
+	indirectUE     map[ssa.ID][]*ir.Name
+	ivs            []Intervals
+	hashDeselected map[*ir.Name]bool
+	trace          int // debug trace level
+}
+
 // MergeLocals analyzes the specified ssa function f to determine which
 // of its auto variables can safely share the same stack slot, returning
 // a state object that describes how the overlap should be done.
@@ -223,6 +239,19 @@
 		if !foundk {
 			return fmt.Errorf("k=%s v=+%v slice value missing k", k.Sym().Name, sl)
 		}
+		vl := mls.vars[sl[0]]
+		for _, v := range sl[1:] {
+			vv := mls.vars[v]
+			if vv.Type().Size() > vl.Type().Size() {
+				return fmt.Errorf("k=%s v=+%v follower %s size %d larger than leader %s size %d", k.Sym().Name, sl, vv.Sym().Name, vv.Type().Size(), vl.Sym().Name, vl.Type().Size())
+			}
+			if vv.Type().HasPointers() && !vl.Type().HasPointers() {
+				return fmt.Errorf("k=%s v=+%v follower %s hasptr=true but leader %s hasptr=false", k.Sym().Name, sl, vv.Sym().Name, vl.Sym().Name)
+			}
+			if vv.Type().Alignment() > vl.Type().Alignment() {
+				return fmt.Errorf("k=%s v=+%v follower %s align %d greater than leader %s align %d", k.Sym().Name, sl, vv.Sym().Name, vv.Type().Alignment(), vl.Sym().Name, vl.Type().Alignment())
+			}
+		}
 	}
 	for i := range used {
 		if !used[i] {
@@ -296,14 +325,13 @@
 
 	// Now generate an initial pruned candidate list and regions list.
 	// This may be empty if we don't have enough compatible candidates.
-	initial, _ := genRegions(cands)
+	initial, _ := cs.genRegions(cands)
 	if len(initial) < 2 {
 		return
 	}
 
-	// When bisecting it can be handy to see debug trace output for
-	// only those functions that hashdebug selects; set this up here.
-	cs.setupHashTrace(initial)
+	// Set up for hash bisection if enabled.
+	cs.setupHashBisection(initial)
 
 	// Create and populate an indirect use table that we'll use
 	// during interval construction. As part of this process we may
@@ -330,7 +358,9 @@
 	}
 }
 
-func genRegions(cands []*ir.Name) ([]*ir.Name, []candRegion) {
+// genRegions generates a set of regions within cands corresponding
+// to potentially overlappable/mergeable variables.
+func (cs *cstate) genRegions(cands []*ir.Name) ([]*ir.Name, []candRegion) {
 	var pruned []*ir.Name
 	var regions []candRegion
 	st := 0
@@ -346,8 +376,8 @@
 		}
 		pst := len(pruned)
 		pen := pst + (en - st)
-		if base.Debug.MergeLocalsTrace > 1 {
-			fmt.Fprintf(os.Stderr, "=-= add part %d -> %d\n", pst, pen)
+		if cs.trace > 1 {
+			fmt.Fprintf(os.Stderr, "=-= addregion st=%d en=%d: add part %d -> %d\n", st, en, pst, pen)
 		}
 
 		// non-empty region, add to pruned
@@ -385,27 +415,29 @@
 	cs.dumpFunc()
 }
 
-func (cs *cstate) setupHashTrace(cands []*ir.Name) {
-	if base.Debug.MergeLocalsHTrace == 0 || base.Debug.MergeLocalsHash == "" {
+// setupHashBisection checks to see if any of the candidate
+// variables have been de-selected by our hash debug. Here
+// we also implement the -d=mergelocalshtrace flag, which turns
+// on debug tracing only if we have at least two candidates
+// selected by the hash debug for this function.
+func (cs *cstate) setupHashBisection(cands []*ir.Name) {
+	if base.Debug.MergeLocalsHash == "" {
 		return
 	}
-
-	// With this trace variant, check to see whether any of the
-	// candidates are selected-- if yes then enable tracing. Hack:
-	// create a new hashdebug with verbosity turned off and use that
-	// to test, so as not to confuse bisect.
-	modified := strings.ReplaceAll(base.Debug.MergeLocalsHash, "v", "q")
-	quiethd := base.NewHashDebug("qmergelocals", modified, nil)
-	found := false
+	deselected := make(map[*ir.Name]bool)
+	selCount := 0
 	for _, cand := range cands {
-		if !quiethd.MatchPosWithInfo(cand.Pos(), "quiet", nil) {
-			found = true
-			fmt.Fprintf(os.Stderr, "=-= MergeLocalsHTrace fn=%v n=%s match\n",
-				cs.fn, cand.Sym().Name)
-			break
+		if !base.MergeLocalsHash.MatchPosWithInfo(cand.Pos(), "mergelocals", nil) {
+			deselected[cand] = true
+		} else {
+			deselected[cand] = false
+			selCount++
 		}
 	}
-	if found {
+	if selCount < len(cands) {
+		cs.hashDeselected = deselected
+	}
+	if base.Debug.MergeLocalsHTrace != 0 && selCount >= 2 {
 		cs.trace = base.Debug.MergeLocalsHTrace
 	}
 }
@@ -566,7 +598,7 @@
 		return nameLess(pruned[i], pruned[j])
 	})
 	var regions []candRegion
-	pruned, regions = genRegions(pruned)
+	pruned, regions = cs.genRegions(pruned)
 	if len(pruned) < 2 {
 		return nil, nil
 	}
@@ -586,29 +618,30 @@
 	count int32
 }
 
-// nameLess compares ci with cj to see if ci should be less than cj
-// in a relative ordering of candidate variables. This is used to
-// sort vars by size, pointerness, and GC shape.
+// nameLess compares ci with cj to see if ci should be less than cj in
+// a relative ordering of candidate variables. This is used to sort
+// vars by pointerness (variables with pointers first), then in order
+// of decreasing alignment, then by decreasing size. We are assuming a
+// merging algorithm that merges later entries in the list into
+// earlier entries. An example ordered candidate list produced by
+// nameLess:
+//
+//	idx   name    type       align    size
+//	0:    abc     [10]*int   8        80
+//	1:    xyz     [9]*int    8        72
+//	2:    qrs     [2]*int    8        16
+//	3:    tuv     [9]int     8        72
+//	4:    wxy     [9]int32   4        36
+//	5:    jkl     [8]int32   4        32
 func nameLess(ci, cj *ir.Name) bool {
-	ihp, jhp := 0, 0
-	var ilsym, jlsym *obj.LSym
-	if ci.Type().HasPointers() {
-		ihp = 1
-		ilsym, _, _ = reflectdata.GCSym(ci.Type())
+	if ci.Type().HasPointers() != cj.Type().HasPointers() {
+		return ci.Type().HasPointers()
 	}
-	if cj.Type().HasPointers() {
-		jhp = 1
-		jlsym, _, _ = reflectdata.GCSym(cj.Type())
-	}
-	if ihp != jhp {
-		return ihp < jhp
+	if ci.Type().Alignment() != cj.Type().Alignment() {
+		return cj.Type().Alignment() < ci.Type().Alignment()
 	}
 	if ci.Type().Size() != cj.Type().Size() {
-		return ci.Type().Size() < cj.Type().Size()
-	}
-	if ihp != 0 && jhp != 0 && ilsym != jlsym {
-		// FIXME: find less clunky way to do this
-		return fmt.Sprintf("%v", ilsym) < fmt.Sprintf("%v", jlsym)
+		return cj.Type().Size() < ci.Type().Size()
 	}
 	if ci.Sym().Name != cj.Sym().Name {
 		return ci.Sym().Name < cj.Sym().Name
@@ -617,55 +650,34 @@
 }
 
 // nextRegion starts at location idx and walks forward in the cands
-// slice looking for variables that are "compatible" (overlappable)
-// with the variable at position idx; it returns the end of the new
-// region (range of compatible variables starting at idx).
+// slice looking for variables that are "compatible" (potentially
+// overlappable, in the sense that they could potentially share the
+// stack slot of cands[idx]); it returns the end of the new region
+// (range of compatible variables starting at idx).
 func nextRegion(cands []*ir.Name, idx int) int {
 	n := len(cands)
 	if idx >= n {
 		return -1
 	}
 	c0 := cands[idx]
-	hp0 := c0.Type().HasPointers()
+	szprev := c0.Type().Size()
+	alnprev := c0.Type().Alignment()
 	for j := idx + 1; j < n; j++ {
 		cj := cands[j]
-		hpj := cj.Type().HasPointers()
-		ok := true
-		if hp0 {
-			if !hpj || c0.Type().Size() != cj.Type().Size() {
-				return j - 1
-			}
-			// GC shape must match if both types have pointers.
-			gcsym0, _, _ := reflectdata.GCSym(c0.Type())
-			gcsymj, _, _ := reflectdata.GCSym(cj.Type())
-			if gcsym0 != gcsymj {
-				return j - 1
-			}
-		} else {
-			// If no pointers, match size only.
-			if !ok || hp0 != hpj || c0.Type().Size() != cj.Type().Size() {
-				return j - 1
-			}
+		szj := cj.Type().Size()
+		if szj > szprev {
+			return j - 1
 		}
+		alnj := cj.Type().Alignment()
+		if alnj > alnprev {
+			return j - 1
+		}
+		szprev = szj
+		alnprev = alnj
 	}
 	return n - 1
 }
 
-// cstate holds state information we'll need during the analysis
-// phase of stack slot merging but can be discarded when the analysis
-// is done.
-type cstate struct {
-	fn         *ir.Func
-	f          *ssa.Func
-	lv         *liveness
-	cands      []*ir.Name
-	nameToSlot map[*ir.Name]int32
-	regions    []candRegion
-	indirectUE map[ssa.ID][]*ir.Name
-	ivs        []Intervals
-	trace      int // debug trace level
-}
-
 // mergeVisitRegion tries to perform overlapping of variables with a
 // given subrange of cands described by st and en (indices into our
 // candidate var list), where the variables within this range have
@@ -673,7 +685,13 @@
 // size, etc. Overlapping is done in a a greedy fashion: we select the
 // first element in the st->en range, then walk the rest of the
 // elements adding in vars whose lifetimes don't overlap with the
-// first element, then repeat the process until we run out of work to do.
+// first element, then repeat the process until we run out of work.
+// Ordering of the candidates within the region [st,en] is important;
+// within the list the assumption is that if we overlap two variables
+// X and Y where X precedes Y in the list, we need to make X the
+// "leader" (keep X's slot and set Y's frame offset to X's) as opposed
+// to the other way around, since it's possible that Y is smaller in
+// size than X.
 func (cs *cstate) mergeVisitRegion(mls *MergeLocalsState, st, en int) {
 	if cs.trace > 1 {
 		fmt.Fprintf(os.Stderr, "=-= mergeVisitRegion(st=%d, en=%d)\n", st, en)
@@ -712,10 +730,8 @@
 		for succ := nxt(leader + 1); succ != -1; succ = nxt(succ + 1) {
 
 			// Skip if de-selected by merge locals hash.
-			if base.Debug.MergeLocalsHash != "" {
-				if !base.MergeLocalsHash.MatchPosWithInfo(cands[succ].Pos(), "mergelocals", nil) {
-					continue
-				}
+			if cs.hashDeselected != nil && cs.hashDeselected[cands[succ]] {
+				continue
 			}
 			// Skip if already used.
 			if used.Get(int32(succ - st)) {
@@ -1004,9 +1020,9 @@
 }
 
 func dumpCand(c *ir.Name, i int) {
-	fmt.Fprintf(os.Stderr, " %d: %s %q sz=%d hp=%v t=%v\n",
+	fmt.Fprintf(os.Stderr, " %d: %s %q sz=%d hp=%v align=%d t=%v\n",
 		i, fmtFullPos(c.Pos()), c.Sym().Name, c.Type().Size(),
-		c.Type().HasPointers(), c.Type())
+		c.Type().HasPointers(), c.Type().Alignment(), c.Type())
 }
 
 // for unit testing only.
diff --git a/src/cmd/compile/internal/ssagen/pgen.go b/src/cmd/compile/internal/ssagen/pgen.go
index f8d1ce8..5b57c8a 100644
--- a/src/cmd/compile/internal/ssagen/pgen.go
+++ b/src/cmd/compile/internal/ssagen/pgen.go
@@ -23,7 +23,7 @@
 )
 
 // cmpstackvarlt reports whether the stack variable a sorts before b.
-func cmpstackvarlt(a, b *ir.Name) bool {
+func cmpstackvarlt(a, b *ir.Name, mls *liveness.MergeLocalsState) bool {
 	// Sort non-autos before autos.
 	if needAlloc(a) != needAlloc(b) {
 		return needAlloc(b)
@@ -37,6 +37,15 @@
 
 	// From here on, a and b are both autos (i.e., local variables).
 
+	// Sort followers after leaders, if mls != nil
+	if mls != nil {
+		aFollow := mls.Subsumed(a)
+		bFollow := mls.Subsumed(b)
+		if aFollow != bFollow {
+			return bFollow
+		}
+	}
+
 	// Sort used before unused (so AllocFrame can truncate unused
 	// variables).
 	if a.Used() != b.Used() {
@@ -153,6 +162,7 @@
 	}
 
 	var mls *liveness.MergeLocalsState
+	var leaders map[*ir.Name]int64
 	if base.Debug.MergeLocals != 0 {
 		mls = liveness.MergeLocals(fn, f)
 		if base.Debug.MergeLocalsTrace > 0 && mls != nil {
@@ -163,31 +173,46 @@
 					fn, mls)
 			}
 		}
+		leaders = make(map[*ir.Name]int64)
 	}
 
 	// Use sort.SliceStable instead of sort.Slice so stack layout (and thus
 	// compiler output) is less sensitive to frontend changes that
 	// introduce or remove unused variables.
 	sort.SliceStable(fn.Dcl, func(i, j int) bool {
-		return cmpstackvarlt(fn.Dcl[i], fn.Dcl[j])
+		return cmpstackvarlt(fn.Dcl[i], fn.Dcl[j], mls)
 	})
 
+	if mls != nil {
+		// Rewrite fn.Dcl to reposition followers (subsumed vars) to
+		// be immediately following the leader var in their partition.
+		followers := []*ir.Name{}
+		newdcl := make([]*ir.Name, 0, len(fn.Dcl))
+		for i := 0; i < len(fn.Dcl); i++ {
+			n := fn.Dcl[i]
+			if mls.Subsumed(n) {
+				continue
+			}
+			newdcl = append(newdcl, n)
+			if mls.IsLeader(n) {
+				followers = mls.Followers(n, followers)
+				// position followers immediately after leader
+				newdcl = append(newdcl, followers...)
+			}
+		}
+		fn.Dcl = newdcl
+	}
+
 	if base.Debug.MergeLocalsTrace > 1 && mls != nil {
 		fmt.Fprintf(os.Stderr, "=-= sorted DCL for %v:\n", fn)
 		for i, v := range fn.Dcl {
 			if !ssa.IsMergeCandidate(v) {
 				continue
 			}
-			fmt.Fprintf(os.Stderr, " %d: %q isleader=%v subsumed=%v used=%v\n", i, v.Sym().Name, mls.IsLeader(v), mls.Subsumed(v), v.Used())
-
+			fmt.Fprintf(os.Stderr, " %d: %q isleader=%v subsumed=%v used=%v sz=%d align=%d t=%s\n", i, v.Sym().Name, mls.IsLeader(v), mls.Subsumed(v), v.Used(), v.Type().Size(), v.Type().Alignment(), v.Type().String())
 		}
 	}
 
-	var leaders map[*ir.Name]int64
-	if mls != nil {
-		leaders = make(map[*ir.Name]int64)
-	}
-
 	// Reassign stack offsets of the locals that are used.
 	lastHasPtr := false
 	for i, n := range fn.Dcl {
@@ -233,39 +258,31 @@
 	}
 
 	if mls != nil {
-		followers := []*ir.Name{}
-		newdcl := make([]*ir.Name, 0, len(fn.Dcl))
+		// Update offsets of followers (subsumed vars) to be the
+		// same as the leader var in their partition.
 		for i := 0; i < len(fn.Dcl); i++ {
 			n := fn.Dcl[i]
-			if mls.Subsumed(n) {
+			if !mls.Subsumed(n) {
 				continue
 			}
-			newdcl = append(newdcl, n)
-			if off, ok := leaders[n]; ok {
-				followers = mls.Followers(n, followers)
-				for _, f := range followers {
-					// Set the stack offset for each follower to be
-					// the same as the leader.
-					f.SetFrameOffset(off)
-				}
-				// position followers immediately after leader
-				newdcl = append(newdcl, followers...)
+			leader := mls.Leader(n)
+			off, ok := leaders[leader]
+			if !ok {
+				panic("internal error missing leader")
 			}
+			// Set the stack offset this subsumed (followed) var
+			// to be the same as the leader.
+			n.SetFrameOffset(off)
 		}
-		fn.Dcl = newdcl
-	}
 
-	if base.Debug.MergeLocalsTrace > 1 {
-		prolog := false
-		for i, v := range fn.Dcl {
-			if v.Op() != ir.ONAME || (v.Class != ir.PAUTO && !(v.Class == ir.PPARAMOUT && v.IsOutputParamInRegisters())) {
-				continue
+		if base.Debug.MergeLocalsTrace > 1 {
+			fmt.Fprintf(os.Stderr, "=-= stack layout for %v:\n", fn)
+			for i, v := range fn.Dcl {
+				if v.Op() != ir.ONAME || (v.Class != ir.PAUTO && !(v.Class == ir.PPARAMOUT && v.IsOutputParamInRegisters())) {
+					continue
+				}
+				fmt.Fprintf(os.Stderr, " %d: %q frameoff %d isleader=%v subsumed=%v sz=%d align=%d t=%s\n", i, v.Sym().Name, v.FrameOffset(), mls.IsLeader(v), mls.Subsumed(v), v.Type().Size(), v.Type().Alignment(), v.Type().String())
 			}
-			if !prolog {
-				fmt.Fprintf(os.Stderr, "=-= stack layout for %v:\n", fn)
-				prolog = true
-			}
-			fmt.Fprintf(os.Stderr, " %d: %q frameoff %d used=%v\n", i, v.Sym().Name, v.FrameOffset(), v.Used())
 		}
 	}
 
diff --git a/src/cmd/compile/internal/test/mergelocals_test.go b/src/cmd/compile/internal/test/mergelocals_test.go
index 2c554cf..843044d 100644
--- a/src/cmd/compile/internal/test/mergelocals_test.go
+++ b/src/cmd/compile/internal/test/mergelocals_test.go
@@ -122,22 +122,21 @@
 	// be many possible ways to overlap a given set of candidate
 	// variables, all of them legal. Rather than locking down
 	// a specific set of overlappings or frame offsets, this
-	// tests just verifies that there is one clump of 3 vars that
-	// get overlapped, then another clump of 2 that share the same
-	// frame offset.
+	// tests just verifies that there is a decent-sized clump of 4+ vars that
+	// get overlapped.
 	//
 	// The expected output blob we're interested might look like
 	// this (for amd64):
 	//
 	// =-= stack layout for ABC:
-	// 2: "p1" frameoff -8200 used=true
-	// 3: "xp3" frameoff -8200 used=true
-	// 4: "xp4" frameoff -8200 used=true
-	// 5: "p2" frameoff -16400 used=true
-	// 6: "r" frameoff -16408 used=true
-	// 7: "s" frameoff -24600 used=true
-	// 8: "v2" frameoff -32800 used=true
-	// 9: "v3" frameoff -32800 used=true
+	// 2: "p1" frameoff -8200 ...
+	// 3: "s" frameoff -8200 ...
+	// 4: "v2" frameoff -8200 ...
+	// 5: "v3" frameoff -8200 ...
+	// 6: "xp3" frameoff -8200 ...
+	// 7: "xp4" frameoff -8200 ...
+	// 8: "p2" frameoff -16400 ...
+	// 9: "r" frameoff -16408 ...
 	//
 	tmpdir := t.TempDir()
 	src := filepath.Join("testdata", "mergelocals", "integration.go")
@@ -160,7 +159,7 @@
 			continue
 		}
 		fields := strings.Fields(line)
-		wantFields := 5
+		wantFields := 9
 		if len(fields) != wantFields {
 			t.Logf(string(out))
 			t.Fatalf("bad trace output line, wanted %d fields got %d: %s",
@@ -189,7 +188,7 @@
 	}
 	sort.Ints(got)
 	if n3 == 0 {
-		t.Logf(string(out))
+		t.Logf("%s\n", string(out))
 		t.Fatalf("expected at least one clump of 3, got: %+v", got)
 	}
 }