cmd/compile: better check for single live memory

Enhance the one-live-memory-at-a-time check so that it runs during
many more phases of the SSA backend, and make it work across block
boundaries instead of within single blocks only.
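
To illustrate the invariant being checked (a sketch with made-up
value names, not code from this change): at any point in the
function there is at most one live memory state, and a block that
updates no memory inherits its predecessor's last memory.

	// b1: mem1 = InitMem
	//     mem2 = Store ptr val mem1   // mem1 is dead from here on
	// b2 (pred: b1): live memory entering b2 is b1's last memory, mem2
	//     mem3 = Store ptr2 val2 mem2 // using mem1 here would be an error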

Change types.IsMemory to also return true for tuple types whose
second component is a memory type.
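
This lets callers drop their tuple special cases. Roughly, the
caller-side effect (both patterns appear in the schedule.go hunk
below):

	// Before: tuple-producing memory ops needed an explicit check.
	if v.Type.IsMemory() || v.Type.IsTuple() && v.Type.FieldType(1).IsMemory() { ... }
	// After: the tuple case is folded into IsMemory itself.
	if v.Type.IsMemory() { ... }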

Fix the trim pass to build the merged phi correctly. This doesn't
change the generated code, but it allows the check to pass after
trim runs.
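
A worked example of the fix (hypothetical blocks and values): when
block b, with predecessors [p1, p2] and phi v = Phi(a1, a2), is
merged into its successor s whose predecessors were [s, b, s] (b at
index j=1, the other edges being loopbacks from s), the rebuilt phi
must keep b's arguments in the slots for b's edges and use v itself
for the loopback slots:

	v = Phi(v, a1, v, a2)

The old code just appended copies of v.Args[0] until the lengths
matched, leaving the arguments misaligned with the new predecessor
order.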

Switch the AddTuple* ops to take the memory-containing tuple as
their second argument, so that the memory-typed argument comes
last, where MemoryArg looks for it.
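
In rewrite-rule form (taken verbatim from the AMD64 hunk below; the
S390X rules change the same way):

	Before: (AtomicAdd32 ptr val mem) -> (AddTupleFirst32 (XADDLlock val ptr mem) val)
	After:  (AtomicAdd32 ptr val mem) -> (AddTupleFirst32 val (XADDLlock val ptr mem))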

Update #20335

Change-Id: I5b03ef3606b75a9e4f765276bb8b183cdc172b43
Reviewed-on: https://go-review.googlesource.com/43495
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/ssa/check.go b/src/cmd/compile/internal/ssa/check.go
index 82d7b76..82aa9f1 100644
--- a/src/cmd/compile/internal/ssa/check.go
+++ b/src/cmd/compile/internal/ssa/check.go
@@ -310,6 +310,10 @@
 		}
 	}
 
+	memCheck(f)
+}
+
+func memCheck(f *Func) {
 	// Check that if a tuple has a memory type, it is second.
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
@@ -319,24 +323,122 @@
 		}
 	}
 
-	// Check that only one memory is live at any point.
-	// TODO: make this check examine interblock.
-	if f.scheduled {
-		for _, b := range f.Blocks {
-			var mem *Value // the live memory
+	// Single live memory checks.
+	// These checks only work if there are no memory copies.
+	// (Memory copies introduce ambiguity about which mem value is really live.
+	// Probably fixable, but it's easier to avoid the problem.)
+	// For the same reason, disable this check if some memory ops are unused.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			if (v.Op == OpCopy || v.Uses == 0) && v.Type.IsMemory() {
+				return
+			}
+		}
+		if b != f.Entry && len(b.Preds) == 0 {
+			return
+		}
+	}
+
+	// Compute live memory at the end of each block.
+	lastmem := make([]*Value, f.NumBlocks())
+	ss := newSparseSet(f.NumValues())
+	for _, b := range f.Blocks {
+		// Mark overwritten memory values. Those are args of other
+		// ops that generate memory values.
+		ss.clear()
+		for _, v := range b.Values {
+			if v.Op == OpPhi || !v.Type.IsMemory() {
+				continue
+			}
+			if m := v.MemoryArg(); m != nil {
+				ss.add(m.ID)
+			}
+		}
+		// There should be at most one remaining unoverwritten memory value.
+		for _, v := range b.Values {
+			if !v.Type.IsMemory() {
+				continue
+			}
+			if ss.contains(v.ID) {
+				continue
+			}
+			if lastmem[b.ID] != nil {
+				f.Fatalf("two live memory values in %s: %s and %s", b, lastmem[b.ID], v)
+			}
+			lastmem[b.ID] = v
+		}
+		// If there is no remaining memory value, that means there was no memory update.
+		// Take any memory arg.
+		if lastmem[b.ID] == nil {
 			for _, v := range b.Values {
-				if v.Op != OpPhi {
-					for _, a := range v.Args {
-						if a.Type.IsMemory() || a.Type.IsTuple() && a.Type.FieldType(1).IsMemory() {
-							if mem == nil {
-								mem = a
-							} else if mem != a {
-								f.Fatalf("two live mems @ %s: %s and %s", v, mem, a)
-							}
-						}
+				if v.Op == OpPhi {
+					continue
+				}
+				m := v.MemoryArg()
+				if m == nil {
+					continue
+				}
+				if lastmem[b.ID] != nil && lastmem[b.ID] != m {
+					f.Fatalf("two live memory values in %s: %s and %s", b, lastmem[b.ID], m)
+				}
+				lastmem[b.ID] = m
+			}
+		}
+	}
+	// Propagate last live memory through storeless blocks.
+	for {
+		changed := false
+		for _, b := range f.Blocks {
+			if lastmem[b.ID] != nil {
+				continue
+			}
+			for _, e := range b.Preds {
+				p := e.b
+				if lastmem[p.ID] != nil {
+					lastmem[b.ID] = lastmem[p.ID]
+					changed = true
+					break
+				}
+			}
+		}
+		if !changed {
+			break
+		}
+	}
+	// Check merge points.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			if v.Op == OpPhi && v.Type.IsMemory() {
+				for i, a := range v.Args {
+					if a != lastmem[b.Preds[i].b.ID] {
+						f.Fatalf("inconsistent memory phi %s %d %s %s", v.LongString(), i, a, lastmem[b.Preds[i].b.ID])
 					}
 				}
-				if v.Type.IsMemory() || v.Type.IsTuple() && v.Type.FieldType(1).IsMemory() {
+			}
+		}
+	}
+
+	// Check that only one memory is live at any point.
+	if f.scheduled {
+		for _, b := range f.Blocks {
+			var mem *Value // the current live memory in the block
+			for _, v := range b.Values {
+				if v.Op == OpPhi {
+					if v.Type.IsMemory() {
+						mem = v
+					}
+					continue
+				}
+				if mem == nil && len(b.Preds) > 0 {
+					// If no mem phi, take mem of any predecessor.
+					mem = lastmem[b.Preds[0].b.ID]
+				}
+				for _, a := range v.Args {
+					if a.Type.IsMemory() && a != mem {
+						f.Fatalf("two live mems @ %s: %s and %s", v, mem, a)
+					}
+				}
+				if v.Type.IsMemory() {
 					mem = v
 				}
 			}
diff --git a/src/cmd/compile/internal/ssa/deadstore.go b/src/cmd/compile/internal/ssa/deadstore.go
index de3c6ae..bac4930 100644
--- a/src/cmd/compile/internal/ssa/deadstore.go
+++ b/src/cmd/compile/internal/ssa/deadstore.go
@@ -34,10 +34,6 @@
 			}
 			if v.Type.IsMemory() {
 				stores = append(stores, v)
-				if v.Op == OpSelect1 {
-					// Use the args of the tuple-generating op.
-					v = v.Args[0]
-				}
 				for _, a := range v.Args {
 					if a.Block == b && a.Type.IsMemory() {
 						storeUse.add(a.ID)
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index 5543395..711373a 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -487,12 +487,12 @@
 (AtomicExchange64 ptr val mem) -> (XCHGQ val ptr mem)
 
 // Atomic adds.
-(AtomicAdd32 ptr val mem) -> (AddTupleFirst32 (XADDLlock val ptr mem) val)
-(AtomicAdd64 ptr val mem) -> (AddTupleFirst64 (XADDQlock val ptr mem) val)
-(Select0 <t> (AddTupleFirst32 tuple val)) -> (ADDL val (Select0 <t> tuple))
-(Select1     (AddTupleFirst32 tuple _  )) -> (Select1 tuple)
-(Select0 <t> (AddTupleFirst64 tuple val)) -> (ADDQ val (Select0 <t> tuple))
-(Select1     (AddTupleFirst64 tuple _  )) -> (Select1 tuple)
+(AtomicAdd32 ptr val mem) -> (AddTupleFirst32 val (XADDLlock val ptr mem))
+(AtomicAdd64 ptr val mem) -> (AddTupleFirst64 val (XADDQlock val ptr mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) -> (ADDL val (Select0 <t> tuple))
+(Select1     (AddTupleFirst32   _ tuple)) -> (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) -> (ADDQ val (Select0 <t> tuple))
+(Select1     (AddTupleFirst64   _ tuple)) -> (Select1 tuple)
 
 // Atomic compare and swap.
 (AtomicCompareAndSwap32 ptr old new_ mem) -> (CMPXCHGLlock ptr old new_ mem)
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index ed77bb0..28131db 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -572,8 +572,8 @@
 		// Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)!
 		{name: "XADDLlock", argLength: 3, reg: gpstorexchg, asm: "XADDL", typ: "(UInt32,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
 		{name: "XADDQlock", argLength: 3, reg: gpstorexchg, asm: "XADDQ", typ: "(UInt64,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
-		{name: "AddTupleFirst32", argLength: 2}, // arg0=tuple <x,y>.  Returns <x+arg1,y>.
-		{name: "AddTupleFirst64", argLength: 2}, // arg0=tuple <x,y>.  Returns <x+arg1,y>.
+		{name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>.  Returns <x+arg0,y>.
+		{name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>.  Returns <x+arg0,y>.
 
 		// Compare and swap.
 		// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
index aed4f5c..4ae21cd 100644
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -120,12 +120,12 @@
 (AtomicStorePtrNoWB ptr val mem) -> (MOVDatomicstore ptr val mem)
 
 // Atomic adds.
-(AtomicAdd32 ptr val mem) -> (AddTupleFirst32 (LAA ptr val mem) val)
-(AtomicAdd64 ptr val mem) -> (AddTupleFirst64 (LAAG ptr val mem) val)
-(Select0 <t> (AddTupleFirst32 tuple val)) -> (ADDW val (Select0 <t> tuple))
-(Select1     (AddTupleFirst32 tuple _  )) -> (Select1 tuple)
-(Select0 <t> (AddTupleFirst64 tuple val)) -> (ADD val (Select0 <t> tuple))
-(Select1     (AddTupleFirst64 tuple _  )) -> (Select1 tuple)
+(AtomicAdd32 ptr val mem) -> (AddTupleFirst32 val (LAA ptr val mem))
+(AtomicAdd64 ptr val mem) -> (AddTupleFirst64 val (LAAG ptr val mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) -> (ADDW val (Select0 <t> tuple))
+(Select1     (AddTupleFirst32   _ tuple)) -> (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) -> (ADD val (Select0 <t> tuple))
+(Select1     (AddTupleFirst64   _ tuple)) -> (Select1 tuple)
 
 // Atomic exchanges.
 (AtomicExchange32 ptr val mem) -> (LoweredAtomicExchange32 ptr val mem)
diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go
index c3edb93..2a08a27 100644
--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -467,8 +467,8 @@
 		// Returns a tuple of <old contents of *(arg0+auxint+aux), memory>.
 		{name: "LAA", argLength: 3, reg: gpstorelaa, asm: "LAA", typ: "(UInt32,Mem)", aux: "SymOff", faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
 		{name: "LAAG", argLength: 3, reg: gpstorelaa, asm: "LAAG", typ: "(UInt64,Mem)", aux: "SymOff", faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
-		{name: "AddTupleFirst32", argLength: 2}, // arg0=tuple <x,y>.  Returns <x+arg1,y>.
-		{name: "AddTupleFirst64", argLength: 2}, // arg0=tuple <x,y>.  Returns <x+arg1,y>.
+		{name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>.  Returns <x+arg0,y>.
+		{name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>.  Returns <x+arg0,y>.
 
 		// Compare and swap.
 		// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
diff --git a/src/cmd/compile/internal/ssa/loopreschedchecks.go b/src/cmd/compile/internal/ssa/loopreschedchecks.go
index 8ffca82..98b6e92 100644
--- a/src/cmd/compile/internal/ssa/loopreschedchecks.go
+++ b/src/cmd/compile/internal/ssa/loopreschedchecks.go
@@ -391,10 +391,6 @@
 			}
 			if v.Type.IsMemory() {
 				stores = append(stores, v)
-				if v.Op == OpSelect1 {
-					// Use the arg of the tuple-generating op.
-					v = v.Args[0]
-				}
 				for _, a := range v.Args {
 					if a.Block == b && a.Type.IsMemory() {
 						storeUse.add(a.ID)
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index ff24103..9a69a31 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -35902,18 +35902,18 @@
 	_ = typ
 	// match: (AtomicAdd32 ptr val mem)
 	// cond:
-	// result: (AddTupleFirst32 (XADDLlock val ptr mem) val)
+	// result: (AddTupleFirst32 val (XADDLlock val ptr mem))
 	for {
 		ptr := v.Args[0]
 		val := v.Args[1]
 		mem := v.Args[2]
 		v.reset(OpAMD64AddTupleFirst32)
+		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpAMD64XADDLlock, types.NewTuple(typ.UInt32, types.TypeMem))
 		v0.AddArg(val)
 		v0.AddArg(ptr)
 		v0.AddArg(mem)
 		v.AddArg(v0)
-		v.AddArg(val)
 		return true
 	}
 }
@@ -35924,18 +35924,18 @@
 	_ = typ
 	// match: (AtomicAdd64 ptr val mem)
 	// cond:
-	// result: (AddTupleFirst64 (XADDQlock val ptr mem) val)
+	// result: (AddTupleFirst64 val (XADDQlock val ptr mem))
 	for {
 		ptr := v.Args[0]
 		val := v.Args[1]
 		mem := v.Args[2]
 		v.reset(OpAMD64AddTupleFirst64)
+		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpAMD64XADDQlock, types.NewTuple(typ.UInt64, types.TypeMem))
 		v0.AddArg(val)
 		v0.AddArg(ptr)
 		v0.AddArg(mem)
 		v.AddArg(v0)
-		v.AddArg(val)
 		return true
 	}
 }
@@ -40216,7 +40216,7 @@
 func rewriteValueAMD64_OpSelect0_0(v *Value) bool {
 	b := v.Block
 	_ = b
-	// match: (Select0 <t> (AddTupleFirst32 tuple val))
+	// match: (Select0 <t> (AddTupleFirst32 val tuple))
 	// cond:
 	// result: (ADDL val (Select0 <t> tuple))
 	for {
@@ -40225,8 +40225,8 @@
 		if v_0.Op != OpAMD64AddTupleFirst32 {
 			break
 		}
-		tuple := v_0.Args[0]
-		val := v_0.Args[1]
+		val := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpAMD64ADDL)
 		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpSelect0, t)
@@ -40234,7 +40234,7 @@
 		v.AddArg(v0)
 		return true
 	}
-	// match: (Select0 <t> (AddTupleFirst64 tuple val))
+	// match: (Select0 <t> (AddTupleFirst64 val tuple))
 	// cond:
 	// result: (ADDQ val (Select0 <t> tuple))
 	for {
@@ -40243,8 +40243,8 @@
 		if v_0.Op != OpAMD64AddTupleFirst64 {
 			break
 		}
-		tuple := v_0.Args[0]
-		val := v_0.Args[1]
+		val := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpAMD64ADDQ)
 		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpSelect0, t)
@@ -40255,7 +40255,7 @@
 	return false
 }
 func rewriteValueAMD64_OpSelect1_0(v *Value) bool {
-	// match: (Select1 (AddTupleFirst32 tuple _))
+	// match: (Select1 (AddTupleFirst32 _ tuple))
 	// cond:
 	// result: (Select1 tuple)
 	for {
@@ -40263,12 +40263,12 @@
 		if v_0.Op != OpAMD64AddTupleFirst32 {
 			break
 		}
-		tuple := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpSelect1)
 		v.AddArg(tuple)
 		return true
 	}
-	// match: (Select1 (AddTupleFirst64 tuple _))
+	// match: (Select1 (AddTupleFirst64 _ tuple))
 	// cond:
 	// result: (Select1 tuple)
 	for {
@@ -40276,7 +40276,7 @@
 		if v_0.Op != OpAMD64AddTupleFirst64 {
 			break
 		}
-		tuple := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpSelect1)
 		v.AddArg(tuple)
 		return true
diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go
index 1929f54..f07ca9a 100644
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@@ -898,18 +898,18 @@
 	_ = typ
 	// match: (AtomicAdd32 ptr val mem)
 	// cond:
-	// result: (AddTupleFirst32 (LAA ptr val mem) val)
+	// result: (AddTupleFirst32 val (LAA ptr val mem))
 	for {
 		ptr := v.Args[0]
 		val := v.Args[1]
 		mem := v.Args[2]
 		v.reset(OpS390XAddTupleFirst32)
+		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpS390XLAA, types.NewTuple(typ.UInt32, types.TypeMem))
 		v0.AddArg(ptr)
 		v0.AddArg(val)
 		v0.AddArg(mem)
 		v.AddArg(v0)
-		v.AddArg(val)
 		return true
 	}
 }
@@ -920,18 +920,18 @@
 	_ = typ
 	// match: (AtomicAdd64 ptr val mem)
 	// cond:
-	// result: (AddTupleFirst64 (LAAG ptr val mem) val)
+	// result: (AddTupleFirst64 val (LAAG ptr val mem))
 	for {
 		ptr := v.Args[0]
 		val := v.Args[1]
 		mem := v.Args[2]
 		v.reset(OpS390XAddTupleFirst64)
+		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpS390XLAAG, types.NewTuple(typ.UInt64, types.TypeMem))
 		v0.AddArg(ptr)
 		v0.AddArg(val)
 		v0.AddArg(mem)
 		v.AddArg(v0)
-		v.AddArg(val)
 		return true
 	}
 }
@@ -34159,7 +34159,7 @@
 func rewriteValueS390X_OpSelect0_0(v *Value) bool {
 	b := v.Block
 	_ = b
-	// match: (Select0 <t> (AddTupleFirst32 tuple val))
+	// match: (Select0 <t> (AddTupleFirst32 val tuple))
 	// cond:
 	// result: (ADDW val (Select0 <t> tuple))
 	for {
@@ -34168,8 +34168,8 @@
 		if v_0.Op != OpS390XAddTupleFirst32 {
 			break
 		}
-		tuple := v_0.Args[0]
-		val := v_0.Args[1]
+		val := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpS390XADDW)
 		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpSelect0, t)
@@ -34177,7 +34177,7 @@
 		v.AddArg(v0)
 		return true
 	}
-	// match: (Select0 <t> (AddTupleFirst64 tuple val))
+	// match: (Select0 <t> (AddTupleFirst64 val tuple))
 	// cond:
 	// result: (ADD val (Select0 <t> tuple))
 	for {
@@ -34186,8 +34186,8 @@
 		if v_0.Op != OpS390XAddTupleFirst64 {
 			break
 		}
-		tuple := v_0.Args[0]
-		val := v_0.Args[1]
+		val := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpS390XADD)
 		v.AddArg(val)
 		v0 := b.NewValue0(v.Pos, OpSelect0, t)
@@ -34198,7 +34198,7 @@
 	return false
 }
 func rewriteValueS390X_OpSelect1_0(v *Value) bool {
-	// match: (Select1 (AddTupleFirst32 tuple _))
+	// match: (Select1 (AddTupleFirst32 _ tuple))
 	// cond:
 	// result: (Select1 tuple)
 	for {
@@ -34206,12 +34206,12 @@
 		if v_0.Op != OpS390XAddTupleFirst32 {
 			break
 		}
-		tuple := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpSelect1)
 		v.AddArg(tuple)
 		return true
 	}
-	// match: (Select1 (AddTupleFirst64 tuple _))
+	// match: (Select1 (AddTupleFirst64 _ tuple))
 	// cond:
 	// result: (Select1 tuple)
 	for {
@@ -34219,7 +34219,7 @@
 		if v_0.Op != OpS390XAddTupleFirst64 {
 			break
 		}
-		tuple := v_0.Args[0]
+		tuple := v_0.Args[1]
 		v.reset(OpSelect1)
 		v.AddArg(tuple)
 		return true
diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go
index 2e9464e..c44c243 100644
--- a/src/cmd/compile/internal/ssa/schedule.go
+++ b/src/cmd/compile/internal/ssa/schedule.go
@@ -132,19 +132,14 @@
 		}
 	}
 
-	// TODO: make this logic permanent in types.IsMemory?
-	isMem := func(v *Value) bool {
-		return v.Type.IsMemory() || v.Type.IsTuple() && v.Type.FieldType(1).IsMemory()
-	}
-
 	for _, b := range f.Blocks {
 		// Find store chain for block.
 		// Store chains for different blocks overwrite each other, so
 		// the calculated store chain is good only for this block.
 		for _, v := range b.Values {
-			if v.Op != OpPhi && isMem(v) {
+			if v.Op != OpPhi && v.Type.IsMemory() {
 				for _, w := range v.Args {
-					if isMem(w) {
+					if w.Type.IsMemory() {
 						nextMem[w.ID] = v
 					}
 				}
@@ -164,7 +159,7 @@
 					uses[w.ID]++
 				}
 				// Any load must come before the following store.
-				if !isMem(v) && isMem(w) {
+				if !v.Type.IsMemory() && w.Type.IsMemory() {
 					// v is a load.
 					s := nextMem[w.ID]
 					if s == nil || s.Block != b {
@@ -315,11 +310,7 @@
 			if v.Op == OpInitMem || v.Op == OpPhi {
 				continue
 			}
-			a := v
-			if v.Op == OpSelect1 {
-				a = a.Args[0]
-			}
-			sset.add(a.MemoryArg().ID) // record that v's memory arg is used
+			sset.add(v.MemoryArg().ID) // record that v's memory arg is used
 		}
 		if v.Op == OpNilCheck {
 			hasNilCheck = true
@@ -335,7 +326,7 @@
 	for _, v := range stores {
 		if !sset.contains(v.ID) {
 			if last != nil {
-				f.Fatalf("two stores live simutaneously: %v and %v", v, last)
+				f.Fatalf("two stores live simultaneously: %v and %v", v, last)
 			}
 			last = v
 		}
@@ -362,9 +353,6 @@
 			}
 			break
 		}
-		if w.Op == OpSelect1 {
-			w = w.Args[0]
-		}
 		w = w.MemoryArg()
 	}
 	var stack []*Value
diff --git a/src/cmd/compile/internal/ssa/trim.go b/src/cmd/compile/internal/ssa/trim.go
index 09e80bd..04b4fd4 100644
--- a/src/cmd/compile/internal/ssa/trim.go
+++ b/src/cmd/compile/internal/ssa/trim.go
@@ -46,10 +46,24 @@
 						v.resetArgs()
 						continue
 					}
-					// Pad the arguments of the remaining phi-ops, so
+					// Pad the arguments of the remaining phi-ops so
 					// they match the new predecessor count of `s`.
-					for len(v.Args) < len(s.Preds) {
-						v.AddArg(v.Args[0])
+					// Since s did not have a Phi op corresponding to
+					// the phi op in b, the other edges coming into s
+					// must be loopback edges from s, so v is the right
+					// argument to v!
+					args := make([]*Value, len(v.Args))
+					copy(args, v.Args)
+					v.resetArgs()
+					for x := 0; x < j; x++ {
+						v.AddArg(v)
+					}
+					v.AddArg(args[0])
+					for x := j + 1; x < ns; x++ {
+						v.AddArg(v)
+					}
+					for _, a := range args[1:] {
+						v.AddArg(a)
 					}
 				}
 				b.Values[k] = v
diff --git a/src/cmd/compile/internal/ssa/value.go b/src/cmd/compile/internal/ssa/value.go
index a0ba112..7edc71b 100644
--- a/src/cmd/compile/internal/ssa/value.go
+++ b/src/cmd/compile/internal/ssa/value.go
@@ -319,10 +319,8 @@
 }
 
 // MemoryArg returns the memory argument for the Value.
-// The returned value, if non-nil, will be memory-typed,
-// except in the case where v is Select1, in which case
-// the returned value will be a tuple containing a memory
-// type. Otherwise, nil is returned.
+// The returned value, if non-nil, will be memory-typed (or a tuple with a memory-typed second part).
+// Otherwise, nil is returned.
 func (v *Value) MemoryArg() *Value {
 	if v.Op == OpPhi {
 		v.Fatalf("MemoryArg on Phi")
@@ -331,8 +329,7 @@
 	if na == 0 {
 		return nil
 	}
-	if m := v.Args[na-1]; m.Type.IsMemory() ||
-		(v.Op == OpSelect1 && m.Type.FieldType(1).IsMemory()) {
+	if m := v.Args[na-1]; m.Type.IsMemory() {
 		return m
 	}
 	return nil
diff --git a/src/cmd/compile/internal/types/type.go b/src/cmd/compile/internal/types/type.go
index 6f2f574..5c44e62 100644
--- a/src/cmd/compile/internal/types/type.go
+++ b/src/cmd/compile/internal/types/type.go
@@ -1324,10 +1324,12 @@
 	return t.Extra.(*Chan).Dir
 }
 
-func (t *Type) IsMemory() bool { return t == TypeMem }
-func (t *Type) IsFlags() bool  { return t == TypeFlags }
-func (t *Type) IsVoid() bool   { return t == TypeVoid }
-func (t *Type) IsTuple() bool  { return t.Etype == TTUPLE }
+func (t *Type) IsMemory() bool {
+	return t == TypeMem || t.Etype == TTUPLE && t.Extra.(*Tuple).second == TypeMem
+}
+func (t *Type) IsFlags() bool { return t == TypeFlags }
+func (t *Type) IsVoid() bool  { return t == TypeVoid }
+func (t *Type) IsTuple() bool { return t.Etype == TTUPLE }
 
 // IsUntyped reports whether t is an untyped type.
 func (t *Type) IsUntyped() bool {