[release-branch.go1.19] cmd/compile: fix bswap/load rewrite rules

When combining a byteswap and a load, the resulting combined op
must go in the load's block, not the byteswap's block, as the load
has a memory argument that might only be valid in its original block.

Fixes #59974

Change-Id: Icd84863ef3a9ca1fc22f2bb794a003f2808c746f
Reviewed-on: https://go-review.googlesource.com/c/go/+/492616
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Wayne Zuo <wdvxdr@golangcn.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-on: https://go-review.googlesource.com/c/go/+/492697
TryBot-Bypass: Cherry Mui <cherryyz@google.com>
Run-TryBot: Cherry Mui <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index 1c54b8c..74d92a0 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -2275,10 +2275,10 @@
 (BSWAP(Q|L) (BSWAP(Q|L) p)) => p
 
 // CPUID feature: MOVBE.
-(MOV(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem)
-(BSWAP(Q|L) x:(MOV(Q|L)load [i] {s} p mem))    && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)load [i] {s} p mem)
-(BSWAP(Q|L) (MOVBE(Q|L)load [i] {s} p m))    => (MOV(Q|L)load [i] {s} p m)
-(MOVBE(Q|L)store [i] {s} p (BSWAP(Q|L) x) m) => (MOV(Q|L)store [i] {s} p x m)
+(MOV(Q|L)store   [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem)
+(MOVBE(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1                          => (MOV(Q|L)store   [i] {s} p w mem)
+(BSWAP(Q|L) x:(MOV(Q|L)load   [i] {s} p mem))  && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => @x.Block (MOVBE(Q|L)load [i] {s} p mem)
+(BSWAP(Q|L) x:(MOVBE(Q|L)load [i] {s} p mem))  && x.Uses == 1                          => @x.Block (MOV(Q|L)load   [i] {s} p mem)
 (MOVWstore [i] {s} p x:(ROLWconst [8] w) mem)   && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBEWstore [i] {s} p w mem)
 (MOVBEWstore [i] {s} p x:(ROLWconst [8] w) mem) && x.Uses == 1 => (MOVWstore [i] {s} p w mem)
 
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index f085308..cc7f75e 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3721,6 +3721,8 @@
 }
 func rewriteValueAMD64_OpAMD64BSWAPL(v *Value) bool {
 	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
 	// match: (BSWAPL (BSWAPL p))
 	// result: p
 	for {
@@ -3733,7 +3735,7 @@
 	}
 	// match: (BSWAPL x:(MOVLload [i] {s} p mem))
 	// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
-	// result: (MOVBELload [i] {s} p mem)
+	// result: @x.Block (MOVBELload [i] {s} p mem)
 	for {
 		x := v_0
 		if x.Op != OpAMD64MOVLload {
@@ -3746,32 +3748,43 @@
 		if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
 			break
 		}
-		v.reset(OpAMD64MOVBELload)
-		v.AuxInt = int32ToAuxInt(i)
-		v.Aux = symToAux(s)
-		v.AddArg2(p, mem)
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64MOVBELload, typ.UInt32)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(i)
+		v0.Aux = symToAux(s)
+		v0.AddArg2(p, mem)
 		return true
 	}
-	// match: (BSWAPL (MOVBELload [i] {s} p m))
-	// result: (MOVLload [i] {s} p m)
+	// match: (BSWAPL x:(MOVBELload [i] {s} p mem))
+	// cond: x.Uses == 1
+	// result: @x.Block (MOVLload [i] {s} p mem)
 	for {
-		if v_0.Op != OpAMD64MOVBELload {
+		x := v_0
+		if x.Op != OpAMD64MOVBELload {
 			break
 		}
-		i := auxIntToInt32(v_0.AuxInt)
-		s := auxToSym(v_0.Aux)
-		m := v_0.Args[1]
-		p := v_0.Args[0]
-		v.reset(OpAMD64MOVLload)
-		v.AuxInt = int32ToAuxInt(i)
-		v.Aux = symToAux(s)
-		v.AddArg2(p, m)
+		i := auxIntToInt32(x.AuxInt)
+		s := auxToSym(x.Aux)
+		mem := x.Args[1]
+		p := x.Args[0]
+		if !(x.Uses == 1) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64MOVLload, typ.UInt32)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(i)
+		v0.Aux = symToAux(s)
+		v0.AddArg2(p, mem)
 		return true
 	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64BSWAPQ(v *Value) bool {
 	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
 	// match: (BSWAPQ (BSWAPQ p))
 	// result: p
 	for {
@@ -3784,7 +3797,7 @@
 	}
 	// match: (BSWAPQ x:(MOVQload [i] {s} p mem))
 	// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
-	// result: (MOVBEQload [i] {s} p mem)
+	// result: @x.Block (MOVBEQload [i] {s} p mem)
 	for {
 		x := v_0
 		if x.Op != OpAMD64MOVQload {
@@ -3797,26 +3810,35 @@
 		if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
 			break
 		}
-		v.reset(OpAMD64MOVBEQload)
-		v.AuxInt = int32ToAuxInt(i)
-		v.Aux = symToAux(s)
-		v.AddArg2(p, mem)
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64MOVBEQload, typ.UInt64)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(i)
+		v0.Aux = symToAux(s)
+		v0.AddArg2(p, mem)
 		return true
 	}
-	// match: (BSWAPQ (MOVBEQload [i] {s} p m))
-	// result: (MOVQload [i] {s} p m)
+	// match: (BSWAPQ x:(MOVBEQload [i] {s} p mem))
+	// cond: x.Uses == 1
+	// result: @x.Block (MOVQload [i] {s} p mem)
 	for {
-		if v_0.Op != OpAMD64MOVBEQload {
+		x := v_0
+		if x.Op != OpAMD64MOVBEQload {
 			break
 		}
-		i := auxIntToInt32(v_0.AuxInt)
-		s := auxToSym(v_0.Aux)
-		m := v_0.Args[1]
-		p := v_0.Args[0]
-		v.reset(OpAMD64MOVQload)
-		v.AuxInt = int32ToAuxInt(i)
-		v.Aux = symToAux(s)
-		v.AddArg2(p, m)
+		i := auxIntToInt32(x.AuxInt)
+		s := auxToSym(x.Aux)
+		mem := x.Args[1]
+		p := x.Args[0]
+		if !(x.Uses == 1) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64MOVQload, typ.UInt64)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(i)
+		v0.Aux = symToAux(s)
+		v0.AddArg2(p, mem)
 		return true
 	}
 	return false
@@ -9652,21 +9674,26 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
-	// match: (MOVBELstore [i] {s} p (BSWAPL x) m)
-	// result: (MOVLstore [i] {s} p x m)
+	// match: (MOVBELstore [i] {s} p x:(BSWAPL w) mem)
+	// cond: x.Uses == 1
+	// result: (MOVLstore [i] {s} p w mem)
 	for {
 		i := auxIntToInt32(v.AuxInt)
 		s := auxToSym(v.Aux)
 		p := v_0
-		if v_1.Op != OpAMD64BSWAPL {
+		x := v_1
+		if x.Op != OpAMD64BSWAPL {
 			break
 		}
-		x := v_1.Args[0]
-		m := v_2
+		w := x.Args[0]
+		mem := v_2
+		if !(x.Uses == 1) {
+			break
+		}
 		v.reset(OpAMD64MOVLstore)
 		v.AuxInt = int32ToAuxInt(i)
 		v.Aux = symToAux(s)
-		v.AddArg3(p, x, m)
+		v.AddArg3(p, w, mem)
 		return true
 	}
 	return false
@@ -9675,21 +9702,26 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
-	// match: (MOVBEQstore [i] {s} p (BSWAPQ x) m)
-	// result: (MOVQstore [i] {s} p x m)
+	// match: (MOVBEQstore [i] {s} p x:(BSWAPQ w) mem)
+	// cond: x.Uses == 1
+	// result: (MOVQstore [i] {s} p w mem)
 	for {
 		i := auxIntToInt32(v.AuxInt)
 		s := auxToSym(v.Aux)
 		p := v_0
-		if v_1.Op != OpAMD64BSWAPQ {
+		x := v_1
+		if x.Op != OpAMD64BSWAPQ {
 			break
 		}
-		x := v_1.Args[0]
-		m := v_2
+		w := x.Args[0]
+		mem := v_2
+		if !(x.Uses == 1) {
+			break
+		}
 		v.reset(OpAMD64MOVQstore)
 		v.AuxInt = int32ToAuxInt(i)
 		v.Aux = symToAux(s)
-		v.AddArg3(p, x, m)
+		v.AddArg3(p, w, mem)
 		return true
 	}
 	return false