cmd/compile: fix ephemeral pointer problem on amd64

Make sure we don't use the rewrite ptr + (c + x) -> c + (ptr + x): the
intermediate ptr + x value it materializes may be an ephemeral
out-of-bounds pointer.
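
For intuition, a hedged illustration in rule notation (hypothetical
input, not from a real program): a load of a[i-1] computes the address
a + i + (-1), i.e.

    (MOVBload [0] {s} (LEAQ1 [-1] a i) mem)

The offset-hoisting rules deleted below rewrote this to

    (MOVBload [-1] {s} (LEAQ1 [0] a i) mem)

which materializes a+i as a value of its own. When i == len(a), a+i
points one past the end of a's backing store, so a stack scan or
preemption that observes that register sees a pointer that may land in
the next object.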

I have not seen an actual bug caused by this on amd64 yet, but we've
seen such bugs in the 386 port, so I'm fixing this issue for amd64 as
well.

The load-combining rules needed to be reworked somewhat so that they
still apply without the broken rule above.
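
Abridged sketch of the new shape (Uses/clobber conditions elided; the
full rules are in the diff below): each merging rule is split into a
same-pointer form keyed on the constant offsets, plus a form with
equal offsets in which sequentialAddresses proves the two pointers are
adjacent:

    (OR(L|Q)                  x0:(MOVBload [i0] {s} p mem)
        sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem)))
      && i1 == i0+1 && ...
      -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)

    (OR(L|Q)                  x0:(MOVBload [i] {s} p0 mem)
        sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem)))
      && sequentialAddresses(p0, p1, 1) && ...
      -> @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)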

Update #37881

Change-Id: I8046d170e89e2035195f261535e34ca7d8aca68a
Reviewed-on: https://go-review.googlesource.com/c/go/+/226437
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index ca5962f..b5133d6 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -1491,65 +1491,70 @@
 
 // Little-endian loads
 
-(ORL                  x0:(MOVBload [i0] {s} p0 mem)
-    sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p1 mem)))
+(OR(L|Q)                  x0:(MOVBload [i0] {s} p mem)
+    sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem)))
   && i1 == i0+1
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
 
-(ORQ                  x0:(MOVBload [i0] {s} p0 mem)
-    sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-  && i1 == i0+1
+(OR(L|Q)                  x0:(MOVBload [i] {s} p0 mem)
+    sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
 
-(ORL                   x0:(MOVWload [i0] {s} p0 mem)
-    sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p1 mem)))
+(OR(L|Q)                   x0:(MOVWload [i0] {s} p mem)
+    sh:(SHL(L|Q)const [16] x1:(MOVWload [i1] {s} p mem)))
   && i1 == i0+2
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
 
-(ORQ                   x0:(MOVWload [i0] {s} p0 mem)
-    sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-  && i1 == i0+2
+(OR(L|Q)                   x0:(MOVWload [i] {s} p0 mem)
+    sh:(SHL(L|Q)const [16] x1:(MOVWload [i] {s} p1 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 2)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
 
-(ORQ                   x0:(MOVLload [i0] {s} p0 mem)
-    sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p1 mem)))
+(ORQ                   x0:(MOVLload [i0] {s} p mem)
+    sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
   && i1 == i0+4
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVQload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
 
-(ORL
-    s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p0 mem))
-    or:(ORL
-        s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p1 mem))
+(ORQ                   x0:(MOVLload [i] {s} p0 mem)
+    sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && sh.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && mergePoint(b,x0,x1) != nil
+  && clobber(x0, x1, sh)
+  -> @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+
+(OR(L|Q)
+    s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+    or:(OR(L|Q)
+        s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
 	y))
   && i1 == i0+1
   && j1 == j0+8
@@ -1559,17 +1564,15 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
 
-(ORQ
-    s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p0 mem))
-    or:(ORQ
-        s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p1 mem))
+(OR(L|Q)
+    s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+    or:(OR(L|Q)
+        s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
 	y))
-  && i1 == i0+1
   && j1 == j0+8
   && j0 % 16 == 0
   && x0.Uses == 1
@@ -1577,15 +1580,15 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
 
 (ORQ
-    s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p0 mem))
+    s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem))
     or:(ORQ
-        s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p1 mem))
+        s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem))
 	y))
   && i1 == i0+2
   && j1 == j0+16
@@ -1595,106 +1598,107 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p0 mem)) y)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
 
-// Little-endian indexed loads
-
-// Move constants offsets from LEAQx up into load. This lets the above combining
-// rules discover indexed load-combining instances.
-//TODO:remove! These rules are bad.
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
+(ORQ
+    s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem))
+    or:(ORQ
+        s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem))
+	y))
+  && j1 == j0+16
+  && j0 % 32 == 0
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && or.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && mergePoint(b,x0,x1,y) != nil
+  && clobber(x0, x1, s0, s1, or)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
 
 // Big-endian loads
 
-(ORL
-                       x1:(MOVBload [i1] {s} p0 mem)
-    sh:(SHLLconst [8]  x0:(MOVBload [i0] {s} p1 mem)))
+(OR(L|Q)
+                           x1:(MOVBload [i1] {s} p mem)
+    sh:(SHL(L|Q)const [8]  x0:(MOVBload [i0] {s} p mem)))
   && i1 == i0+1
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
 
-(ORQ
-                       x1:(MOVBload [i1] {s} p0 mem)
-    sh:(SHLQconst [8]  x0:(MOVBload [i0] {s} p1 mem)))
-  && i1 == i0+1
+(OR(L|Q)
+                           x1:(MOVBload [i] {s} p1 mem)
+    sh:(SHL(L|Q)const [8]  x0:(MOVBload [i] {s} p0 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
 
-(ORL
-                        r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem))
-    sh:(SHLLconst [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
+(OR(L|Q)
+                            r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))
+    sh:(SHL(L|Q)const [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
   && i1 == i0+2
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
 
-(ORQ
-                        r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem))
-    sh:(SHLQconst [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-  && i1 == i0+2
+(OR(L|Q)
+                            r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))
+    sh:(SHL(L|Q)const [16]  r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 2)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
 
 (ORQ
-                        r1:(BSWAPL x1:(MOVLload [i1] {s} p0 mem))
-    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i0] {s} p1 mem))))
+                        r1:(BSWAPL x1:(MOVLload [i1] {s} p mem))
+    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
   && i1 == i0+4
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
 
-(ORL
-    s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p0 mem))
-    or:(ORL
-        s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p1 mem))
+(ORQ
+                        r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem))
+    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && r0.Uses == 1
+  && r1.Uses == 1
+  && sh.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && mergePoint(b,x0,x1) != nil
+  && clobber(x0, x1, r0, r1, sh)
+  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
+
+(OR(L|Q)
+    s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+    or:(OR(L|Q)
+        s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
 	y))
   && i1 == i0+1
   && j1 == j0-8
@@ -1704,17 +1708,15 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
 
-(ORQ
-    s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p0 mem))
-    or:(ORQ
-        s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p1 mem))
+(OR(L|Q)
+    s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+    or:(OR(L|Q)
+        s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
 	y))
-  && i1 == i0+1
   && j1 == j0-8
   && j1 % 16 == 0
   && x0.Uses == 1
@@ -1722,15 +1724,15 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
 
 (ORQ
-    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p0 mem)))
+    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem)))
     or:(ORQ
-        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p1 mem)))
+        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)))
 	y))
   && i1 == i0+2
   && j1 == j0-16
@@ -1742,41 +1744,73 @@
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, r0, r1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
+
+(ORQ
+    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem)))
+    or:(ORQ
+        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)))
+	y))
+  && j1 == j0-16
+  && j1 % 32 == 0
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && r0.Uses == 1
+  && r1.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && or.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && mergePoint(b,x0,x1,y) != nil
+  && clobber(x0, x1, r0, r1, s0, s1, or)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
 
 // Combine 2 byte stores + shift into rolw 8 + word store
-(MOVBstore [i] {s} p1 w
-  x0:(MOVBstore [i-1] {s} p0 (SHRWconst [8] w) mem))
+(MOVBstore [i] {s} p w
+  x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
   && x0.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x0)
-  -> (MOVWstore [i-1] {s} p0 (ROLWconst <w.Type> [8] w) mem)
+  -> (MOVWstore [i-1] {s} p (ROLWconst <w.Type> [8] w) mem)
+(MOVBstore [i] {s} p1 w
+  x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
+  && x0.Uses == 1
+  && sequentialAddresses(p0, p1, 1)
+  && clobber(x0)
+  -> (MOVWstore [i] {s} p0 (ROLWconst <w.Type> [8] w) mem)
 
 // Combine stores + shifts into bswap and larger (unaligned) stores
-(MOVBstore [i] {s} p3 w
-  x2:(MOVBstore [i-1] {s} p2 (SHRLconst [8] w)
-  x1:(MOVBstore [i-2] {s} p1 (SHRLconst [16] w)
-  x0:(MOVBstore [i-3] {s} p0 (SHRLconst [24] w) mem))))
+(MOVBstore [i] {s} p w
+  x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+  x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+  x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
   && x0.Uses == 1
   && x1.Uses == 1
   && x2.Uses == 1
-  && same(p0, p1, 1)
-  && same(p1, p2, 1)
-  && same(p2, p3, 1)
   && clobber(x0, x1, x2)
-  -> (MOVLstore [i-3] {s} p0 (BSWAPL <w.Type> w) mem)
+  -> (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
+(MOVBstore [i] {s} p3 w
+  x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w)
+  x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w)
+  x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && sequentialAddresses(p0, p1, 1)
+  && sequentialAddresses(p1, p2, 1)
+  && sequentialAddresses(p2, p3, 1)
+  && clobber(x0, x1, x2)
+  -> (MOVLstore [i] {s} p0 (BSWAPL <w.Type> w) mem)
 
-(MOVBstore [i] {s} p7 w
-  x6:(MOVBstore [i-1] {s} p6 (SHRQconst [8] w)
-  x5:(MOVBstore [i-2] {s} p5 (SHRQconst [16] w)
-  x4:(MOVBstore [i-3] {s} p4 (SHRQconst [24] w)
-  x3:(MOVBstore [i-4] {s} p3 (SHRQconst [32] w)
-  x2:(MOVBstore [i-5] {s} p2 (SHRQconst [40] w)
-  x1:(MOVBstore [i-6] {s} p1 (SHRQconst [48] w)
-  x0:(MOVBstore [i-7] {s} p0 (SHRQconst [56] w) mem))))))))
+(MOVBstore [i] {s} p w
+  x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+  x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+  x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+  x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+  x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+  x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+  x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
   && x0.Uses == 1
   && x1.Uses == 1
   && x2.Uses == 1
@@ -1784,99 +1818,139 @@
   && x4.Uses == 1
   && x5.Uses == 1
   && x6.Uses == 1
-  && same(p0, p1, 1)
-  && same(p1, p2, 1)
-  && same(p2, p3, 1)
-  && same(p3, p4, 1)
-  && same(p4, p5, 1)
-  && same(p5, p6, 1)
-  && same(p6, p7, 1)
   && clobber(x0, x1, x2, x3, x4, x5, x6)
-  -> (MOVQstore [i-7] {s} p0 (BSWAPQ <w.Type> w) mem)
+  -> (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
+(MOVBstore [i] {s} p7 w
+  x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w)
+  x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w)
+  x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w)
+  x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w)
+  x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w)
+  x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w)
+  x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && sequentialAddresses(p0, p1, 1)
+  && sequentialAddresses(p1, p2, 1)
+  && sequentialAddresses(p2, p3, 1)
+  && sequentialAddresses(p3, p4, 1)
+  && sequentialAddresses(p4, p5, 1)
+  && sequentialAddresses(p5, p6, 1)
+  && sequentialAddresses(p6, p7, 1)
+  && clobber(x0, x1, x2, x3, x4, x5, x6)
+  -> (MOVQstore [i] {s} p0 (BSWAPQ <w.Type> w) mem)
 
 // Combine constant stores into larger (unaligned) stores.
-(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
-(MOVBstoreconst [a] {s} p1 x:(MOVBstoreconst [c] {s} p0 mem))
+  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
+(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
-(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
+  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
-(MOVWstoreconst [a] {s} p1 x:(MOVWstoreconst [c] {s} p0 mem))
+  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
+(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
-(MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem))
+  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
+(MOVLstoreconst [c] {s} p x:(MOVLstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
-(MOVLstoreconst [a] {s} p1 x:(MOVLstoreconst [c] {s} p0 mem))
+  -> (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+(MOVLstoreconst [a] {s} p x:(MOVLstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
-(MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [c2] {s} p0 mem))
+  -> (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+(MOVQstoreconst [c] {s} p x:(MOVQstoreconst [c2] {s} p mem))
   && config.useSSE
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off()
   && ValAndOff(c).Val() == 0
   && ValAndOff(c2).Val() == 0
   && clobber(x)
-  -> (MOVOstore [ValAndOff(c2).Off()] {s} p0 (MOVOconst [0]) mem)
+  -> (MOVOstore [ValAndOff(c2).Off()] {s} p (MOVOconst [0]) mem)
 
-// Combine stores into larger (unaligned) stores.
-(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
+// Combine stores into larger (unaligned) stores. Little endian.
+(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVWstore [i-1] {s} p0 w mem)
-(MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHR(W|L|Q)const [8] w) mem))
+  -> (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHR(W|L|Q)const [8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && clobber(x)
+  -> (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVWstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x)
   -> (MOVWstore [i] {s} p0 w mem)
-(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
+(MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x)
-  -> (MOVWstore [i-1] {s} p0 w0 mem)
-(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
+  -> (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x)
-  -> (MOVLstore [i-2] {s} p0 w mem)
-(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+  -> (MOVWstore [i] {s} p0 w0 mem)
+
+(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVLstore [i-2] {s} p0 w0 mem)
-(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p0 w mem))
+  -> (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVQstore [i-4] {s} p0 w mem)
-(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p0 w0:(SHRQconst [j-32] w) mem))
+  -> (MOVLstore [i-2] {s} p w0 mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i] {s} p0 w mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 2)
   && clobber(x)
-  -> (MOVQstore [i-4] {s} p0 w0 mem)
+  -> (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && clobber(x)
+  -> (MOVLstore [i] {s} p0 w0 mem)
+
+(MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVQstore [i-4] {s} p w mem)
+(MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVQstore [i-4] {s} p w0 mem)
+(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && clobber(x)
+  -> (MOVQstore [i] {s} p0 w mem)
+(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && clobber(x)
+  -> (MOVQstore [i] {s} p0 w0 mem)
 
 (MOVBstore [i] {s} p
   x1:(MOVBload [j] {s2} p2 mem)
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index fc03f0d..878b15e 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -1260,46 +1260,15 @@
 			x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
 		return true
 	}
+	if x.Op == OpAMD64ADDQ && y.Op == OpAMD64LEAQ1 && y.AuxInt == n && y.Aux == nil &&
+		(x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
+			x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
+		return true
+	}
+	if x.Op == OpAMD64LEAQ1 && y.Op == OpAMD64LEAQ1 && y.AuxInt == x.AuxInt+n && x.Aux == y.Aux &&
+		(x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
+			x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
+		return true
+	}
 	return false
 }
-
-// same reports whether x and y are the same value.
-// It checks to a maximum depth of d, so it may report
-// a false negative.
-// TODO: remove when amd64 port is switched to using sequentialAddresses
-func same(x, y *Value, depth int) bool {
-	if x == y {
-		return true
-	}
-	if depth <= 0 {
-		return false
-	}
-	if x.Op != y.Op || x.Aux != y.Aux || x.AuxInt != y.AuxInt {
-		return false
-	}
-	if len(x.Args) != len(y.Args) {
-		return false
-	}
-	if opcodeTable[x.Op].commutative {
-		// Check exchanged ordering first.
-		for i, a := range x.Args {
-			j := i
-			if j < 2 {
-				j ^= 1
-			}
-			b := y.Args[j]
-			if !same(a, b, depth-1) {
-				goto checkNormalOrder
-			}
-		}
-		return true
-	checkNormalOrder:
-	}
-	for i, a := range x.Args {
-		b := y.Args[i]
-		if !same(a, b, depth-1) {
-			return false
-		}
-	}
-	return true
-}
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index b9a401c..e4d8648 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -10140,7 +10140,6 @@
 func rewriteValueAMD64_OpAMD64MOVBload(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
-	b := v.Block
 	// match: (MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _))
 	// cond: sym == sym2 && off == off2 && isSamePtr(ptr, ptr2)
 	// result: (MOVBQZX x)
@@ -10205,118 +10204,6 @@
 		v.AddArg2(base, mem)
 		return true
 	}
-	// match: (MOVBload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVBload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVBload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVBload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
 	// match: (MOVBload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
 	// cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
 	// result: (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -10722,138 +10609,24 @@
 		v.AddArg3(base, val, mem)
 		return true
 	}
-	// match: (MOVBstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVBstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVBstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVBstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVBstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVBstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVBstore [i] {s} p1 w x0:(MOVBstore [i-1] {s} p0 (SHRWconst [8] w) mem))
-	// cond: x0.Uses == 1 && same(p0, p1, 1) && clobber(x0)
-	// result: (MOVWstore [i-1] {s} p0 (ROLWconst <w.Type> [8] w) mem)
+	// match: (MOVBstore [i] {s} p w x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
+	// cond: x0.Uses == 1 && clobber(x0)
+	// result: (MOVWstore [i-1] {s} p (ROLWconst <w.Type> [8] w) mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		w := v_1
 		x0 := v_2
 		if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i-1 || x0.Aux != s {
 			break
 		}
 		mem := x0.Args[2]
-		p0 := x0.Args[0]
+		if p != x0.Args[0] {
+			break
+		}
 		x0_1 := x0.Args[1]
-		if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && same(p0, p1, 1) && clobber(x0)) {
+		if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && clobber(x0)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
@@ -10862,23 +10635,52 @@
 		v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, w.Type)
 		v0.AuxInt = 8
 		v0.AddArg(w)
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p3 w x2:(MOVBstore [i-1] {s} p2 (SHRLconst [8] w) x1:(MOVBstore [i-2] {s} p1 (SHRLconst [16] w) x0:(MOVBstore [i-3] {s} p0 (SHRLconst [24] w) mem))))
-	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && clobber(x0, x1, x2)
-	// result: (MOVLstore [i-3] {s} p0 (BSWAPL <w.Type> w) mem)
+	// match: (MOVBstore [i] {s} p1 w x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
+	// cond: x0.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x0)
+	// result: (MOVWstore [i] {s} p0 (ROLWconst <w.Type> [8] w) mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p3 := v_0
+		p1 := v_0
+		w := v_1
+		x0 := v_2
+		if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
+			break
+		}
+		mem := x0.Args[2]
+		p0 := x0.Args[0]
+		x0_1 := x0.Args[1]
+		if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x0)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, w.Type)
+		v0.AuxInt = 8
+		v0.AddArg(w)
+		v.AddArg3(p0, v0, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p w x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w) x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w) x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0, x1, x2)
+	// result: (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p := v_0
 		w := v_1
 		x2 := v_2
 		if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i-1 || x2.Aux != s {
 			break
 		}
 		_ = x2.Args[2]
-		p2 := x2.Args[0]
+		if p != x2.Args[0] {
+			break
+		}
 		x2_1 := x2.Args[1]
 		if x2_1.Op != OpAMD64SHRLconst || x2_1.AuxInt != 8 || w != x2_1.Args[0] {
 			break
@@ -10888,7 +10690,9 @@
 			break
 		}
 		_ = x1.Args[2]
-		p1 := x1.Args[0]
+		if p != x1.Args[0] {
+			break
+		}
 		x1_1 := x1.Args[1]
 		if x1_1.Op != OpAMD64SHRLconst || x1_1.AuxInt != 16 || w != x1_1.Args[0] {
 			break
@@ -10898,9 +10702,11 @@
 			break
 		}
 		mem := x0.Args[2]
-		p0 := x0.Args[0]
+		if p != x0.Args[0] {
+			break
+		}
 		x0_1 := x0.Args[1]
-		if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && clobber(x0, x1, x2)) {
+		if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0, x1, x2)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstore)
@@ -10908,23 +10714,71 @@
 		v.Aux = s
 		v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, w.Type)
 		v0.AddArg(w)
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p7 w x6:(MOVBstore [i-1] {s} p6 (SHRQconst [8] w) x5:(MOVBstore [i-2] {s} p5 (SHRQconst [16] w) x4:(MOVBstore [i-3] {s} p4 (SHRQconst [24] w) x3:(MOVBstore [i-4] {s} p3 (SHRQconst [32] w) x2:(MOVBstore [i-5] {s} p2 (SHRQconst [40] w) x1:(MOVBstore [i-6] {s} p1 (SHRQconst [48] w) x0:(MOVBstore [i-7] {s} p0 (SHRQconst [56] w) mem))))))))
-	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && same(p3, p4, 1) && same(p4, p5, 1) && same(p5, p6, 1) && same(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)
-	// result: (MOVQstore [i-7] {s} p0 (BSWAPQ <w.Type> w) mem)
+	// match: (MOVBstore [i] {s} p3 w x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w) x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w) x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && clobber(x0, x1, x2)
+	// result: (MOVLstore [i] {s} p0 (BSWAPL <w.Type> w) mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p7 := v_0
+		p3 := v_0
+		w := v_1
+		x2 := v_2
+		if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i || x2.Aux != s {
+			break
+		}
+		_ = x2.Args[2]
+		p2 := x2.Args[0]
+		x2_1 := x2.Args[1]
+		if x2_1.Op != OpAMD64SHRLconst || x2_1.AuxInt != 8 || w != x2_1.Args[0] {
+			break
+		}
+		x1 := x2.Args[2]
+		if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i || x1.Aux != s {
+			break
+		}
+		_ = x1.Args[2]
+		p1 := x1.Args[0]
+		x1_1 := x1.Args[1]
+		if x1_1.Op != OpAMD64SHRLconst || x1_1.AuxInt != 16 || w != x1_1.Args[0] {
+			break
+		}
+		x0 := x1.Args[2]
+		if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
+			break
+		}
+		mem := x0.Args[2]
+		p0 := x0.Args[0]
+		x0_1 := x0.Args[1]
+		if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && clobber(x0, x1, x2)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i
+		v.Aux = s
+		v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, w.Type)
+		v0.AddArg(w)
+		v.AddArg3(p0, v0, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p w x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w) x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w) x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w) x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w) x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w) x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w) x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0, x1, x2, x3, x4, x5, x6)
+	// result: (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p := v_0
 		w := v_1
 		x6 := v_2
 		if x6.Op != OpAMD64MOVBstore || x6.AuxInt != i-1 || x6.Aux != s {
 			break
 		}
 		_ = x6.Args[2]
-		p6 := x6.Args[0]
+		if p != x6.Args[0] {
+			break
+		}
 		x6_1 := x6.Args[1]
 		if x6_1.Op != OpAMD64SHRQconst || x6_1.AuxInt != 8 || w != x6_1.Args[0] {
 			break
@@ -10934,7 +10788,9 @@
 			break
 		}
 		_ = x5.Args[2]
-		p5 := x5.Args[0]
+		if p != x5.Args[0] {
+			break
+		}
 		x5_1 := x5.Args[1]
 		if x5_1.Op != OpAMD64SHRQconst || x5_1.AuxInt != 16 || w != x5_1.Args[0] {
 			break
@@ -10944,7 +10800,9 @@
 			break
 		}
 		_ = x4.Args[2]
-		p4 := x4.Args[0]
+		if p != x4.Args[0] {
+			break
+		}
 		x4_1 := x4.Args[1]
 		if x4_1.Op != OpAMD64SHRQconst || x4_1.AuxInt != 24 || w != x4_1.Args[0] {
 			break
@@ -10954,7 +10812,9 @@
 			break
 		}
 		_ = x3.Args[2]
-		p3 := x3.Args[0]
+		if p != x3.Args[0] {
+			break
+		}
 		x3_1 := x3.Args[1]
 		if x3_1.Op != OpAMD64SHRQconst || x3_1.AuxInt != 32 || w != x3_1.Args[0] {
 			break
@@ -10964,7 +10824,9 @@
 			break
 		}
 		_ = x2.Args[2]
-		p2 := x2.Args[0]
+		if p != x2.Args[0] {
+			break
+		}
 		x2_1 := x2.Args[1]
 		if x2_1.Op != OpAMD64SHRQconst || x2_1.AuxInt != 40 || w != x2_1.Args[0] {
 			break
@@ -10974,7 +10836,9 @@
 			break
 		}
 		_ = x1.Args[2]
-		p1 := x1.Args[0]
+		if p != x1.Args[0] {
+			break
+		}
 		x1_1 := x1.Args[1]
 		if x1_1.Op != OpAMD64SHRQconst || x1_1.AuxInt != 48 || w != x1_1.Args[0] {
 			break
@@ -10984,9 +10848,11 @@
 			break
 		}
 		mem := x0.Args[2]
-		p0 := x0.Args[0]
+		if p != x0.Args[0] {
+			break
+		}
 		x0_1 := x0.Args[1]
-		if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && same(p3, p4, 1) && same(p4, p5, 1) && same(p5, p6, 1) && same(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)) {
+		if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0, x1, x2, x3, x4, x5, x6)) {
 			break
 		}
 		v.reset(OpAMD64MOVQstore)
@@ -10994,16 +10860,102 @@
 		v.Aux = s
 		v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, w.Type)
 		v0.AddArg(w)
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 (SHRWconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i-1] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p7 w x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w) x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w) x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w) x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w) x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w) x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w) x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && sequentialAddresses(p3, p4, 1) && sequentialAddresses(p4, p5, 1) && sequentialAddresses(p5, p6, 1) && sequentialAddresses(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)
+	// result: (MOVQstore [i] {s} p0 (BSWAPQ <w.Type> w) mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p7 := v_0
+		w := v_1
+		x6 := v_2
+		if x6.Op != OpAMD64MOVBstore || x6.AuxInt != i || x6.Aux != s {
+			break
+		}
+		_ = x6.Args[2]
+		p6 := x6.Args[0]
+		x6_1 := x6.Args[1]
+		if x6_1.Op != OpAMD64SHRQconst || x6_1.AuxInt != 8 || w != x6_1.Args[0] {
+			break
+		}
+		x5 := x6.Args[2]
+		if x5.Op != OpAMD64MOVBstore || x5.AuxInt != i || x5.Aux != s {
+			break
+		}
+		_ = x5.Args[2]
+		p5 := x5.Args[0]
+		x5_1 := x5.Args[1]
+		if x5_1.Op != OpAMD64SHRQconst || x5_1.AuxInt != 16 || w != x5_1.Args[0] {
+			break
+		}
+		x4 := x5.Args[2]
+		if x4.Op != OpAMD64MOVBstore || x4.AuxInt != i || x4.Aux != s {
+			break
+		}
+		_ = x4.Args[2]
+		p4 := x4.Args[0]
+		x4_1 := x4.Args[1]
+		if x4_1.Op != OpAMD64SHRQconst || x4_1.AuxInt != 24 || w != x4_1.Args[0] {
+			break
+		}
+		x3 := x4.Args[2]
+		if x3.Op != OpAMD64MOVBstore || x3.AuxInt != i || x3.Aux != s {
+			break
+		}
+		_ = x3.Args[2]
+		p3 := x3.Args[0]
+		x3_1 := x3.Args[1]
+		if x3_1.Op != OpAMD64SHRQconst || x3_1.AuxInt != 32 || w != x3_1.Args[0] {
+			break
+		}
+		x2 := x3.Args[2]
+		if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i || x2.Aux != s {
+			break
+		}
+		_ = x2.Args[2]
+		p2 := x2.Args[0]
+		x2_1 := x2.Args[1]
+		if x2_1.Op != OpAMD64SHRQconst || x2_1.AuxInt != 40 || w != x2_1.Args[0] {
+			break
+		}
+		x1 := x2.Args[2]
+		if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i || x1.Aux != s {
+			break
+		}
+		_ = x1.Args[2]
+		p1 := x1.Args[0]
+		x1_1 := x1.Args[1]
+		if x1_1.Op != OpAMD64SHRQconst || x1_1.AuxInt != 48 || w != x1_1.Args[0] {
+			break
+		}
+		x0 := x1.Args[2]
+		if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
+			break
+		}
+		mem := x0.Args[2]
+		p0 := x0.Args[0]
+		x0_1 := x0.Args[1]
+		if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && sequentialAddresses(p3, p4, 1) && sequentialAddresses(p4, p5, 1) && sequentialAddresses(p5, p6, 1) && sequentialAddresses(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)) {
+			break
+		}
+		v.reset(OpAMD64MOVQstore)
+		v.AuxInt = i
+		v.Aux = s
+		v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, w.Type)
+		v0.AddArg(w)
+		v.AddArg3(p0, v0, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p (SHRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i-1] {s} p w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p := v_0
 		if v_1.Op != OpAMD64SHRWconst || v_1.AuxInt != 8 {
 			break
 		}
@@ -11013,23 +10965,22 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i - 1
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 (SHRLconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i-1] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p (SHRLconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i-1] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 8 {
 			break
 		}
@@ -11039,23 +10990,22 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i - 1
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i-1] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i-1] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 8 {
 			break
 		}
@@ -11065,95 +11015,100 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i - 1
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRWconst [8] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRWconst [8] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		w := v_1
 		x := v_2
 		if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		x_1 := x.Args[1]
-		if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRLconst [8] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRLconst [8] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		w := v_1
 		x := v_2
 		if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		x_1 := x.Args[1]
-		if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRQconst [8] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i] {s} p0 w mem)
+	// match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRQconst [8] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		w := v_1
 		x := v_2
 		if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		x_1 := x.Args[1]
-		if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHRLconst [j-8] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i-1] {s} p0 w0 mem)
+	// match: (MOVBstore [i] {s} p (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRLconst [j-8] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i-1] {s} p w0 mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRLconst {
 			break
 		}
@@ -11164,24 +11119,26 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		w0 := x.Args[1]
-		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i - 1
 		v.Aux = s
-		v.AddArg3(p0, w0, mem)
+		v.AddArg3(p, w0, mem)
 		return true
 	}
-	// match: (MOVBstore [i] {s} p1 (SHRQconst [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHRQconst [j-8] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVWstore [i-1] {s} p0 w0 mem)
+	// match: (MOVBstore [i] {s} p (SHRQconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRQconst [j-8] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVWstore [i-1] {s} p w0 mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst {
 			break
 		}
@@ -11192,14 +11149,222 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		w0 := x.Args[1]
-		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstore)
 		v.AuxInt = i - 1
 		v.Aux = s
+		v.AddArg3(p, w0, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p1 (SHRWconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRWconst || v_1.AuxInt != 8 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p1 (SHRLconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 8 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p1 (SHRQconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 8 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRWconst [8] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p0 := v_0
+		w := v_1
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p1 := x.Args[0]
+		x_1 := x.Args[1]
+		if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRLconst [8] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p0 := v_0
+		w := v_1
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p1 := x.Args[0]
+		x_1 := x.Args[1]
+		if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRQconst [8] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p0 := v_0
+		w := v_1
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p1 := x.Args[0]
+		x_1 := x.Args[1]
+		if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRLconst [j-8] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRLconst {
+			break
+		}
+		j := v_1.AuxInt
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		w0 := x.Args[1]
+		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w0, mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p1 (SHRQconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRQconst [j-8] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+	// result: (MOVWstore [i] {s} p0 w0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		j := v_1.AuxInt
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		w0 := x.Args[1]
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVWstore)
+		v.AuxInt = i
+		v.Aux = s
 		v.AddArg3(p0, w0, mem)
 		return true
 	}
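
Note (context, not part of the patch): each merged-store rule above now has two variants. If both byte stores go through the same pointer value p, adjacency is proved by the AuxInt offsets; if they go through distinct pointer values p0 and p1, the new sequentialAddresses(p0, p1, 1) predicate proves p1 == p0+1 symbolically, without ever materializing p0+1 as a value (which is exactly the ephemeral out-of-bounds pointer this CL avoids). A sketch of the predicate's AMD64 cases, reconstructed from how the rules use it — the real helper lives in rewrite.go in package ssa and also handles the 386 ops, so treat the exact clauses as assumptions:

	// sequentialAddresses reports whether it can prove that x + n == y.
	// It recognizes address values built from the same two addends, so
	// adjacency is established structurally, without creating a new
	// pointer value.
	func sequentialAddresses(x, y *Value, n int64) bool {
		// y is a LEAQ1 whose addends match an ADDQ's addends (in
		// either order) and whose displacement is exactly n.
		if x.Op == OpAMD64ADDQ && y.Op == OpAMD64LEAQ1 && y.AuxInt == n && y.Aux == nil &&
			(x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
				x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
			return true
		}
		// Both are LEAQ1s over the same addends, with displacements
		// that differ by exactly n.
		if x.Op == OpAMD64LEAQ1 && y.Op == OpAMD64LEAQ1 && y.AuxInt == x.AuxInt+n && x.Aux == y.Aux &&
			(x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
				x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
			return true
		}
		return false
	}
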
@@ -11337,13 +11502,13 @@
 		v.AddArg2(ptr, mem)
 		return true
 	}
-	// match: (MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
+	// match: (MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
 	for {
 		c := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVBstoreconst {
 			break
@@ -11353,23 +11518,22 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstoreconst)
 		v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xff|ValAndOff(c).Val()<<8, ValAndOff(a).Off())
 		v.Aux = s
-		v.AddArg2(p0, mem)
+		v.AddArg2(p, mem)
 		return true
 	}
-	// match: (MOVBstoreconst [a] {s} p1 x:(MOVBstoreconst [c] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
+	// match: (MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
 	for {
 		a := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVBstoreconst {
 			break
@@ -11379,14 +11543,13 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVWstoreconst)
 		v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xff|ValAndOff(c).Val()<<8, ValAndOff(a).Off())
 		v.Aux = s
-		v.AddArg2(p0, mem)
+		v.AddArg2(p, mem)
 		return true
 	}
 	// match: (MOVBstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -11860,118 +12023,6 @@
 		v.AddArg2(base, mem)
 		return true
 	}
-	// match: (MOVLload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVLload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVLload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVLload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
 	// match: (MOVLload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
 	// cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
 	// result: (MOVLload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -12174,129 +12225,13 @@
 		v.AddArg3(base, val, mem)
 		return true
 	}
-	// match: (MOVLstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVLstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVLstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVLstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVLstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVLstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVQstore [i-4] {s} p0 w mem)
+	// match: (MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVQstore [i-4] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 32 {
 			break
 		}
@@ -12306,23 +12241,22 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVQstore)
 		v.AuxInt = i - 4
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p0 w0:(SHRQconst [j-32] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVQstore [i-4] {s} p0 w0 mem)
+	// match: (MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVQstore [i-4] {s} p w0 mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst {
 			break
 		}
@@ -12333,14 +12267,70 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		w0 := x.Args[1]
-		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVQstore)
 		v.AuxInt = i - 4
 		v.Aux = s
+		v.AddArg3(p, w0, mem)
+		return true
+	}
+	// match: (MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)
+	// result: (MOVQstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 32 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVLstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVQstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)
+	// result: (MOVQstore [i] {s} p0 w0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		j := v_1.AuxInt
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVLstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		w0 := x.Args[1]
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVQstore)
+		v.AuxInt = i
+		v.Aux = s
 		v.AddArg3(p0, w0, mem)
 		return true
 	}
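
Note (context): the same two-variant split applies to the MOVLstore pairs above — the [i]/[i-4] form keeps matching stores through a single pointer value, while the sequentialAddresses(p0, p1, 4) form handles stores whose addresses are distinct SSA values, as happens with a variable index. A hypothetical source pattern exercising the pointer-based variant (illustrative only, not taken from the patch):

	package demo

	import "encoding/binary"

	// put64 writes v as two 32-bit halves at b[i:] and b[i+4:].
	// With a variable index i, the two destination addresses are
	// distinct SSA values (p0 and p1 = p0+4), so merging the two
	// MOVLstores into one MOVQstore relies on
	// sequentialAddresses(p0, p1, 4) rather than on AuxInt offsets.
	func put64(b []byte, i int, v uint64) {
		binary.LittleEndian.PutUint32(b[i:], uint32(v))
		binary.LittleEndian.PutUint32(b[i+4:], uint32(v>>32))
	}
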
@@ -13018,13 +13008,13 @@
 		v.AddArg2(ptr, mem)
 		return true
 	}
-	// match: (MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+	// match: (MOVLstoreconst [c] {s} p x:(MOVLstoreconst [a] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
 	for {
 		c := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVLstoreconst {
 			break
@@ -13034,8 +13024,7 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVQstore)
@@ -13043,16 +13032,16 @@
 		v.Aux = s
 		v0 := b.NewValue0(x.Pos, OpAMD64MOVQconst, typ.UInt64)
 		v0.AuxInt = ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
-	// match: (MOVLstoreconst [a] {s} p1 x:(MOVLstoreconst [c] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+	// match: (MOVLstoreconst [a] {s} p x:(MOVLstoreconst [c] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
 	for {
 		a := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVLstoreconst {
 			break
@@ -13062,8 +13051,7 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVQstore)
@@ -13071,7 +13059,7 @@
 		v.Aux = s
 		v0 := b.NewValue0(x.Pos, OpAMD64MOVQconst, typ.UInt64)
 		v0.AuxInt = ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
 	// match: (MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -13423,118 +13411,6 @@
 		v.AddArg2(base, mem)
 		return true
 	}
-	// match: (MOVQload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVQload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVQload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVQload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
 	// match: (MOVQload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
 	// cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
 	// result: (MOVQload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -13614,7 +13490,6 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
-	b := v.Block
 	// match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem)
 	// cond: is32Bit(off1+off2)
 	// result: (MOVQstore [off1+off2] {sym} ptr val mem)
@@ -13681,122 +13556,6 @@
 		v.AddArg3(base, val, mem)
 		return true
 	}
-	// match: (MOVQstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVQstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVQstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVQstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVQstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVQstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
 	// match: (MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
 	// cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
 	// result: (MOVQstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
@@ -14430,13 +14189,13 @@
 		v.AddArg2(ptr, mem)
 		return true
 	}
-	// match: (MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [c2] {s} p0 mem))
-	// cond: config.useSSE && x.Uses == 1 && same(p0, p1, 1) && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)
-	// result: (MOVOstore [ValAndOff(c2).Off()] {s} p0 (MOVOconst [0]) mem)
+	// match: (MOVQstoreconst [c] {s} p x:(MOVQstoreconst [c2] {s} p mem))
+	// cond: config.useSSE && x.Uses == 1 && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)
+	// result: (MOVOstore [ValAndOff(c2).Off()] {s} p (MOVOconst [0]) mem)
 	for {
 		c := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVQstoreconst {
 			break
@@ -14446,8 +14205,7 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(config.useSSE && x.Uses == 1 && same(p0, p1, 1) && ValAndOff(c2).Off()+8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)) {
+		if p != x.Args[0] || !(config.useSSE && x.Uses == 1 && ValAndOff(c2).Off()+8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVOstore)
@@ -14455,7 +14213,7 @@
 		v.Aux = s
 		v0 := b.NewValue0(x.Pos, OpAMD64MOVOconst, types.TypeInt128)
 		v0.AuxInt = 0
-		v.AddArg3(p0, v0, mem)
+		v.AddArg3(p, v0, mem)
 		return true
 	}
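
Note (context): the MOVQstoreconst rule above still merges two adjacent zeroing stores through one pointer value into a single 16-byte SSE store; only the same(p0, p1, 1) check is dropped in favor of requiring the identical pointer. A hypothetical source pattern it could target (illustrative; whether a given snippet compiles to exactly two MOVQstoreconst ops is an assumption):

	package demo

	type pair struct{ a, b uint64 }

	// Zeroing both fields emits two MOVQstoreconst [0] stores at
	// offsets 0 and 8 off the same pointer; with config.useSSE they
	// merge into one MOVOstore of a zeroed 128-bit register.
	func clear(p *pair) {
		p.a = 0
		p.b = 0
	}
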
 	// match: (MOVQstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -15119,118 +14877,6 @@
 		v.AddArg2(base, mem)
 		return true
 	}
-	// match: (MOVWload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVWload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVWload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
-	// match: (MOVWload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		mem := v_1
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWload)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg2(v0, mem)
-		return true
-	}
 	// match: (MOVWload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
 	// cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
 	// result: (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -15416,129 +15062,13 @@
 		v.AddArg3(base, val, mem)
 		return true
 	}
-	// match: (MOVWstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ1 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVWstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ2 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVWstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ4 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVWstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-	// cond: i1 != 0 && is32Bit(i0+i1)
-	// result: (MOVWstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-	for {
-		i0 := v.AuxInt
-		s0 := v.Aux
-		l := v_0
-		if l.Op != OpAMD64LEAQ8 {
-			break
-		}
-		i1 := l.AuxInt
-		s1 := l.Aux
-		y := l.Args[1]
-		x := l.Args[0]
-		val := v_1
-		mem := v_2
-		if !(i1 != 0 && is32Bit(i0+i1)) {
-			break
-		}
-		v.reset(OpAMD64MOVWstore)
-		v.AuxInt = i0 + i1
-		v.Aux = s0
-		v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-		v0.AuxInt = 0
-		v0.Aux = s1
-		v0.AddArg2(x, y)
-		v.AddArg3(v0, val, mem)
-		return true
-	}
-	// match: (MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVLstore [i-2] {s} p0 w mem)
+	// match: (MOVWstore [i] {s} p (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVLstore [i-2] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 16 {
 			break
 		}
@@ -15548,23 +15078,22 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstore)
 		v.AuxInt = i - 2
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVWstore [i] {s} p1 (SHRQconst [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVLstore [i-2] {s} p0 w mem)
+	// match: (MOVWstore [i] {s} p (SHRQconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVLstore [i-2] {s} p w mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 16 {
 			break
 		}
@@ -15574,23 +15103,22 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
-		if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstore)
 		v.AuxInt = i - 2
 		v.Aux = s
-		v.AddArg3(p0, w, mem)
+		v.AddArg3(p, w, mem)
 		return true
 	}
-	// match: (MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHRLconst [j-16] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVLstore [i-2] {s} p0 w0 mem)
+	// match: (MOVWstore [i] {s} p (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRLconst [j-16] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVLstore [i-2] {s} p w0 mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRLconst {
 			break
 		}
@@ -15601,24 +15129,26 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		w0 := x.Args[1]
-		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstore)
 		v.AuxInt = i - 2
 		v.Aux = s
-		v.AddArg3(p0, w0, mem)
+		v.AddArg3(p, w0, mem)
 		return true
 	}
-	// match: (MOVWstore [i] {s} p1 (SHRQconst [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHRQconst [j-16] w) mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-	// result: (MOVLstore [i-2] {s} p0 w0 mem)
+	// match: (MOVWstore [i] {s} p (SHRQconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRQconst [j-16] w) mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: (MOVLstore [i-2] {s} p w0 mem)
 	for {
 		i := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		if v_1.Op != OpAMD64SHRQconst {
 			break
 		}
@@ -15629,14 +15159,124 @@
 			break
 		}
 		mem := x.Args[2]
-		p0 := x.Args[0]
+		if p != x.Args[0] {
+			break
+		}
 		w0 := x.Args[1]
-		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstore)
 		v.AuxInt = i - 2
 		v.Aux = s
+		v.AddArg3(p, w0, mem)
+		return true
+	}
+	// match: (MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+	// result: (MOVLstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 16 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVWstore [i] {s} p1 (SHRQconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+	// result: (MOVLstore [i] {s} p0 w mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 16 {
+			break
+		}
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w, mem)
+		return true
+	}
+	// match: (MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRLconst [j-16] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+	// result: (MOVLstore [i] {s} p0 w0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRLconst {
+			break
+		}
+		j := v_1.AuxInt
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		w0 := x.Args[1]
+		if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i
+		v.Aux = s
+		v.AddArg3(p0, w0, mem)
+		return true
+	}
+	// match: (MOVWstore [i] {s} p1 (SHRQconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRQconst [j-16] w) mem))
+	// cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+	// result: (MOVLstore [i] {s} p0 w0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p1 := v_0
+		if v_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		j := v_1.AuxInt
+		w := v_1.Args[0]
+		x := v_2
+		if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
+			break
+		}
+		mem := x.Args[2]
+		p0 := x.Args[0]
+		w0 := x.Args[1]
+		if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i
+		v.Aux = s
 		v.AddArg3(p0, w0, mem)
 		return true
 	}
@@ -15774,13 +15414,13 @@
 		v.AddArg2(ptr, mem)
 		return true
 	}
-	// match: (MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
+	// match: (MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
 	for {
 		c := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVWstoreconst {
 			break
@@ -15790,23 +15430,22 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstoreconst)
 		v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xffff|ValAndOff(c).Val()<<16, ValAndOff(a).Off())
 		v.Aux = s
-		v.AddArg2(p0, mem)
+		v.AddArg2(p, mem)
 		return true
 	}
-	// match: (MOVWstoreconst [a] {s} p1 x:(MOVWstoreconst [c] {s} p0 mem))
-	// cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
-	// result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
+	// match: (MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
+	// cond: x.Uses == 1 && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
+	// result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
 	for {
 		a := v.AuxInt
 		s := v.Aux
-		p1 := v_0
+		p := v_0
 		x := v_1
 		if x.Op != OpAMD64MOVWstoreconst {
 			break
@@ -15816,14 +15455,13 @@
 			break
 		}
 		mem := x.Args[1]
-		p0 := x.Args[0]
-		if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
+		if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
 			break
 		}
 		v.reset(OpAMD64MOVLstoreconst)
 		v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xffff|ValAndOff(c).Val()<<16, ValAndOff(a).Off())
 		v.Aux = s
-		v.AddArg2(p0, mem)
+		v.AddArg2(p, mem)
 		return true
 	}
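
Note (context): the ORL/ORQ hunks below rework the little-endian load-combining rules in the same way — the single-pointer form keeps the i1 == i0+width offset check, and a new equal-offset form uses sequentialAddresses(p0, p1, width) instead. A hypothetical byte-assembly pattern the pointer-based ORL rule targets (illustrative):

	package demo

	// read16 assembles a uint16 from b[i] and b[i+1]. The two byte
	// addresses are distinct SSA values (p0 and p1 = p0+1), so the
	// ORL rule with sequentialAddresses(p0, p1, 1) can combine the
	// loads into a single MOVWload.
	func read16(b []byte, i int) uint16 {
		return uint16(b[i]) | uint16(b[i+1])<<8
	}
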
 	// match: (MOVWstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -17887,9 +17525,9 @@
 		v.copyOf(x)
 		return true
 	}
-	// match: (ORL x0:(MOVBload [i0] {s} p0 mem) sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+	// match: (ORL x0:(MOVBload [i0] {s} p mem) sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p mem)))
+	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x0 := v_0
@@ -17899,7 +17537,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
 				continue
@@ -17913,8 +17551,7 @@
 				continue
 			}
 			_ = x1.Args[1]
-			p1 := x1.Args[0]
-			if mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -17922,14 +17559,50 @@
 			v.copyOf(v0)
 			v0.AuxInt = i0
 			v0.Aux = s
+			v0.AddArg2(p, mem)
+			return true
+		}
+		break
+	}
+	// match: (ORL x0:(MOVBload [i] {s} p0 mem) sh:(SHLLconst [8] x1:(MOVBload [i] {s} p1 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x0 := v_0
+			if x0.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
+				continue
+			}
+			x1 := sh.Args[0]
+			if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+				continue
+			}
+			_ = x1.Args[1]
+			p1 := x1.Args[0]
+			if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+			v.copyOf(v0)
+			v0.AuxInt = i
+			v0.Aux = s
 			v0.AddArg2(p0, mem)
 			return true
 		}
 		break
 	}
-	// match: (ORL x0:(MOVWload [i0] {s} p0 mem) sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+	// match: (ORL x0:(MOVWload [i0] {s} p mem) sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p mem)))
+	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x0 := v_0
@@ -17939,7 +17612,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
 				continue
@@ -17953,8 +17626,7 @@
 				continue
 			}
 			_ = x1.Args[1]
-			p1 := x1.Args[0]
-			if mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -17962,14 +17634,50 @@
 			v.copyOf(v0)
 			v0.AuxInt = i0
 			v0.Aux = s
+			v0.AddArg2(p, mem)
+			return true
+		}
+		break
+	}
+	// match: (ORL x0:(MOVWload [i] {s} p0 mem) sh:(SHLLconst [16] x1:(MOVWload [i] {s} p1 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x0 := v_0
+			if x0.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
+				continue
+			}
+			x1 := sh.Args[0]
+			if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+				continue
+			}
+			_ = x1.Args[1]
+			p1 := x1.Args[0]
+			if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+			v.copyOf(v0)
+			v0.AuxInt = i
+			v0.Aux = s
 			v0.AddArg2(p0, mem)
 			return true
 		}
 		break
 	}
-	// match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p0 mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p1 mem)) y))
-	// cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+	// match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p mem)) y))
+	// cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s1 := v_0
@@ -17984,7 +17692,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORL {
 				continue
@@ -18007,12 +17715,11 @@
 					continue
 				}
 				_ = x0.Args[1]
-				p1 := x0.Args[0]
-				if mem != x0.Args[1] {
+				if p != x0.Args[0] || mem != x0.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+				if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -18023,6 +17730,66 @@
 				v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
 				v2.AuxInt = i0
 				v2.Aux = s
+				v2.AddArg2(p, mem)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i] {s} p1 mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i] {s} p0 mem)) y))
+	// cond: j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s1 := v_0
+			if s1.Op != OpAMD64SHLLconst {
+				continue
+			}
+			j1 := s1.AuxInt
+			x1 := s1.Args[0]
+			if x1.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORL {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s0 := or_0
+				if s0.Op != OpAMD64SHLLconst {
+					continue
+				}
+				j0 := s0.AuxInt
+				x0 := s0.Args[0]
+				if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+					continue
+				}
+				_ = x0.Args[1]
+				p0 := x0.Args[0]
+				if mem != x0.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x0.Pos, OpAMD64ORL, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x0.Pos, OpAMD64SHLLconst, v.Type)
+				v1.AuxInt = j0
+				v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+				v2.AuxInt = i
+				v2.Aux = s
 				v2.AddArg2(p0, mem)
 				v1.AddArg(v2)
 				v0.AddArg2(v1, y)
@@ -18031,9 +17798,9 @@
 		}
 		break
 	}
-	// match: (ORL x1:(MOVBload [i1] {s} p0 mem) sh:(SHLLconst [8] x0:(MOVBload [i0] {s} p1 mem)))
-	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+	// match: (ORL x1:(MOVBload [i1] {s} p mem) sh:(SHLLconst [8] x0:(MOVBload [i0] {s} p mem)))
+	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x1 := v_0
@@ -18043,7 +17810,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
 				continue
@@ -18057,8 +17824,7 @@
 				continue
 			}
 			_ = x0.Args[1]
-			p1 := x0.Args[0]
-			if mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -18068,15 +17834,54 @@
 			v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
 			v1.AuxInt = i0
 			v1.Aux = s
+			v1.AddArg2(p, mem)
+			v0.AddArg(v1)
+			return true
+		}
+		break
+	}
+	// match: (ORL x1:(MOVBload [i] {s} p1 mem) sh:(SHLLconst [8] x0:(MOVBload [i] {s} p0 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x1 := v_0
+			if x1.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
+				continue
+			}
+			x0 := sh.Args[0]
+			if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+				continue
+			}
+			_ = x0.Args[1]
+			p0 := x0.Args[0]
+			if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, v.Type)
+			v.copyOf(v0)
+			v0.AuxInt = 8
+			v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+			v1.AuxInt = i
+			v1.Aux = s
 			v1.AddArg2(p0, mem)
 			v0.AddArg(v1)
 			return true
 		}
 		break
 	}
-	// match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+	// match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			r1 := v_0
@@ -18090,7 +17895,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
 				continue
@@ -18108,8 +17913,7 @@
 				continue
 			}
 			_ = x0.Args[1]
-			p1 := x0.Args[0]
-			if mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+			if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -18118,15 +17922,61 @@
 			v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
 			v1.AuxInt = i0
 			v1.Aux = s
+			v1.AddArg2(p, mem)
+			v0.AddArg(v1)
+			return true
+		}
+		break
+	}
+	// match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			r1 := v_0
+			if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+				continue
+			}
+			x1 := r1.Args[0]
+			if x1.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
+				continue
+			}
+			r0 := sh.Args[0]
+			if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+				continue
+			}
+			x0 := r0.Args[0]
+			if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
+				continue
+			}
+			_ = x0.Args[1]
+			p0 := x0.Args[0]
+			if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, v.Type)
+			v.copyOf(v0)
+			v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
+			v1.AuxInt = i
+			v1.Aux = s
 			v1.AddArg2(p0, mem)
 			v0.AddArg(v1)
 			return true
 		}
 		break
 	}
-	// match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p0 mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p1 mem)) y))
-	// cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+	// match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p mem)) y))
+	// cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s0 := v_0
@@ -18141,7 +17991,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORL {
 				continue
@@ -18164,12 +18014,11 @@
 					continue
 				}
 				_ = x1.Args[1]
-				p1 := x1.Args[0]
-				if mem != x1.Args[1] {
+				if p != x1.Args[0] || mem != x1.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+				if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -18182,6 +18031,69 @@
 				v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
 				v3.AuxInt = i0
 				v3.Aux = s
+				v3.AddArg2(p, mem)
+				v2.AddArg(v3)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i] {s} p0 mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i] {s} p1 mem)) y))
+	// cond: j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s0 := v_0
+			if s0.Op != OpAMD64SHLLconst {
+				continue
+			}
+			j0 := s0.AuxInt
+			x0 := s0.Args[0]
+			if x0.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORL {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s1 := or_0
+				if s1.Op != OpAMD64SHLLconst {
+					continue
+				}
+				j1 := s1.AuxInt
+				x1 := s1.Args[0]
+				if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+					continue
+				}
+				_ = x1.Args[1]
+				p1 := x1.Args[0]
+				if mem != x1.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x1.Pos, OpAMD64ORL, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x1.Pos, OpAMD64SHLLconst, v.Type)
+				v1.AuxInt = j1
+				v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
+				v2.AuxInt = 8
+				v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+				v3.AuxInt = i
+				v3.Aux = s
 				v3.AddArg2(p0, mem)
 				v2.AddArg(v3)
 				v1.AddArg(v2)
@@ -18804,9 +18716,9 @@
 		v.copyOf(x)
 		return true
 	}
-	// match: (ORQ x0:(MOVBload [i0] {s} p0 mem) sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+	// match: (ORQ x0:(MOVBload [i0] {s} p mem) sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p mem)))
+	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x0 := v_0
@@ -18816,7 +18728,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
 				continue
@@ -18830,8 +18742,7 @@
 				continue
 			}
 			_ = x1.Args[1]
-			p1 := x1.Args[0]
-			if mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -18839,14 +18750,50 @@
 			v.copyOf(v0)
 			v0.AuxInt = i0
 			v0.Aux = s
+			v0.AddArg2(p, mem)
+			return true
+		}
+		break
+	}
+	// match: (ORQ x0:(MOVBload [i] {s} p0 mem) sh:(SHLQconst [8] x1:(MOVBload [i] {s} p1 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x0 := v_0
+			if x0.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
+				continue
+			}
+			x1 := sh.Args[0]
+			if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+				continue
+			}
+			_ = x1.Args[1]
+			p1 := x1.Args[0]
+			if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+			v.copyOf(v0)
+			v0.AuxInt = i
+			v0.Aux = s
 			v0.AddArg2(p0, mem)
 			return true
 		}
 		break
 	}
-	// match: (ORQ x0:(MOVWload [i0] {s} p0 mem) sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+	// match: (ORQ x0:(MOVWload [i0] {s} p mem) sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p mem)))
+	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x0 := v_0
@@ -18856,7 +18803,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
 				continue
@@ -18870,8 +18817,7 @@
 				continue
 			}
 			_ = x1.Args[1]
-			p1 := x1.Args[0]
-			if mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -18879,14 +18825,50 @@
 			v.copyOf(v0)
 			v0.AuxInt = i0
 			v0.Aux = s
+			v0.AddArg2(p, mem)
+			return true
+		}
+		break
+	}
+	// match: (ORQ x0:(MOVWload [i] {s} p0 mem) sh:(SHLQconst [16] x1:(MOVWload [i] {s} p1 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x0 := v_0
+			if x0.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
+				continue
+			}
+			x1 := sh.Args[0]
+			if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+				continue
+			}
+			_ = x1.Args[1]
+			p1 := x1.Args[0]
+			if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+			v.copyOf(v0)
+			v0.AuxInt = i
+			v0.Aux = s
 			v0.AddArg2(p0, mem)
 			return true
 		}
 		break
 	}
-	// match: (ORQ x0:(MOVLload [i0] {s} p0 mem) sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p1 mem)))
-	// cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (MOVQload [i0] {s} p0 mem)
+	// match: (ORQ x0:(MOVLload [i0] {s} p mem) sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
+	// cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x0 := v_0
@@ -18896,7 +18878,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
 				continue
@@ -18910,8 +18892,7 @@
 				continue
 			}
 			_ = x1.Args[1]
-			p1 := x1.Args[0]
-			if mem != x1.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -18919,14 +18900,50 @@
 			v.copyOf(v0)
 			v0.AuxInt = i0
 			v0.Aux = s
+			v0.AddArg2(p, mem)
+			return true
+		}
+		break
+	}
+	// match: (ORQ x0:(MOVLload [i] {s} p0 mem) sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x0 := v_0
+			if x0.Op != OpAMD64MOVLload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
+				continue
+			}
+			x1 := sh.Args[0]
+			if x1.Op != OpAMD64MOVLload || x1.AuxInt != i || x1.Aux != s {
+				continue
+			}
+			_ = x1.Args[1]
+			p1 := x1.Args[0]
+			if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x1.Pos, OpAMD64MOVQload, typ.UInt64)
+			v.copyOf(v0)
+			v0.AuxInt = i
+			v0.Aux = s
 			v0.AddArg2(p0, mem)
 			return true
 		}
 		break
 	}
-	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p0 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p1 mem)) y))
-	// cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p mem)) y))
+	// cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s1 := v_0
@@ -18941,7 +18958,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORQ {
 				continue
@@ -18964,12 +18981,11 @@
 					continue
 				}
 				_ = x0.Args[1]
-				p1 := x0.Args[0]
-				if mem != x0.Args[1] {
+				if p != x0.Args[0] || mem != x0.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+				if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -18980,6 +18996,66 @@
 				v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
 				v2.AuxInt = i0
 				v2.Aux = s
+				v2.AddArg2(p, mem)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i] {s} p1 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i] {s} p0 mem)) y))
+	// cond: j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s1 := v_0
+			if s1.Op != OpAMD64SHLQconst {
+				continue
+			}
+			j1 := s1.AuxInt
+			x1 := s1.Args[0]
+			if x1.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORQ {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s0 := or_0
+				if s0.Op != OpAMD64SHLQconst {
+					continue
+				}
+				j0 := s0.AuxInt
+				x0 := s0.Args[0]
+				if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+					continue
+				}
+				_ = x0.Args[1]
+				p0 := x0.Args[0]
+				if mem != x0.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x0.Pos, OpAMD64ORQ, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
+				v1.AuxInt = j0
+				v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+				v2.AuxInt = i
+				v2.Aux = s
 				v2.AddArg2(p0, mem)
 				v1.AddArg(v2)
 				v0.AddArg2(v1, y)
@@ -18988,9 +19064,9 @@
 		}
 		break
 	}
-	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p0 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p1 mem)) y))
-	// cond: i1 == i0+2 && j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p0 mem)) y)
+	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem)) y))
+	// cond: i1 == i0+2 && j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s1 := v_0
@@ -19005,7 +19081,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORQ {
 				continue
@@ -19028,12 +19104,11 @@
 					continue
 				}
 				_ = x0.Args[1]
-				p1 := x0.Args[0]
-				if mem != x0.Args[1] {
+				if p != x0.Args[0] || mem != x0.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+2 && j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+				if !(i1 == i0+2 && j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -19044,6 +19119,66 @@
 				v2 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
 				v2.AuxInt = i0
 				v2.Aux = s
+				v2.AddArg2(p, mem)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem)) y))
+	// cond: j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s1 := v_0
+			if s1.Op != OpAMD64SHLQconst {
+				continue
+			}
+			j1 := s1.AuxInt
+			x1 := s1.Args[0]
+			if x1.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORQ {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s0 := or_0
+				if s0.Op != OpAMD64SHLQconst {
+					continue
+				}
+				j0 := s0.AuxInt
+				x0 := s0.Args[0]
+				if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
+					continue
+				}
+				_ = x0.Args[1]
+				p0 := x0.Args[0]
+				if mem != x0.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x0.Pos, OpAMD64ORQ, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
+				v1.AuxInt = j0
+				v2 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
+				v2.AuxInt = i
+				v2.Aux = s
 				v2.AddArg2(p0, mem)
 				v1.AddArg(v2)
 				v0.AddArg2(v1, y)
@@ -19052,9 +19187,9 @@
 		}
 		break
 	}
-	// match: (ORQ x1:(MOVBload [i1] {s} p0 mem) sh:(SHLQconst [8] x0:(MOVBload [i0] {s} p1 mem)))
-	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+	// match: (ORQ x1:(MOVBload [i1] {s} p mem) sh:(SHLQconst [8] x0:(MOVBload [i0] {s} p mem)))
+	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x1 := v_0
@@ -19064,7 +19199,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
 				continue
@@ -19078,8 +19213,7 @@
 				continue
 			}
 			_ = x0.Args[1]
-			p1 := x0.Args[0]
-			if mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+			if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -19089,15 +19223,54 @@
 			v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
 			v1.AuxInt = i0
 			v1.Aux = s
+			v1.AddArg2(p, mem)
+			v0.AddArg(v1)
+			return true
+		}
+		break
+	}
+	// match: (ORQ x1:(MOVBload [i] {s} p1 mem) sh:(SHLQconst [8] x0:(MOVBload [i] {s} p0 mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+	// result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x1 := v_0
+			if x1.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
+				continue
+			}
+			x0 := sh.Args[0]
+			if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+				continue
+			}
+			_ = x0.Args[1]
+			p0 := x0.Args[0]
+			if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, v.Type)
+			v.copyOf(v0)
+			v0.AuxInt = 8
+			v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+			v1.AuxInt = i
+			v1.Aux = s
 			v1.AddArg2(p0, mem)
 			v0.AddArg(v1)
 			return true
 		}
 		break
 	}
-	// match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+	// match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+	// cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			r1 := v_0
@@ -19111,7 +19284,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
 				continue
@@ -19129,8 +19302,7 @@
 				continue
 			}
 			_ = x0.Args[1]
-			p1 := x0.Args[0]
-			if mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+			if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -19139,15 +19311,61 @@
 			v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
 			v1.AuxInt = i0
 			v1.Aux = s
+			v1.AddArg2(p, mem)
+			v0.AddArg(v1)
+			return true
+		}
+		break
+	}
+	// match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			r1 := v_0
+			if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+				continue
+			}
+			x1 := r1.Args[0]
+			if x1.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
+				continue
+			}
+			r0 := sh.Args[0]
+			if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+				continue
+			}
+			x0 := r0.Args[0]
+			if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
+				continue
+			}
+			_ = x0.Args[1]
+			p0 := x0.Args[0]
+			if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, v.Type)
+			v.copyOf(v0)
+			v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
+			v1.AuxInt = i
+			v1.Aux = s
 			v1.AddArg2(p0, mem)
 			v0.AddArg(v1)
 			return true
 		}
 		break
 	}
-	// match: (ORQ r1:(BSWAPL x1:(MOVLload [i1] {s} p0 mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p1 mem))))
-	// cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-	// result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p0 mem))
+	// match: (ORQ r1:(BSWAPL x1:(MOVLload [i1] {s} p mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
+	// cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			r1 := v_0
@@ -19161,7 +19379,7 @@
 			i1 := x1.AuxInt
 			s := x1.Aux
 			mem := x1.Args[1]
-			p0 := x1.Args[0]
+			p := x1.Args[0]
 			sh := v_1
 			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
 				continue
@@ -19179,8 +19397,7 @@
 				continue
 			}
 			_ = x0.Args[1]
-			p1 := x0.Args[0]
-			if mem != x0.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+			if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
 				continue
 			}
 			b = mergePoint(b, x0, x1)
@@ -19189,15 +19406,61 @@
 			v1 := b.NewValue0(x0.Pos, OpAMD64MOVQload, typ.UInt64)
 			v1.AuxInt = i0
 			v1.Aux = s
+			v1.AddArg2(p, mem)
+			v0.AddArg(v1)
+			return true
+		}
+		break
+	}
+	// match: (ORQ r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+	// result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			r1 := v_0
+			if r1.Op != OpAMD64BSWAPL {
+				continue
+			}
+			x1 := r1.Args[0]
+			if x1.Op != OpAMD64MOVLload {
+				continue
+			}
+			i := x1.AuxInt
+			s := x1.Aux
+			mem := x1.Args[1]
+			p1 := x1.Args[0]
+			sh := v_1
+			if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
+				continue
+			}
+			r0 := sh.Args[0]
+			if r0.Op != OpAMD64BSWAPL {
+				continue
+			}
+			x0 := r0.Args[0]
+			if x0.Op != OpAMD64MOVLload || x0.AuxInt != i || x0.Aux != s {
+				continue
+			}
+			_ = x0.Args[1]
+			p0 := x0.Args[0]
+			if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+				continue
+			}
+			b = mergePoint(b, x0, x1)
+			v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, v.Type)
+			v.copyOf(v0)
+			v1 := b.NewValue0(x0.Pos, OpAMD64MOVQload, typ.UInt64)
+			v1.AuxInt = i
+			v1.Aux = s
 			v1.AddArg2(p0, mem)
 			v0.AddArg(v1)
 			return true
 		}
 		break
 	}
-	// match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p0 mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p1 mem)) y))
-	// cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+	// match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p mem)) y))
+	// cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s0 := v_0
@@ -19212,7 +19475,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORQ {
 				continue
@@ -19235,12 +19498,11 @@
 					continue
 				}
 				_ = x1.Args[1]
-				p1 := x1.Args[0]
-				if mem != x1.Args[1] {
+				if p != x1.Args[0] || mem != x1.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+				if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -19253,6 +19515,69 @@
 				v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
 				v3.AuxInt = i0
 				v3.Aux = s
+				v3.AddArg2(p, mem)
+				v2.AddArg(v3)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i] {s} p0 mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i] {s} p1 mem)) y))
+	// cond: j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s0 := v_0
+			if s0.Op != OpAMD64SHLQconst {
+				continue
+			}
+			j0 := s0.AuxInt
+			x0 := s0.Args[0]
+			if x0.Op != OpAMD64MOVBload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORQ {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s1 := or_0
+				if s1.Op != OpAMD64SHLQconst {
+					continue
+				}
+				j1 := s1.AuxInt
+				x1 := s1.Args[0]
+				if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+					continue
+				}
+				_ = x1.Args[1]
+				p1 := x1.Args[0]
+				if mem != x1.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x1.Pos, OpAMD64ORQ, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x1.Pos, OpAMD64SHLQconst, v.Type)
+				v1.AuxInt = j1
+				v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
+				v2.AuxInt = 8
+				v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+				v3.AuxInt = i
+				v3.Aux = s
 				v3.AddArg2(p0, mem)
 				v2.AddArg(v3)
 				v1.AddArg(v2)
@@ -19262,9 +19587,9 @@
 		}
 		break
 	}
-	// match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p0 mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p1 mem))) y))
-	// cond: i1 == i0+2 && j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
-	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p0 mem))) y)
+	// match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))) y))
+	// cond: i1 == i0+2 && j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			s0 := v_0
@@ -19283,7 +19608,7 @@
 			i0 := x0.AuxInt
 			s := x0.Aux
 			mem := x0.Args[1]
-			p0 := x0.Args[0]
+			p := x0.Args[0]
 			or := v_1
 			if or.Op != OpAMD64ORQ {
 				continue
@@ -19310,12 +19635,11 @@
 					continue
 				}
 				_ = x1.Args[1]
-				p1 := x1.Args[0]
-				if mem != x1.Args[1] {
+				if p != x1.Args[0] || mem != x1.Args[1] {
 					continue
 				}
 				y := or_1
-				if !(i1 == i0+2 && j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
+				if !(i1 == i0+2 && j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
 					continue
 				}
 				b = mergePoint(b, x0, x1, y)
@@ -19327,6 +19651,76 @@
 				v3 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
 				v3.AuxInt = i0
 				v3.Aux = s
+				v3.AddArg2(p, mem)
+				v2.AddArg(v3)
+				v1.AddArg(v2)
+				v0.AddArg2(v1, y)
+				return true
+			}
+		}
+		break
+	}
+	// match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))) y))
+	// cond: j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
+	// result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			s0 := v_0
+			if s0.Op != OpAMD64SHLQconst {
+				continue
+			}
+			j0 := s0.AuxInt
+			r0 := s0.Args[0]
+			if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+				continue
+			}
+			x0 := r0.Args[0]
+			if x0.Op != OpAMD64MOVWload {
+				continue
+			}
+			i := x0.AuxInt
+			s := x0.Aux
+			mem := x0.Args[1]
+			p0 := x0.Args[0]
+			or := v_1
+			if or.Op != OpAMD64ORQ {
+				continue
+			}
+			_ = or.Args[1]
+			or_0 := or.Args[0]
+			or_1 := or.Args[1]
+			for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+				s1 := or_0
+				if s1.Op != OpAMD64SHLQconst {
+					continue
+				}
+				j1 := s1.AuxInt
+				r1 := s1.Args[0]
+				if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+					continue
+				}
+				x1 := r1.Args[0]
+				if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+					continue
+				}
+				_ = x1.Args[1]
+				p1 := x1.Args[0]
+				if mem != x1.Args[1] {
+					continue
+				}
+				y := or_1
+				if !(j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
+					continue
+				}
+				b = mergePoint(b, x0, x1, y)
+				v0 := b.NewValue0(x1.Pos, OpAMD64ORQ, v.Type)
+				v.copyOf(v0)
+				v1 := b.NewValue0(x1.Pos, OpAMD64SHLQconst, v.Type)
+				v1.AuxInt = j1
+				v2 := b.NewValue0(x1.Pos, OpAMD64BSWAPL, typ.UInt32)
+				v3 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+				v3.AuxInt = i
+				v3.Aux = s
 				v3.AddArg2(p0, mem)
 				v2.AddArg(v3)
 				v1.AddArg(v2)
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go
index e2d703c..6ad9514 100644
--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -160,14 +160,14 @@
 
 func load_be_byte2_uint16(s []byte) uint16 {
 	// arm64:`MOVHU\t\(R[0-9]+\)`,`REV16W`,-`ORR`,-`MOVB`
-	// amd64:`MOVWLZX\s\([A-Z]+\)`,-`MOVB`,-`OR`
+	// amd64:`MOVWLZX\s\([A-Z]+\)`,`ROLW`,-`MOVB`,-`OR`
 	// ppc64le:`MOVHBR\t\(R[0-9]+\)`,-`MOVBZ`
 	return uint16(s[0])<<8 | uint16(s[1])
 }
 
 func load_be_byte2_uint16_inv(s []byte) uint16 {
 	// arm64:`MOVHU\t\(R[0-9]+\)`,`REV16W`,-`ORR`,-`MOVB`
-	// amd64:`MOVWLZX\s\([A-Z]+\)`,-`MOVB`,-`OR`
+	// amd64:`MOVWLZX\s\([A-Z]+\)`,`ROLW`,-`MOVB`,-`OR`
 	// ppc64le:`MOVHBR\t\(R[0-9]+\)`,-`MOVBZ`
 	return uint16(s[1]) | uint16(s[0])<<8
 }
@@ -179,7 +179,7 @@
 
 func load_be_byte4_uint32_inv(s []byte) uint32 {
 	// arm64:`MOVWU\t\(R[0-9]+\)`,`REVW`,-`ORR`,-`REV16W`,-`MOV[BH]`
-	// amd64:`MOVL\s\([A-Z]+\)`,-`MOV[BW]`,-`OR`
+	// amd64:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
 	return uint32(s[3]) | uint32(s[2])<<8 | uint32(s[1])<<16 | uint32(s[0])<<24
 }
 
@@ -191,7 +191,7 @@
 
 func load_be_byte8_uint64_inv(s []byte) uint64 {
 	// arm64:`MOVD\t\(R[0-9]+\)`,`REV`,-`ORR`,-`REVW`,-`REV16W`,-`MOV[BHW]`
-	// amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,-`MOV[BWL]\t[^$]`,-`OR`
+	// amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
 	// ppc64le:`MOVDBR\t\(R[0-9]+\)`,-`MOV[BHW]Z`
 	return uint64(s[7]) | uint64(s[6])<<8 | uint64(s[5])<<16 | uint64(s[4])<<24 | uint64(s[3])<<32 | uint64(s[2])<<40 | uint64(s[1])<<48 | uint64(s[0])<<56
 }