[release-branch.go1.16] cmd/compile: fix long RMW bit operations on AMD64

Under certain circumstances, the existing rules for bit operations can
produce code that writes beyond its intended bounds. For example,
consider the following code:

    func repro(b []byte, addr, bit int32) {
	    _ = b[3]
	    v := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 | 1<<(bit&31)
	    b[0] = byte(v)
	    b[1] = byte(v >> 8)
	    b[2] = byte(v >> 16)
	    b[3] = byte(v >> 24)
    }

Roughly speaking:

1. The expression `1 << (bit & 31)` is rewritten into `(SHLL 1 bit)`
2. The expression `uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
   uint32(b[3])<<24` is rewritten into `(MOVLload &b[0])`
3. The statements `b[0] = byte(v) ... b[3] = byte(v >> 24)` are
   rewritten into `(MOVLstore &b[0], v)`
4. `(ORL (SHLL 1, bit) (MOVLload &b[0]))` is rewritten into
   `(BTSL (MOVLload &b[0]) bit)`. This is a valid transformation because
   the destination is a register: in this case, the bit offset is masked
   by the number of bits in the destination register. This is identical
   to the masking performed by `SHL`.
5. `(MOVLstore &b[0] (BTSL (MOVLload &b[0]) bit))` is rewritten into
   `(BTSLmodify &b[0] bit)`. This is an invalid transformation because
   the destination is memory: in this case, the bit offset is not
   masked, and the chosen instruction may write outside its intended
   32-bit location (the sketch after this list shows how that corrupts
   adjacent memory).
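
The following is a hypothetical check (not the regression test added
for #45253) built around the `repro` function above, assuming `repro`
is kept out of line (e.g. marked `//go:noinline`) so the shift is not
constant-folded away. With `bit = 32`, the source expression
`1 << (bit & 31)` sets bit 0, so only `buf[0]` may change, but an
unmasked BTS with a memory operand addresses bit 0 of `buf[4]` instead:

    func main() {
	    var buf [8]byte
	    // bit&31 == 0, so only buf[0] should become 1.
	    repro(buf[:4], 0, 32)
	    for i := 4; i < 8; i++ {
		    if buf[i] != 0 {
			    // With the invalid BTSLmodify rewrite, buf[4] gets bit 0 set.
			    panic("repro wrote outside its intended 32-bit location")
		    }
	    }
    }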

These changes fix the invalid rewrite performed in step (5) by
explicitly masking the bit offset operand to `BT(S|R|C)(L|Q)modify`. In
the example above, the adjusted rules produce
`(BTSLmodify &b[0] (ANDLconst [31] bit))` in step (5).
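
The added `ANDLconst [31]` (or `ANDQconst [63]` for the 64-bit forms) is
semantically free, because the source-level shift already reduces the
offset modulo the operand width. A minimal Go-level sketch of that
equivalence, using hypothetical helper names rather than compiler code:

    // What the source expression `v | 1<<(bit&31)` computes.
    func setBitSource(v uint32, bit int32) uint32 {
	    return v | 1<<(uint32(bit)&31)
    }

    // What the fixed rewrite performs: reduce the offset mod 32
    // (ANDLconst [31]) before the BTS-style set. The two functions
    // agree for every value of bit.
    func setBitMaskedBTS(v uint32, bit int32) uint32 {
	    off := uint32(bit) & 31
	    return v | 1<<off
    }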

These changes also add several new rules to rewrite bit sets, toggles,
and clears that are rooted at `(OR|XOR|AND)(L|Q)modify` operators into
appropriate `BT(S|R|C)(L|Q)modify` operators. These rules catch cases
where `MOV(L|Q)store ((OR|XOR|AND)(L|Q) ...)` is rewritten to
`(OR|XOR|AND)(L|Q)modify` before the `(OR|XOR|AND)(L|Q) ...` can be
rewritten to `BT(S|R|C)(L|Q) ...`.
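
For example, a statement of the following shape (mirroring the new
cases added to test/codegen/bits.go in this change) now compiles to a
masked `BTSLmodify` regardless of which intermediate form the rewriter
reaches first:

    // On amd64 this is expected to assemble to `BTSL reg, (mem)`
    // rather than a separate load/OR/store sequence.
    func setBit(a []uint32, i uint32) {
	    a[0] |= 1 << (i & 31)
    }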

Overall, compilecmp reports small improvements in code size on
darwin/amd64 when the changes to the compiler itself are excluded:

file                               before   after    Δ       %
runtime.s                          536464   536412   -52     -0.010%
bytes.s                            32629    32593    -36     -0.110%
strings.s                          44565    44529    -36     -0.081%
os/signal.s                        7967     7959     -8      -0.100%
cmd/vendor/golang.org/x/sys/unix.s 81686    81678    -8      -0.010%
math/big.s                         188235   188253   +18     +0.010%
cmd/link/internal/loader.s         89295    89056    -239    -0.268%
cmd/link/internal/ld.s             633551   633232   -319    -0.050%
cmd/link/internal/arm.s            18934    18928    -6      -0.032%
cmd/link/internal/arm64.s          31814    31801    -13     -0.041%
cmd/link/internal/riscv64.s        7347     7345     -2      -0.027%
cmd/compile/internal/ssa.s         4029173  4033066  +3893   +0.097%
total                              21298280 21301472 +3192   +0.015%

Fixes #45253

Change-Id: I2e560548b515865129e1724e150e30540e9d29ce
GitHub-Last-Rev: ab94ede1d097f920a9d1d3da403c8e4a3d8f6d44
GitHub-Pull-Request: golang/go#45242
Reviewed-on: https://go-review.googlesource.com/c/go/+/305069
Trust: Emmanuel Odeke <emmanuel@orijtech.com>
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index a866a96..5de1e1e 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -623,6 +623,14 @@
 // Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b)
 (OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y)
 (XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y)
+(ORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
+	(BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(ORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
+	(BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+(XORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
+	(BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(XORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
+	(BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 
 // Convert ORconst into BTS, if the code gets smaller, with boundary being
 // (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes).
@@ -645,6 +653,10 @@
     => (BTRQconst [int8(log64(^c))] x)
 (ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
     => (BTRLconst [int8(log32(^c))] x)
+(ANDLmodify [off] {sym} ptr (NOTL s:(SHLL (MOVLconst [1]) <t> x)) mem) =>
+	(BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(ANDQmodify [off] {sym} ptr (NOTQ s:(SHLQ (MOVQconst [1]) <t> x)) mem) =>
+	(BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 
 // Special-case bit patterns on first/last bit.
 // generic.rules changes ANDs of high-part/low-part masks into a couple of shifts,
@@ -2050,11 +2062,15 @@
 ((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
 ((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
 (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
-(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
-	((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+	((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((BTC|BTR|BTS)L l:(MOVLload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+	((BTC|BTR|BTS)Lmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
 (MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
-(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
-	((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+	((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((BTC|BTR|BTS)Q l:(MOVQload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+	((BTC|BTR|BTS)Qmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 
 // Merge ADDQconst and LEAQ into atomic loads.
 (MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index de53726..a87581b 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -358,6 +358,11 @@
 		{name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64
 
 		// direct bit operation on memory operand
+		//
+		// Note that these operations do not mask the bit offset (arg1), and will write beyond their expected
+		// bounds if that argument is larger than 64/32 (for BT*Q and BT*L, respectively). If the compiler
+		// cannot prove that arg1 is in range, it must be explicitly masked (see e.g. the patterns that produce
+		// BT*modify from (MOVstore (BT* (MOVLload ptr mem) x) mem)).
 		{name: "BTCQmodify", argLength: 3, reg: gpstore, asm: "BTCQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},     // complement bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
 		{name: "BTCLmodify", argLength: 3, reg: gpstore, asm: "BTCL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},     // complement bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
 		{name: "BTSQmodify", argLength: 3, reg: gpstore, asm: "BTSQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},     // set bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 75d4ff7..8272a40 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2998,6 +2998,36 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (ANDLmodify [off] {sym} ptr (NOTL s:(SHLL (MOVLconst [1]) <t> x)) mem)
+	// result: (BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64NOTL {
+			break
+		}
+		s := v_1.Args[0]
+		if s.Op != OpAMD64SHLL {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVLconst || auxIntToInt32(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTRLmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (ANDLmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (ANDLmodify [off1+off2] {sym} base val mem)
@@ -3377,6 +3407,36 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (ANDQmodify [off] {sym} ptr (NOTQ s:(SHLQ (MOVQconst [1]) <t> x)) mem)
+	// result: (BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64NOTQ {
+			break
+		}
+		s := v_1.Args[0]
+		if s.Op != OpAMD64SHLQ {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVQconst || auxIntToInt64(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTRQmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (ANDQmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (ANDQmodify [off1+off2] {sym} base val mem)
@@ -12709,9 +12769,9 @@
 		}
 		break
 	}
-	// match: (MOVLstore {sym} [off] ptr y:(BTCL l:(MOVLload [off] {sym} ptr mem) x) mem)
+	// match: (MOVLstore {sym} [off] ptr y:(BTCL l:(MOVLload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTCLmodify [off] {sym} ptr x mem)
+	// result: (BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -12720,6 +12780,7 @@
 		if y.Op != OpAMD64BTCL {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVLload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -12732,12 +12793,15 @@
 		v.reset(OpAMD64BTCLmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
-	// match: (MOVLstore {sym} [off] ptr y:(BTRL l:(MOVLload [off] {sym} ptr mem) x) mem)
+	// match: (MOVLstore {sym} [off] ptr y:(BTRL l:(MOVLload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTRLmodify [off] {sym} ptr x mem)
+	// result: (BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -12746,6 +12810,7 @@
 		if y.Op != OpAMD64BTRL {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVLload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -12758,12 +12823,15 @@
 		v.reset(OpAMD64BTRLmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
-	// match: (MOVLstore {sym} [off] ptr y:(BTSL l:(MOVLload [off] {sym} ptr mem) x) mem)
+	// match: (MOVLstore {sym} [off] ptr y:(BTSL l:(MOVLload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTSLmodify [off] {sym} ptr x mem)
+	// result: (BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -12772,6 +12840,7 @@
 		if y.Op != OpAMD64BTSL {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVLload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -12784,7 +12853,10 @@
 		v.reset(OpAMD64BTSLmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
 	// match: (MOVLstore [off] {sym} ptr a:(ADDLconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem)
@@ -13525,6 +13597,7 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
 	// match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (MOVQstore [off1+off2] {sym} ptr val mem)
@@ -13890,9 +13963,9 @@
 		}
 		break
 	}
-	// match: (MOVQstore {sym} [off] ptr y:(BTCQ l:(MOVQload [off] {sym} ptr mem) x) mem)
+	// match: (MOVQstore {sym} [off] ptr y:(BTCQ l:(MOVQload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTCQmodify [off] {sym} ptr x mem)
+	// result: (BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -13901,6 +13974,7 @@
 		if y.Op != OpAMD64BTCQ {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVQload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -13913,12 +13987,15 @@
 		v.reset(OpAMD64BTCQmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
-	// match: (MOVQstore {sym} [off] ptr y:(BTRQ l:(MOVQload [off] {sym} ptr mem) x) mem)
+	// match: (MOVQstore {sym} [off] ptr y:(BTRQ l:(MOVQload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTRQmodify [off] {sym} ptr x mem)
+	// result: (BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -13927,6 +14004,7 @@
 		if y.Op != OpAMD64BTRQ {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVQload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -13939,12 +14017,15 @@
 		v.reset(OpAMD64BTRQmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
-	// match: (MOVQstore {sym} [off] ptr y:(BTSQ l:(MOVQload [off] {sym} ptr mem) x) mem)
+	// match: (MOVQstore {sym} [off] ptr y:(BTSQ l:(MOVQload [off] {sym} ptr mem) <t> x) mem)
 	// cond: y.Uses==1 && l.Uses==1 && clobber(y, l)
-	// result: (BTSQmodify [off] {sym} ptr x mem)
+	// result: (BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		sym := auxToSym(v.Aux)
@@ -13953,6 +14034,7 @@
 		if y.Op != OpAMD64BTSQ {
 			break
 		}
+		t := y.Type
 		x := y.Args[1]
 		l := y.Args[0]
 		if l.Op != OpAMD64MOVQload || auxIntToInt32(l.AuxInt) != off || auxToSym(l.Aux) != sym {
@@ -13965,7 +14047,10 @@
 		v.reset(OpAMD64BTSQmodify)
 		v.AuxInt = int32ToAuxInt(off)
 		v.Aux = symToAux(sym)
-		v.AddArg3(ptr, x, mem)
+		v0 := b.NewValue0(l.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
 		return true
 	}
 	// match: (MOVQstore [off] {sym} ptr a:(ADDQconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem)
@@ -18352,6 +18437,33 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (ORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem)
+	// result: (BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		s := v_1
+		if s.Op != OpAMD64SHLL {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVLconst || auxIntToInt32(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTSLmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (ORLmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (ORLmodify [off1+off2] {sym} base val mem)
@@ -19979,6 +20091,33 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (ORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem)
+	// result: (BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		s := v_1
+		if s.Op != OpAMD64SHLQ {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVQconst || auxIntToInt64(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTSQmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (ORQmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (ORQmodify [off1+off2] {sym} base val mem)
@@ -28014,6 +28153,33 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (XORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem)
+	// result: (BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		s := v_1
+		if s.Op != OpAMD64SHLL {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVLconst || auxIntToInt32(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTCLmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDLconst, t)
+		v0.AuxInt = int32ToAuxInt(31)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (XORLmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (XORLmodify [off1+off2] {sym} base val mem)
@@ -28382,6 +28548,33 @@
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	// match: (XORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem)
+	// result: (BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		s := v_1
+		if s.Op != OpAMD64SHLQ {
+			break
+		}
+		t := s.Type
+		x := s.Args[1]
+		s_0 := s.Args[0]
+		if s_0.Op != OpAMD64MOVQconst || auxIntToInt64(s_0.AuxInt) != 1 {
+			break
+		}
+		mem := v_2
+		v.reset(OpAMD64BTCQmodify)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v0 := b.NewValue0(v.Pos, OpAMD64ANDQconst, t)
+		v0.AuxInt = int32ToAuxInt(63)
+		v0.AddArg(x)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	// match: (XORQmodify [off1] {sym} (ADDQconst [off2] base) val mem)
 	// cond: is32Bit(int64(off1)+int64(off2))
 	// result: (XORQmodify [off1+off2] {sym} base val mem)
diff --git a/test/codegen/bits.go b/test/codegen/bits.go
index 4508eba..6f75022 100644
--- a/test/codegen/bits.go
+++ b/test/codegen/bits.go
@@ -262,8 +262,8 @@
 	return n
 }
 
-// check direct operation on memory with constant source
-func bitOpOnMem(a []uint32) {
+// check direct operation on memory with constant and shifted constant sources
+func bitOpOnMem(a []uint32, b, c, d uint32) {
 	// amd64:`ANDL\s[$]200,\s\([A-Z]+\)`
 	a[0] &= 200
 	// amd64:`ORL\s[$]220,\s4\([A-Z]+\)`
@@ -276,6 +276,12 @@
 	a[4] |= 0x4000
 	// amd64:`BTCL\s[$]13,\s20\([A-Z]+\)`,-`XORL`
 	a[5] ^= 0x2000
+	// amd64:`BTRL\s[A-Z]+,\s24\([A-Z]+\)`
+	a[6] &^= 1 << (b & 31)
+	// amd64:`BTSL\s[A-Z]+,\s28\([A-Z]+\)`
+	a[7] |= 1 << (c & 31)
+	// amd64:`BTCL\s[A-Z]+,\s32\([A-Z]+\)`
+	a[8] ^= 1 << (d & 31)
 }
 
 func bitcheckMostNegative(b uint8) bool {