cmd/compile: improve atomic cas/swap/and/or intrinsics on arm64

ARMv8.1 added new instructions for atomic memory operations. This
change builds on the previous change, which added support for atomic
add (0a7ac93c27c9ade79fe0f66ae0bb81484c241ae5), and adds similar
support for the atomic-compare-and-swap, atomic-swap, atomic-or, and
atomic-and intrinsics. Since the new instructions are not guaranteed
to be present, their use is guarded by a branch on a CPU-feature flag
detected at run time.
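
For illustration, the guard has roughly the shape of the sketch below.
This is only a sketch: the compiler builds the branch as SSA blocks in
makeAtomicGuardedIntrinsicARM64 and tests the runtime's arm64HasATOMICS
flag; golang.org/x/sys/cpu is used here purely as a stand-in for that
flag, and both arms call sync/atomic so the example stays correct on
any machine.

    package main

    import (
        "fmt"
        "sync/atomic"

        "golang.org/x/sys/cpu" // stand-in for the runtime's arm64HasATOMICS flag
    )

    // xchg32 mirrors the shape of the code generated for the Xchg
    // intrinsic: a single feature-flag branch, marked likely, choosing
    // between the ARMv8.1 path and the original LL/SC path.
    func xchg32(ptr *uint32, new uint32) uint32 {
        if cpu.ARM64.HasATOMICS {
            // With this change the intrinsic lowers to a single SWPALW here.
            return atomic.SwapUint32(ptr, new)
        }
        // Otherwise the original LDAXRW/STLXRW loop is used.
        return atomic.SwapUint32(ptr, new)
    }

    func main() {
        var x uint32 = 1
        fmt.Println(xchg32(&x, 2), x) // 1 2
    }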

Performance on an ARMv8.1 machine:
name                 old time/op  new time/op  delta
CompareAndSwap-16    37.9ns ±16%  24.1ns ± 4%  -36.44%  (p=0.000 n=10+9)
CompareAndSwap64-16  38.6ns ±15%  24.1ns ± 3%  -37.47%  (p=0.000 n=10+10)

name       old time/op  new time/op  delta
Swap-16    46.9ns ±32%  12.5ns ± 6%  -73.40%  (p=0.000 n=10+10)
Swap64-16  53.4ns ± 1%  12.5ns ± 6%  -76.56%  (p=0.000 n=10+10)

name            old time/op  new time/op  delta
Or8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
Or-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
Or8Parallel-16  59.8ns ± 3%  12.5ns ± 2%  -79.10%  (p=0.000 n=10+10)
OrParallel-16   51.7ns ± 3%  12.5ns ± 2%  -75.84%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
And-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
And8Parallel-16  59.1ns ± 6%  12.8ns ± 3%  -78.33%  (p=0.000 n=10+10)
AndParallel-16   51.4ns ± 7%  12.8ns ± 3%  -75.03%  (p=0.000 n=10+10)

Performance on an ARMv8.0 machine (without the new atomic instructions):
name                 old time/op  new time/op  delta
CompareAndSwap-16    61.3ns ± 0%  62.4ns ± 0%  +1.70%  (p=0.000 n=8+9)
CompareAndSwap64-16  62.0ns ± 3%  61.3ns ± 2%    ~     (p=0.093 n=10+10)

name       old time/op  new time/op  delta
Swap-16     127ns ± 2%   131ns ± 2%  +2.91%  (p=0.001 n=10+10)
Swap64-16   128ns ± 1%   131ns ± 2%  +2.43%  (p=0.001 n=10+10)

name            old time/op  new time/op  delta
Or8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
Or-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
Or8Parallel-16   137ns ± 1%   144ns ± 1%  +4.97%  (p=0.000 n=10+10)
OrParallel-16    128ns ± 1%   136ns ± 1%  +6.34%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
And-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
And8Parallel-16   134ns ± 2%   141ns ± 1%  +5.29%  (p=0.000 n=10+10)
AndParallel-16    125ns ± 2%   134ns ± 1%  +7.10%  (p=0.000 n=10+10)

Fixes #39304

Change-Id: Idaca68701d4751650be6b4bedca3d57f51571712
Reviewed-on: https://go-review.googlesource.com/c/go/+/234217
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: fannie zhang <Fannie.Zhang@arm.com>
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 5c695ef..22b28a9 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -581,6 +581,24 @@
 		p2.From.Reg = arm64.REGTMP
 		p2.To.Type = obj.TYPE_BRANCH
 		gc.Patch(p2, p)
+	case ssa.OpARM64LoweredAtomicExchange64Variant,
+		ssa.OpARM64LoweredAtomicExchange32Variant:
+		swap := arm64.ASWPALD
+		if v.Op == ssa.OpARM64LoweredAtomicExchange32Variant {
+			swap = arm64.ASWPALW
+		}
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		out := v.Reg0()
+
+		// SWPALD	Rarg1, (Rarg0), Rout
+		p := s.Prog(swap)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r1
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = r0
+		p.RegTo2 = out
+
 	case ssa.OpARM64LoweredAtomicAdd64,
 		ssa.OpARM64LoweredAtomicAdd32:
 		// LDAXR	(Rarg0), Rout
@@ -687,6 +705,56 @@
 		p5.To.Type = obj.TYPE_REG
 		p5.To.Reg = out
 		gc.Patch(p2, p5)
+	case ssa.OpARM64LoweredAtomicCas64Variant,
+		ssa.OpARM64LoweredAtomicCas32Variant:
+		// Rarg0: ptr
+		// Rarg1: old
+		// Rarg2: new
+		// MOV  	Rarg1, Rtmp
+		// CASAL	Rtmp, (Rarg0), Rarg2
+		// CMP  	Rarg1, Rtmp
+		// CSET 	EQ, Rout
+		cas := arm64.ACASALD
+		cmp := arm64.ACMP
+		mov := arm64.AMOVD
+		if v.Op == ssa.OpARM64LoweredAtomicCas32Variant {
+			cas = arm64.ACASALW
+			cmp = arm64.ACMPW
+			mov = arm64.AMOVW
+		}
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		r2 := v.Args[2].Reg()
+		out := v.Reg0()
+
+		// MOV  	Rarg1, Rtmp
+		p := s.Prog(mov)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+
+		// CASAL	Rtmp, (Rarg0), Rarg2
+		p1 := s.Prog(cas)
+		p1.From.Type = obj.TYPE_REG
+		p1.From.Reg = arm64.REGTMP
+		p1.To.Type = obj.TYPE_MEM
+		p1.To.Reg = r0
+		p1.RegTo2 = r2
+
+		// CMP  	Rarg1, Rtmp
+		p2 := s.Prog(cmp)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = r1
+		p2.Reg = arm64.REGTMP
+
+		// CSET 	EQ, Rout
+		p3 := s.Prog(arm64.ACSET)
+		p3.From.Type = obj.TYPE_REG
+		p3.From.Reg = arm64.COND_EQ
+		p3.To.Type = obj.TYPE_REG
+		p3.To.Reg = out
+
 	case ssa.OpARM64LoweredAtomicAnd8,
 		ssa.OpARM64LoweredAtomicAnd32,
 		ssa.OpARM64LoweredAtomicOr8,
@@ -725,6 +793,63 @@
 		p3.From.Reg = arm64.REGTMP
 		p3.To.Type = obj.TYPE_BRANCH
 		gc.Patch(p3, p)
+	case ssa.OpARM64LoweredAtomicAnd8Variant,
+		ssa.OpARM64LoweredAtomicAnd32Variant:
+		atomic_clear := arm64.ALDCLRALW
+		if v.Op == ssa.OpARM64LoweredAtomicAnd8Variant {
+			atomic_clear = arm64.ALDCLRALB
+		}
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		out := v.Reg0()
+
+		// MVN       Rarg1, Rtemp
+		p := s.Prog(arm64.AMVN)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+
+		// LDCLRALW  Rtemp, (Rarg0), Rout
+		p1 := s.Prog(atomic_clear)
+		p1.From.Type = obj.TYPE_REG
+		p1.From.Reg = arm64.REGTMP
+		p1.To.Type = obj.TYPE_MEM
+		p1.To.Reg = r0
+		p1.RegTo2 = out
+
+		// AND       Rarg1, Rout
+		p2 := s.Prog(arm64.AAND)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = r1
+		p2.To.Type = obj.TYPE_REG
+		p2.To.Reg = out
+
+	case ssa.OpARM64LoweredAtomicOr8Variant,
+		ssa.OpARM64LoweredAtomicOr32Variant:
+		atomic_or := arm64.ALDORALW
+		if v.Op == ssa.OpARM64LoweredAtomicOr8Variant {
+			atomic_or = arm64.ALDORALB
+		}
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		out := v.Reg0()
+
+		// LDORALW  Rarg1, (Rarg0), Rout
+		p := s.Prog(atomic_or)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r1
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = r0
+		p.RegTo2 = out
+
+		// ORR       Rarg1, Rout
+		p2 := s.Prog(arm64.AORR)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = r1
+		p2.To.Type = obj.TYPE_REG
+		p2.To.Reg = out
+
 	case ssa.OpARM64MOVBreg,
 		ssa.OpARM64MOVBUreg,
 		ssa.OpARM64MOVHreg,
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 6748490..0b38e70 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3458,14 +3458,64 @@
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
 	addF("runtime/internal/atomic", "Xchg64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], args[1], s.mem())
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+	type atomicOpEmitter func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType)
+
+	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ, rtyp types.EType, emit atomicOpEmitter) intrinsicBuilder {
+
+		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			// Support for the atomic instructions is detected dynamically at run time.
+			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64HasATOMICS, s.sb)
+			v := s.load(types.Types[TBOOL], addr)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely
+
+			// We have atomic instructions - use them directly.
+			s.startBlock(bTrue)
+			emit(s, n, args, op1, typ)
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Use original instruction sequence.
+			s.startBlock(bFalse)
+			emit(s, n, args, op0, typ)
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			if rtyp == TNIL {
+				return nil
+			} else {
+				return s.variable(n, types.Types[rtyp])
+			}
+		}
+	}
+
+	atomicXchgXaddEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
+		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
+		s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+		s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+	}
+	addF("runtime/internal/atomic", "Xchg",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, TUINT32, TUINT32, atomicXchgXaddEmitterARM64),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "Xchg64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, TUINT64, TUINT64, atomicXchgXaddEmitterARM64),
+		sys.ARM64)
 
 	addF("runtime/internal/atomic", "Xadd",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@@ -3482,46 +3532,11 @@
 		},
 		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
 
-	makeXaddARM64 := func(op0 ssa.Op, op1 ssa.Op, ty types.EType) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
-		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
-			// Target Atomic feature is identified by dynamic detection
-			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64HasATOMICS, s.sb)
-			v := s.load(types.Types[TBOOL], addr)
-			b := s.endBlock()
-			b.Kind = ssa.BlockIf
-			b.SetControl(v)
-			bTrue := s.f.NewBlock(ssa.BlockPlain)
-			bFalse := s.f.NewBlock(ssa.BlockPlain)
-			bEnd := s.f.NewBlock(ssa.BlockPlain)
-			b.AddEdgeTo(bTrue)
-			b.AddEdgeTo(bFalse)
-			b.Likely = ssa.BranchUnlikely // most machines don't have Atomics nowadays
-
-			// We have atomic instructions - use it directly.
-			s.startBlock(bTrue)
-			v0 := s.newValue3(op1, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v0)
-			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v0)
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Use original instruction sequence.
-			s.startBlock(bFalse)
-			v1 := s.newValue3(op0, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v1)
-			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v1)
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Merge results.
-			s.startBlock(bEnd)
-			return s.variable(n, types.Types[ty])
-		}
-	}
-
 	addF("runtime/internal/atomic", "Xadd",
-		makeXaddARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32),
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32, TUINT32, atomicXchgXaddEmitterARM64),
 		sys.ARM64)
 	addF("runtime/internal/atomic", "Xadd64",
-		makeXaddARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64),
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64, TUINT64, atomicXchgXaddEmitterARM64),
 		sys.ARM64)
 
 	addF("runtime/internal/atomic", "Cas",
@@ -3530,14 +3545,14 @@
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TBOOL], v)
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
 	addF("runtime/internal/atomic", "Cas64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TBOOL], v)
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
 	addF("runtime/internal/atomic", "CasRel",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
@@ -3546,18 +3561,31 @@
 		},
 		sys.PPC64)
 
+	atomicCasEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
+		v := s.newValue4(op, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+		s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+		s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+	}
+
+	addF("runtime/internal/atomic", "Cas",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, TUINT32, TBOOL, atomicCasEmitterARM64),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "Cas64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, TUINT64, TBOOL, atomicCasEmitterARM64),
+		sys.ARM64)
+
 	addF("runtime/internal/atomic", "And8",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
 			return nil
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
+		sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
 	addF("runtime/internal/atomic", "And",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
 			return nil
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
+		sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
 	addF("runtime/internal/atomic", "Or8",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
@@ -3569,7 +3597,24 @@
 			s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
 			return nil
 		},
-		sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
+		sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
+
+	atomicAndOrEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
+		s.vars[&memVar] = s.newValue3(op, types.TypeMem, args[0], args[1], s.mem())
+	}
+
+	addF("runtime/internal/atomic", "And8",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd8, ssa.OpAtomicAnd8Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "And",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32, ssa.OpAtomicAnd32Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "Or8",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr8, ssa.OpAtomicOr8Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "Or",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32, ssa.OpAtomicOr32Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
+		sys.ARM64)
 
 	alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...)
 	alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...)
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index 7e014fe..9edc0c9 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -543,17 +543,24 @@
 (AtomicStore64      ...) => (STLR  ...)
 (AtomicStorePtrNoWB ...) => (STLR  ...)
 
-(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
-(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+(AtomicExchange(32|64)       ...) => (LoweredAtomicExchange(32|64) ...)
+(AtomicAdd(32|64)            ...) => (LoweredAtomicAdd(32|64) ...)
 (AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
 
+(AtomicAdd(32|64)Variant            ...) => (LoweredAtomicAdd(32|64)Variant      ...)
+(AtomicExchange(32|64)Variant       ...) => (LoweredAtomicExchange(32|64)Variant ...)
+(AtomicCompareAndSwap(32|64)Variant ...) => (LoweredAtomicCas(32|64)Variant      ...)
+
 // Currently the updated value is not used, but we need a register to temporarily hold it.
 (AtomicAnd8  ptr val mem) => (Select1 (LoweredAtomicAnd8  ptr val mem))
 (AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem))
 (AtomicOr8   ptr val mem) => (Select1 (LoweredAtomicOr8   ptr val mem))
 (AtomicOr32  ptr val mem) => (Select1 (LoweredAtomicOr32  ptr val mem))
 
-(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
+(AtomicAnd8Variant  ptr val mem) => (Select1 (LoweredAtomicAnd8Variant  ptr val mem))
+(AtomicAnd32Variant ptr val mem) => (Select1 (LoweredAtomicAnd32Variant ptr val mem))
+(AtomicOr8Variant   ptr val mem) => (Select1 (LoweredAtomicOr8Variant   ptr val mem))
+(AtomicOr32Variant  ptr val mem) => (Select1 (LoweredAtomicOr32Variant  ptr val mem))
 
 // Write barrier.
 (WB ...) => (LoweredWB ...)
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
index fe9edbf..87db2b7 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -621,6 +621,12 @@
 		{name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 		{name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 
+		// atomic exchange variant.
+		// store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+		// SWPALD	Rarg1, (Rarg0), Rout
+		{name: "LoweredAtomicExchange64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+		{name: "LoweredAtomicExchange32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
 		// atomic add.
 		// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
 		// LDAXR	(Rarg0), Rout
@@ -654,6 +660,21 @@
 		{name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 		{name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 
+		// atomic compare and swap variant.
+		// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+		// if *arg0 == arg1 {
+		//   *arg0 = arg2
+		//   return (true, memory)
+		// } else {
+		//   return (false, memory)
+		// }
+		// MOV  	Rarg1, Rtmp
+		// CASAL	Rtmp, (Rarg0), Rarg2
+		// CMP  	Rarg1, Rtmp
+		// CSET 	EQ, Rout
+		{name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+		{name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
 		// atomic and/or.
 		// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
 		// LDAXR	(Rarg0), Rout
@@ -665,6 +686,20 @@
 		{name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 		{name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 
+		// atomic and/or variant.
+		// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+		//   AND:
+		// MVN       Rarg1, Rtemp
+		// LDCLRALB  Rtemp, (Rarg0), Rout
+		// AND       Rarg1, Rout
+		//   OR:
+		// LDORALB  Rarg1, (Rarg0), Rout
+		// ORR       Rarg1, Rout
+		{name: "LoweredAtomicAnd8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+		{name: "LoweredAtomicAnd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+		{name: "LoweredAtomicOr8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+		{name: "LoweredAtomicOr32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+
 		// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
 		// It saves all GP registers if necessary,
 		// but clobbers R30 (LR) because it's a call.
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
index db8d7ba..9565199 100644
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -574,8 +574,16 @@
 	// These variants have the same semantics as above atomic operations.
 	// But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
 	// Currently, they are used on ARM64 only.
-	{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1.  arg2=memory.  Returns sum and new memory.
-	{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1.  arg2=memory.  Returns sum and new memory.
+	{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true},          // Do *arg0 += arg1.  arg2=memory.  Returns sum and new memory.
+	{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true},          // Do *arg0 += arg1.  arg2=memory.  Returns sum and new memory.
+	{name: "AtomicExchange32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true},     // Store arg1 to *arg0.  arg2=memory.  Returns old contents of *arg0 and new memory.
+	{name: "AtomicExchange64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true},     // Store arg1 to *arg0.  arg2=memory.  Returns old contents of *arg0 and new memory.
+	{name: "AtomicCompareAndSwap32Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2.  Returns true if store happens and new memory.
+	{name: "AtomicCompareAndSwap64Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2.  Returns true if store happens and new memory.
+	{name: "AtomicAnd8Variant", argLength: 3, typ: "Mem", hasSideEffects: true},                    // *arg0 &= arg1.  arg2=memory.  Returns memory.
+	{name: "AtomicAnd32Variant", argLength: 3, typ: "Mem", hasSideEffects: true},                    // *arg0 &= arg1.  arg2=memory.  Returns memory.
+	{name: "AtomicOr8Variant", argLength: 3, typ: "Mem", hasSideEffects: true},                     // *arg0 |= arg1.  arg2=memory.  Returns memory.
+	{name: "AtomicOr32Variant", argLength: 3, typ: "Mem", hasSideEffects: true},                     // *arg0 |= arg1.  arg2=memory.  Returns memory.
 
 	// Clobber experiment op
 	{name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 25c1df1..c0b663c 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1581,16 +1581,24 @@
 	OpARM64STLRW
 	OpARM64LoweredAtomicExchange64
 	OpARM64LoweredAtomicExchange32
+	OpARM64LoweredAtomicExchange64Variant
+	OpARM64LoweredAtomicExchange32Variant
 	OpARM64LoweredAtomicAdd64
 	OpARM64LoweredAtomicAdd32
 	OpARM64LoweredAtomicAdd64Variant
 	OpARM64LoweredAtomicAdd32Variant
 	OpARM64LoweredAtomicCas64
 	OpARM64LoweredAtomicCas32
+	OpARM64LoweredAtomicCas64Variant
+	OpARM64LoweredAtomicCas32Variant
 	OpARM64LoweredAtomicAnd8
 	OpARM64LoweredAtomicAnd32
 	OpARM64LoweredAtomicOr8
 	OpARM64LoweredAtomicOr32
+	OpARM64LoweredAtomicAnd8Variant
+	OpARM64LoweredAtomicAnd32Variant
+	OpARM64LoweredAtomicOr8Variant
+	OpARM64LoweredAtomicOr32Variant
 	OpARM64LoweredWB
 	OpARM64LoweredPanicBoundsA
 	OpARM64LoweredPanicBoundsB
@@ -2881,6 +2889,14 @@
 	OpAtomicOr32
 	OpAtomicAdd32Variant
 	OpAtomicAdd64Variant
+	OpAtomicExchange32Variant
+	OpAtomicExchange64Variant
+	OpAtomicCompareAndSwap32Variant
+	OpAtomicCompareAndSwap64Variant
+	OpAtomicAnd8Variant
+	OpAtomicAnd32Variant
+	OpAtomicOr8Variant
+	OpAtomicOr32Variant
 	OpClobber
 )
 
@@ -20995,6 +21011,38 @@
 		},
 	},
 	{
+		name:            "LoweredAtomicExchange64Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:            "LoweredAtomicExchange32Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
 		name:            "LoweredAtomicAdd64",
 		argLen:          3,
 		resultNotInArgs: true,
@@ -21099,6 +21147,44 @@
 		},
 	},
 	{
+		name:            "LoweredAtomicCas64Variant",
+		argLen:          4,
+		resultNotInArgs: true,
+		clobberFlags:    true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		unsafePoint:     true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{2, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:            "LoweredAtomicCas32Variant",
+		argLen:          4,
+		resultNotInArgs: true,
+		clobberFlags:    true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		unsafePoint:     true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{2, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
 		name:            "LoweredAtomicAnd8",
 		argLen:          3,
 		resultNotInArgs: true,
@@ -21171,6 +21257,72 @@
 		},
 	},
 	{
+		name:            "LoweredAtomicAnd8Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		unsafePoint:     true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:            "LoweredAtomicAnd32Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		unsafePoint:     true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:            "LoweredAtomicOr8Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:            "LoweredAtomicOr32Variant",
+		argLen:          3,
+		resultNotInArgs: true,
+		faultOnNilArg0:  true,
+		hasSideEffects:  true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
 		name:         "LoweredWB",
 		auxType:      auxSym,
 		argLen:       3,
@@ -35875,6 +36027,54 @@
 		generic:        true,
 	},
 	{
+		name:           "AtomicExchange32Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicExchange64Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicCompareAndSwap32Variant",
+		argLen:         4,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicCompareAndSwap64Variant",
+		argLen:         4,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicAnd8Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicAnd32Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicOr8Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
+		name:           "AtomicOr32Variant",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
+	{
 		name:      "Clobber",
 		auxType:   auxSymOff,
 		argLen:    0,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 9a5e976..353696b 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -426,20 +426,36 @@
 		return true
 	case OpAtomicAnd32:
 		return rewriteValueARM64_OpAtomicAnd32(v)
+	case OpAtomicAnd32Variant:
+		return rewriteValueARM64_OpAtomicAnd32Variant(v)
 	case OpAtomicAnd8:
 		return rewriteValueARM64_OpAtomicAnd8(v)
+	case OpAtomicAnd8Variant:
+		return rewriteValueARM64_OpAtomicAnd8Variant(v)
 	case OpAtomicCompareAndSwap32:
 		v.Op = OpARM64LoweredAtomicCas32
 		return true
+	case OpAtomicCompareAndSwap32Variant:
+		v.Op = OpARM64LoweredAtomicCas32Variant
+		return true
 	case OpAtomicCompareAndSwap64:
 		v.Op = OpARM64LoweredAtomicCas64
 		return true
+	case OpAtomicCompareAndSwap64Variant:
+		v.Op = OpARM64LoweredAtomicCas64Variant
+		return true
 	case OpAtomicExchange32:
 		v.Op = OpARM64LoweredAtomicExchange32
 		return true
+	case OpAtomicExchange32Variant:
+		v.Op = OpARM64LoweredAtomicExchange32Variant
+		return true
 	case OpAtomicExchange64:
 		v.Op = OpARM64LoweredAtomicExchange64
 		return true
+	case OpAtomicExchange64Variant:
+		v.Op = OpARM64LoweredAtomicExchange64Variant
+		return true
 	case OpAtomicLoad32:
 		v.Op = OpARM64LDARW
 		return true
@@ -454,8 +470,12 @@
 		return true
 	case OpAtomicOr32:
 		return rewriteValueARM64_OpAtomicOr32(v)
+	case OpAtomicOr32Variant:
+		return rewriteValueARM64_OpAtomicOr32Variant(v)
 	case OpAtomicOr8:
 		return rewriteValueARM64_OpAtomicOr8(v)
+	case OpAtomicOr8Variant:
+		return rewriteValueARM64_OpAtomicOr8Variant(v)
 	case OpAtomicStore32:
 		v.Op = OpARM64STLRW
 		return true
@@ -21363,6 +21383,25 @@
 		return true
 	}
 }
+func rewriteValueARM64_OpAtomicAnd32Variant(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AtomicAnd32Variant ptr val mem)
+	// result: (Select1 (LoweredAtomicAnd32Variant ptr val mem))
+	for {
+		ptr := v_0
+		val := v_1
+		mem := v_2
+		v.reset(OpSelect1)
+		v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicAnd32Variant, types.NewTuple(typ.UInt32, types.TypeMem))
+		v0.AddArg3(ptr, val, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueARM64_OpAtomicAnd8(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -21382,6 +21421,25 @@
 		return true
 	}
 }
+func rewriteValueARM64_OpAtomicAnd8Variant(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AtomicAnd8Variant ptr val mem)
+	// result: (Select1 (LoweredAtomicAnd8Variant ptr val mem))
+	for {
+		ptr := v_0
+		val := v_1
+		mem := v_2
+		v.reset(OpSelect1)
+		v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicAnd8Variant, types.NewTuple(typ.UInt8, types.TypeMem))
+		v0.AddArg3(ptr, val, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueARM64_OpAtomicOr32(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -21401,6 +21459,25 @@
 		return true
 	}
 }
+func rewriteValueARM64_OpAtomicOr32Variant(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AtomicOr32Variant ptr val mem)
+	// result: (Select1 (LoweredAtomicOr32Variant ptr val mem))
+	for {
+		ptr := v_0
+		val := v_1
+		mem := v_2
+		v.reset(OpSelect1)
+		v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicOr32Variant, types.NewTuple(typ.UInt32, types.TypeMem))
+		v0.AddArg3(ptr, val, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueARM64_OpAtomicOr8(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -21420,6 +21497,25 @@
 		return true
 	}
 }
+func rewriteValueARM64_OpAtomicOr8Variant(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AtomicOr8Variant ptr val mem)
+	// result: (Select1 (LoweredAtomicOr8Variant ptr val mem))
+	for {
+		ptr := v_0
+		val := v_1
+		mem := v_2
+		v.reset(OpSelect1)
+		v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicOr8Variant, types.NewTuple(typ.UInt8, types.TypeMem))
+		v0.AddArg3(ptr, val, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueARM64_OpAvg64u(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index 434aa6d..2476c06 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -142,3 +142,54 @@
 		}
 	})
 }
+
+func BenchmarkCas(b *testing.B) {
+	var x uint32
+	x = 1
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			atomic.Cas(ptr, 1, 0)
+			atomic.Cas(ptr, 0, 1)
+		}
+	})
+}
+
+func BenchmarkCas64(b *testing.B) {
+	var x uint64
+	x = 1
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			atomic.Cas64(ptr, 1, 0)
+			atomic.Cas64(ptr, 0, 1)
+		}
+	})
+}
+func BenchmarkXchg(b *testing.B) {
+	var x uint32
+	x = 1
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		var y uint32
+		y = 1
+		for pb.Next() {
+			y = atomic.Xchg(ptr, y)
+			y += 1
+		}
+	})
+}
+
+func BenchmarkXchg64(b *testing.B) {
+	var x uint64
+	x = 1
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		var y uint64
+		y = 1
+		for pb.Next() {
+			y = atomic.Xchg64(ptr, y)
+			y += 1
+		}
+	})
+}