cmd/compile,sync/atomic: make Add, And & Or sequentially consistent on PPC64

Fixes #79186

Change-Id: If7e298270ac6252b092371725d6a96aa871bf919
Reviewed-on: https://go-review.googlesource.com/c/go/+/774020
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com>
Auto-Submit: Jorropo <jorropo.pgm@gmail.com>
Reviewed-by: Paul Murphy <paumurph@redhat.com>
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index b2358a9..a0d81d3 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -135,6 +135,7 @@
 		// AND/OR	Rarg1, Rtmp
 		// STBCCC/STWCCC Rtmp, (Rarg0)
 		// BNE		-3(PC)
+		// LWSYNC
 		ld := ppc64.ALBAR
 		st := ppc64.ASTBCCC
 		if v.Op == ssa.OpPPC64LoweredAtomicAnd32 || v.Op == ssa.OpPPC64LoweredAtomicOr32 {
@@ -170,6 +171,10 @@
 		p3 := s.Prog(ppc64.ABNE)
 		p3.To.Type = obj.TYPE_BRANCH
 		p3.To.SetTarget(p)
+		// LWSYNC - Provide acquire ordering to pair with the
+		// release (pre-LWSYNC) above, making the operation
+		// sequentially consistent.
+		s.Prog(ppc64.ALWSYNC)
 
 	case ssa.OpPPC64LoweredAtomicAdd32,
 		ssa.OpPPC64LoweredAtomicAdd64:
@@ -179,6 +184,7 @@
 		// STDCCC/STWCCC Rout, (Rarg0)
 		// BNE         -3(PC)
 		// MOVW		Rout,Rout (if Add32)
+		// LWSYNC
 		ld := ppc64.ALDAR
 		st := ppc64.ASTDCCC
 		if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
@@ -188,10 +194,10 @@
 		r0 := v.Args[0].Reg()
 		r1 := v.Args[1].Reg()
 		out := v.Reg0()
-		// LWSYNC - Assuming shared data not write-through-required nor
-		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
-		plwsync := s.Prog(ppc64.ALWSYNC)
-		plwsync.To.Type = obj.TYPE_NONE
+		// LWSYNC - Provide release ordering ahead of the LL/SC loop.
+		// Assumes shared data is neither write-through-required nor
+		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
+		s.Prog(ppc64.ALWSYNC)
 		// LDAR or LWAR
 		p := s.Prog(ld)
 		p.From.Type = obj.TYPE_MEM
@@ -223,6 +229,11 @@
 			p5.From.Type = obj.TYPE_REG
 			p5.From.Reg = out
 		}
+		// LWSYNC - Provide acquire ordering to pair with the
+		// release (pre-LWSYNC) above, making the operation
+		// sequentially consistent.
+		plwsync2 := s.Prog(ppc64.ALWSYNC)
+		plwsync2.To.Type = obj.TYPE_NONE
 
 	case ssa.OpPPC64LoweredAtomicExchange8,
 		ssa.OpPPC64LoweredAtomicExchange32,
diff --git a/src/internal/runtime/atomic/atomic_ppc64x.s b/src/internal/runtime/atomic/atomic_ppc64x.s
index bff7d19..a82a34e 100644
--- a/src/internal/runtime/atomic/atomic_ppc64x.s
+++ b/src/internal/runtime/atomic/atomic_ppc64x.s
@@ -220,6 +220,7 @@
 	ADD	R5, R3
 	STWCCC	R3, (R4)
 	BNE	-3(PC)
+	LWSYNC
 	MOVW	R3, ret+16(FP)
 	RET
 
@@ -235,6 +236,7 @@
 	ADD	R5, R3
 	STDCCC	R3, (R4)
 	BNE	-3(PC)
+	LWSYNC
 	MOVD	R3, ret+16(FP)
 	RET
 
@@ -343,6 +345,7 @@
 	OR	R4, R6
 	STBCCC	R6, (R3)
 	BNE	again
+	LWSYNC
 	RET
 
 // void ·And8(byte volatile*, byte);
@@ -355,6 +358,7 @@
 	AND	R4, R6
 	STBCCC	R6, (R3)
 	BNE	again
+	LWSYNC
 	RET
 
 // func Or(addr *uint32, v uint32)
@@ -367,6 +371,7 @@
 	OR	R4, R6
 	STWCCC	R6, (R3)
 	BNE	again
+	LWSYNC
 	RET
 
 // func And(addr *uint32, v uint32)
@@ -379,6 +384,7 @@
 	AND	R4, R6
 	STWCCC	R6, (R3)
 	BNE	again
+	LWSYNC
 	RET
 
 // func Or32(addr *uint32, v uint32) old uint32
@@ -391,6 +397,7 @@
 	OR	R4, R6, R7
 	STWCCC	R7, (R3)
 	BNE	again
+	LWSYNC
 	MOVW	R6, ret+16(FP)
 	RET
 
@@ -404,6 +411,7 @@
 	AND	R4, R6, R7
 	STWCCC	R7, (R3)
 	BNE	again
+	LWSYNC
 	MOVW	R6, ret+16(FP)
 	RET
 
@@ -417,6 +425,7 @@
 	OR	R4, R6, R7
 	STDCCC	R7, (R3)
 	BNE	again
+	LWSYNC
 	MOVD	R6, ret+16(FP)
 	RET
 
@@ -430,6 +439,7 @@
 	AND	R4, R6, R7
 	STDCCC	R7, (R3)
 	BNE	again
+	LWSYNC
 	MOVD	R6, ret+16(FP)
 	RET
 
diff --git a/test/fixedbugs/issue79186.go b/test/fixedbugs/issue79186.go
new file mode 100644
index 0000000..21bb6c8
--- /dev/null
+++ b/test/fixedbugs/issue79186.go
@@ -0,0 +1,67 @@
+// run
+
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Issue 79186: on ppc64le (POWER8/9), atomic add operations lacked a
+// post-barrier (acquire ordering), allowing loads after an RWMutex.RLock
+// to be speculatively reordered before the lock acquisition, causing
+// concurrent map read and map write.
+
+package main
+
+import (
+	"runtime"
+	"sync"
+)
+
+type M struct {
+	mu sync.RWMutex
+	m  map[int]int
+}
+
+func NewM() *M {
+	return &M{m: make(map[int]int)}
+}
+
+func (x *M) Get(k int) (int, bool) {
+	x.mu.RLock()
+	v, ok := x.m[k]
+	x.mu.RUnlock()
+	return v, ok
+}
+
+func (x *M) Set(k, v int) {
+	x.mu.Lock()
+	x.m[k] = v
+	x.mu.Unlock()
+}
+
+func main() {
+	runtime.GOMAXPROCS(2)
+
+	x := NewM()
+
+	const goroutines = 256
+	const iters = 200000
+
+	var wg sync.WaitGroup
+	wg.Add(goroutines)
+
+	for g := 0; g < goroutines; g++ {
+		go func(id int) {
+			defer wg.Done()
+			for i := 0; i < iters; i++ {
+				k := (id + i) & 15
+				if _, ok := x.Get(k); !ok {
+					x.Set(k, i)
+				} else if i&7 == 0 {
+					x.Set(k, i)
+				}
+			}
+		}(g)
+	}
+
+	wg.Wait()
+}