runtime: fix net poll races

The netpoll code was written long ago, when the
only multiprocessors that Go ran on were x86.
It assumed that an atomic store would trigger a
full memory barrier and then used that barrier
to order otherwise racy access to a handful of fields,
including pollDesc.closing.

On ARM64, this code has finally failed, because
the atomic store is on a value completely unrelated
to any of the racily-accessed fields, and the ARMv8
hardware, unlike x86, is clever enough not to do a
full memory barrier for a simple atomic store.
We are seeing a constant background rate of trybot
failures where the net/http tests deadlock - a netpollblock
has clearly happened after the pollDesc has begun to close.

The code that does the racy reads is netpollcheckerr,
which needs to be able to run without acquiring a lock.
This CL fixes the race, without introducing unnecessary
inefficiency or deadlock, by arranging for every updater
of the relevant fields to publish a summary as a single
atomic uint32, and then having netpollcheckerr use a
single atomic load to fetch the relevant bits and then
proceed as before.

Fixes #45211 (until proven otherwise!).

Change-Id: Ib6788c8da4d00b7bda84d55ca3fdffb5a64c1a0a
Reviewed-on: https://go-review.googlesource.com/c/go/+/378234
Trust: Russ Cox <rsc@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Trust: Bryan Mills <bcmills@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 322a6f3..bb3dd35 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -71,31 +71,99 @@
 //go:notinheap
 type pollDesc struct {
 	link *pollDesc // in pollcache, protected by pollcache.lock
+	fd   uintptr   // constant for pollDesc usage lifetime
 
-	// The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations.
-	// This fully covers seq, rt and wt variables. fd is constant throughout the PollDesc lifetime.
-	// pollReset, pollWait, pollWaitCanceled and runtime·netpollready (IO readiness notification)
-	// proceed w/o taking the lock. So closing, everr, rg, rd, wg and wd are manipulated
-	// in a lock-free way by all operations.
-	// TODO(golang.org/issue/49008): audit these lock-free fields for continued correctness.
-	// NOTE(dvyukov): the following code uses uintptr to store *g (rg/wg),
-	// that will blow up when GC starts moving objects.
+	// atomicInfo holds bits from closing, rd, and wd,
+	// which are only ever written while holding the lock,
+	// summarized for use by netpollcheckerr,
+	// which cannot acquire the lock.
+	// After writing these fields under lock in a way that
+	// might change the summary, code must call publishInfo
+	// before releasing the lock.
+	// Code that changes fields and then calls netpollunblock
+	// (while still holding the lock) must call publishInfo
+	// before calling netpollunblock, because publishInfo is what
+	// stops netpollblock from blocking anew
+	// (by changing the result of netpollcheckerr).
+	// atomicInfo also holds the eventErr bit,
+	// recording whether a poll event on the fd got an error;
+	// atomicInfo is the only source of truth for that bit.
+	atomicInfo atomic.Uint32 // atomic pollInfo
+
+	// rg, wg are accessed atomically and hold g pointers.
+	// (Using atomic.Uintptr here is similar to using guintptr elsewhere.)
+	rg atomic.Uintptr // pdReady, pdWait, G waiting for read or nil
+	wg atomic.Uintptr // pdReady, pdWait, G waiting for write or nil
+
 	lock    mutex // protects the following fields
-	fd      uintptr
 	closing bool
-	everr   bool      // marks event scanning error happened
 	user    uint32    // user settable cookie
 	rseq    uintptr   // protects from stale read timers
-	rg      uintptr   // pdReady, pdWait, G waiting for read or nil. Accessed atomically.
 	rt      timer     // read deadline timer (set if rt.f != nil)
-	rd      int64     // read deadline
+	rd      int64     // read deadline (a nanotime in the future, -1 when expired)
 	wseq    uintptr   // protects from stale write timers
-	wg      uintptr   // pdReady, pdWait, G waiting for write or nil. Accessed atomically.
 	wt      timer     // write deadline timer
-	wd      int64     // write deadline
+	wd      int64     // write deadline (a nanotime in the future, -1 when expired)
 	self    *pollDesc // storage for indirect interface. See (*pollDesc).makeArg.
 }
 
+// pollInfo is the bits needed by netpollcheckerr, stored atomically,
+// mostly duplicating state that is manipulated under lock in pollDesc.
+// The one exception is the pollEventErr bit, which is maintained only
+// in the pollInfo.
+type pollInfo uint32
+
+const (
+	pollClosing = 1 << iota
+	pollEventErr
+	pollExpiredReadDeadline
+	pollExpiredWriteDeadline
+)
+
+func (i pollInfo) closing() bool              { return i&pollClosing != 0 }
+func (i pollInfo) eventErr() bool             { return i&pollEventErr != 0 }
+func (i pollInfo) expiredReadDeadline() bool  { return i&pollExpiredReadDeadline != 0 }
+func (i pollInfo) expiredWriteDeadline() bool { return i&pollExpiredWriteDeadline != 0 }
+
+// info returns the pollInfo corresponding to pd.
+func (pd *pollDesc) info() pollInfo {
+	return pollInfo(pd.atomicInfo.Load())
+}
+
+// publishInfo updates pd.atomicInfo (returned by pd.info)
+// using the other values in pd.
+// It must be called while holding pd.lock,
+// and it must be called after changing anything
+// that might affect the info bits.
+// In practice this means after changing closing
+// or changing rd or wd from < 0 to >= 0.
+func (pd *pollDesc) publishInfo() {
+	var info uint32
+	if pd.closing {
+		info |= pollClosing
+	}
+	if pd.rd < 0 {
+		info |= pollExpiredReadDeadline
+	}
+	if pd.wd < 0 {
+		info |= pollExpiredWriteDeadline
+	}
+
+	// Set all of x except the pollEventErr bit.
+	x := pd.atomicInfo.Load()
+	for !pd.atomicInfo.CompareAndSwap(x, (x&pollEventErr)|info) {
+		x = pd.atomicInfo.Load()
+	}
+}
+
+// setEventErr sets the result of pd.info().eventErr() to b.
+func (pd *pollDesc) setEventErr(b bool) {
+	x := pd.atomicInfo.Load()
+	for (x&pollEventErr != 0) != b && !pd.atomicInfo.CompareAndSwap(x, x^pollEventErr) {
+		x = pd.atomicInfo.Load()
+	}
+}
+
 type pollCache struct {
 	lock  mutex
 	first *pollDesc
@@ -147,24 +215,25 @@
 func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
 	pd := pollcache.alloc()
 	lock(&pd.lock)
-	wg := atomic.Loaduintptr(&pd.wg)
+	wg := pd.wg.Load()
 	if wg != 0 && wg != pdReady {
 		throw("runtime: blocked write on free polldesc")
 	}
-	rg := atomic.Loaduintptr(&pd.rg)
+	rg := pd.rg.Load()
 	if rg != 0 && rg != pdReady {
 		throw("runtime: blocked read on free polldesc")
 	}
 	pd.fd = fd
 	pd.closing = false
-	pd.everr = false
+	pd.setEventErr(false)
 	pd.rseq++
-	atomic.Storeuintptr(&pd.rg, 0)
+	pd.rg.Store(0)
 	pd.rd = 0
 	pd.wseq++
-	atomic.Storeuintptr(&pd.wg, 0)
+	pd.wg.Store(0)
 	pd.wd = 0
 	pd.self = pd
+	pd.publishInfo()
 	unlock(&pd.lock)
 
 	errno := netpollopen(fd, pd)
@@ -180,11 +249,11 @@
 	if !pd.closing {
 		throw("runtime: close polldesc w/o unblock")
 	}
-	wg := atomic.Loaduintptr(&pd.wg)
+	wg := pd.wg.Load()
 	if wg != 0 && wg != pdReady {
 		throw("runtime: blocked write on closing polldesc")
 	}
-	rg := atomic.Loaduintptr(&pd.rg)
+	rg := pd.rg.Load()
 	if rg != 0 && rg != pdReady {
 		throw("runtime: blocked read on closing polldesc")
 	}
@@ -209,9 +278,9 @@
 		return errcode
 	}
 	if mode == 'r' {
-		atomic.Storeuintptr(&pd.rg, 0)
+		pd.rg.Store(0)
 	} else if mode == 'w' {
-		atomic.Storeuintptr(&pd.wg, 0)
+		pd.wg.Store(0)
 	}
 	return pollNoError
 }
@@ -273,6 +342,7 @@
 	if mode == 'w' || mode == 'r'+'w' {
 		pd.wd = d
 	}
+	pd.publishInfo()
 	combo := pd.rd > 0 && pd.rd == pd.wd
 	rtf := netpollReadDeadline
 	if combo {
@@ -314,15 +384,13 @@
 		}
 	}
 	// If we set the new deadline in the past, unblock currently pending IO if any.
+	// Note that pd.publishInfo has already been called, above, immediately after modifying rd and wd.
 	var rg, wg *g
-	if pd.rd < 0 || pd.wd < 0 {
-		atomic.StorepNoWB(noescape(unsafe.Pointer(&wg)), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock
-		if pd.rd < 0 {
-			rg = netpollunblock(pd, 'r', false)
-		}
-		if pd.wd < 0 {
-			wg = netpollunblock(pd, 'w', false)
-		}
+	if pd.rd < 0 {
+		rg = netpollunblock(pd, 'r', false)
+	}
+	if pd.wd < 0 {
+		wg = netpollunblock(pd, 'w', false)
 	}
 	unlock(&pd.lock)
 	if rg != nil {
@@ -343,7 +411,7 @@
 	pd.rseq++
 	pd.wseq++
 	var rg, wg *g
-	atomic.StorepNoWB(noescape(unsafe.Pointer(&rg)), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock
+	pd.publishInfo()
 	rg = netpollunblock(pd, 'r', false)
 	wg = netpollunblock(pd, 'w', false)
 	if pd.rt.f != nil {
@@ -388,16 +456,17 @@
 }
 
 func netpollcheckerr(pd *pollDesc, mode int32) int {
-	if pd.closing {
+	info := pd.info()
+	if info.closing() {
 		return pollErrClosing
 	}
-	if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) {
+	if (mode == 'r' && info.expiredReadDeadline()) || (mode == 'w' && info.expiredWriteDeadline()) {
 		return pollErrTimeout
 	}
 	// Report an event scanning error only on a read event.
 	// An error on a write event will be captured in a subsequent
 	// write call that is able to report a more specific error.
-	if mode == 'r' && pd.everr {
+	if mode == 'r' && info.eventErr() {
 		return pollErrNotPollable
 	}
 	return pollNoError
@@ -432,28 +501,28 @@
 	// set the gpp semaphore to pdWait
 	for {
 		// Consume notification if already ready.
-		if atomic.Casuintptr(gpp, pdReady, 0) {
+		if gpp.CompareAndSwap(pdReady, 0) {
 			return true
 		}
-		if atomic.Casuintptr(gpp, 0, pdWait) {
+		if gpp.CompareAndSwap(0, pdWait) {
 			break
 		}
 
 		// Double check that this isn't corrupt; otherwise we'd loop
 		// forever.
-		if v := atomic.Loaduintptr(gpp); v != pdReady && v != 0 {
+		if v := gpp.Load(); v != pdReady && v != 0 {
 			throw("runtime: double wait")
 		}
 	}
 
 	// need to recheck error states after setting gpp to pdWait
 	// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
-	// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
+	// do the opposite: store to closing/rd/wd, publishInfo, load of rg/wg
 	if waitio || netpollcheckerr(pd, mode) == pollNoError {
 		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
 	}
 	// be careful to not lose concurrent pdReady notification
-	old := atomic.Xchguintptr(gpp, 0)
+	old := gpp.Swap(0)
 	if old > pdWait {
 		throw("runtime: corrupted polldesc")
 	}
@@ -467,7 +536,7 @@
 	}
 
 	for {
-		old := atomic.Loaduintptr(gpp)
+		old := gpp.Load()
 		if old == pdReady {
 			return nil
 		}
@@ -480,7 +549,7 @@
 		if ioready {
 			new = pdReady
 		}
-		if atomic.Casuintptr(gpp, old, new) {
+		if gpp.CompareAndSwap(old, new) {
 			if old == pdWait {
 				old = 0
 			}
@@ -508,7 +577,7 @@
 			throw("runtime: inconsistent read deadline")
 		}
 		pd.rd = -1
-		atomic.StorepNoWB(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock
+		pd.publishInfo()
 		rg = netpollunblock(pd, 'r', false)
 	}
 	var wg *g
@@ -517,7 +586,7 @@
 			throw("runtime: inconsistent write deadline")
 		}
 		pd.wd = -1
-		atomic.StorepNoWB(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock
+		pd.publishInfo()
 		wg = netpollunblock(pd, 'w', false)
 	}
 	unlock(&pd.lock)
diff --git a/src/runtime/netpoll_aix.go b/src/runtime/netpoll_aix.go
index 4590ed8..90950af 100644
--- a/src/runtime/netpoll_aix.go
+++ b/src/runtime/netpoll_aix.go
@@ -212,10 +212,7 @@
 			pfd.events &= ^_POLLOUT
 		}
 		if mode != 0 {
-			pds[i].everr = false
-			if pfd.revents == _POLLERR {
-				pds[i].everr = true
-			}
+			pds[i].setEventErr(pfd.revents == _POLLERR)
 			netpollready(&toRun, pds[i], mode)
 			n--
 		}
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index e0fb877..b7d6199 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -168,10 +168,7 @@
 		}
 		if mode != 0 {
 			pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
-			pd.everr = false
-			if ev.events == _EPOLLERR {
-				pd.everr = true
-			}
+			pd.setEventErr(ev.events == _EPOLLERR)
 			netpollready(&toRun, pd, mode)
 		}
 	}
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index 2f7f284..1694753 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -179,10 +179,7 @@
 		}
 		if mode != 0 {
 			pd := (*pollDesc)(unsafe.Pointer(ev.udata))
-			pd.everr = false
-			if ev.flags == _EV_ERROR {
-				pd.everr = true
-			}
+			pd.setEventErr(ev.flags == _EV_ERROR)
 			netpollready(&toRun, pd, mode)
 		}
 	}
diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go
index d217d5b..6e545b3d 100644
--- a/src/runtime/netpoll_solaris.go
+++ b/src/runtime/netpoll_solaris.go
@@ -158,7 +158,7 @@
 // this call, port_getn will return one and only one event for that
 // particular descriptor, so this function needs to be called again.
 func netpollupdate(pd *pollDesc, set, clear uint32) {
-	if pd.closing {
+	if pd.info().closing() {
 		return
 	}
 
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index ec34e7a..d0b7a16 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -255,6 +255,8 @@
 // so I can't see them ever moving. If we did want to start moving data
 // in the GC, we'd need to allocate the goroutine structs from an
 // alternate arena. Using guintptr doesn't make that problem any worse.
+// Note that pollDesc.rg, pollDesc.wg also store g in uintptr form,
+// so they would need to be updated too if g's start moving.
 type guintptr uintptr
 
 //go:nosplit