runtime: clean up new lock2 structure

Simplify some flow control, as suggested on https://go.dev/cl/620435.

The MutexCapture microbenchmark shows a bit of throughput improvement at
moderate levels of contention, and little change to capture and
starvation. (Note that the capture and starvation figures below are in
terms of power-of-two buckets multiplied by throughput, so they either
follow similar patterns or move by a factor of two.)

For #68578

goos: linux
goarch: amd64
pkg: runtime
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
                │     old      │                 new                  │
                │    sec/op    │    sec/op     vs base                │
MutexCapture      18.21n ±  0%   18.35n ±  0%   +0.77% (p=0.000 n=10)
MutexCapture-2    21.46n ±  8%   21.05n ± 12%        ~ (p=0.796 n=10)
MutexCapture-3    22.56n ±  9%   22.59n ± 18%        ~ (p=0.631 n=10)
MutexCapture-4    22.85n ±  5%   22.74n ±  2%        ~ (p=0.565 n=10)
MutexCapture-5    22.84n ±  5%   22.50n ± 14%        ~ (p=0.912 n=10)
MutexCapture-6    23.33n ± 14%   22.22n ±  3%   -4.78% (p=0.004 n=10)
MutexCapture-7    27.04n ± 14%   23.78n ± 15%        ~ (p=0.089 n=10)
MutexCapture-8    25.44n ± 10%   23.03n ±  6%   -9.48% (p=0.004 n=10)
MutexCapture-9    25.56n ±  7%   24.39n ± 11%        ~ (p=0.218 n=10)
MutexCapture-10   26.77n ± 10%   24.00n ±  7%  -10.33% (p=0.023 n=10)
MutexCapture-11   27.02n ±  7%   24.55n ± 15%   -9.18% (p=0.035 n=10)
MutexCapture-12   26.71n ±  8%   24.96n ±  8%        ~ (p=0.148 n=10)
MutexCapture-13   25.58n ±  4%   25.82n ±  5%        ~ (p=0.271 n=10)
MutexCapture-14   26.86n ±  6%   25.91n ±  7%        ~ (p=0.529 n=10)
MutexCapture-15   25.12n ± 13%   26.16n ±  4%        ~ (p=0.353 n=10)
MutexCapture-16   26.18n ±  4%   26.21n ±  9%        ~ (p=0.838 n=10)
MutexCapture-17   26.04n ±  4%   25.85n ±  5%        ~ (p=0.363 n=10)
MutexCapture-18   26.02n ±  7%   25.93n ±  5%        ~ (p=0.853 n=10)
MutexCapture-19   25.67n ±  5%   26.21n ±  4%        ~ (p=0.631 n=10)
MutexCapture-20   25.50n ±  6%   25.99n ±  8%        ~ (p=0.404 n=10)
geomean           24.73n         24.02n         -2.88%

                │      old       │                  new                   │
                │ sec/streak-p90 │ sec/streak-p90  vs base                │
MutexCapture        76.36m ±  0%    76.96m ±   0%   +0.79% (p=0.000 n=10)
MutexCapture-2     10.609µ ± 50%    5.390µ ± 119%        ~ (p=0.579 n=10)
MutexCapture-3      5.936µ ± 93%    5.782µ ±  18%        ~ (p=0.684 n=10)
MutexCapture-4      5.849µ ±  5%    5.820µ ±   2%        ~ (p=0.579 n=10)
MutexCapture-5      5.849µ ±  5%    5.759µ ±  14%        ~ (p=0.912 n=10)
MutexCapture-6      5.975µ ± 14%    5.687µ ±   3%   -4.81% (p=0.004 n=10)
MutexCapture-7      6.921µ ± 14%    6.086µ ±  18%        ~ (p=0.165 n=10)
MutexCapture-8      6.512µ ± 10%    5.894µ ±   6%   -9.50% (p=0.004 n=10)
MutexCapture-9      6.544µ ±  7%    6.245µ ±  11%        ~ (p=0.218 n=10)
MutexCapture-10     6.962µ ± 11%    6.144µ ±   7%  -11.76% (p=0.023 n=10)
MutexCapture-11     6.938µ ±  7%    6.284µ ± 130%        ~ (p=0.190 n=10)
MutexCapture-12     6.838µ ±  8%    6.408µ ±  13%        ~ (p=0.404 n=10)
MutexCapture-13     6.549µ ±  4%    6.608µ ±   5%        ~ (p=0.271 n=10)
MutexCapture-14     6.877µ ±  8%    6.634µ ±   7%        ~ (p=0.436 n=10)
MutexCapture-15     6.433µ ± 13%    6.697µ ±   4%        ~ (p=0.247 n=10)
MutexCapture-16     6.702µ ± 10%    6.711µ ± 116%        ~ (p=0.796 n=10)
MutexCapture-17     6.730µ ±  3%    6.619µ ±   5%        ~ (p=0.225 n=10)
MutexCapture-18     6.663µ ±  7%    6.716µ ±  13%        ~ (p=0.853 n=10)
MutexCapture-19     6.570µ ±  5%    6.710µ ±   4%        ~ (p=0.529 n=10)
MutexCapture-20     6.528µ ±  6%    6.775µ ±  11%        ~ (p=0.247 n=10)
geomean             10.66µ          10.00µ          -6.13%

                │      old       │                  new                   │
                │ sec/starve-p90 │ sec/starve-p90  vs base                │
MutexCapture-2    10.609µ ±  50%    5.390µ ± 119%        ~ (p=0.579 n=10)
MutexCapture-3     184.8µ ±  91%    183.9µ ±  48%        ~ (p=0.436 n=10)
MutexCapture-4     388.8µ ± 270%    375.6µ ± 280%        ~ (p=0.436 n=10)
MutexCapture-5     807.2µ ±  83%   2880.9µ ±  85%        ~ (p=0.105 n=10)
MutexCapture-6     2.272m ±  61%    2.173m ±  34%        ~ (p=0.280 n=10)
MutexCapture-7     1.351m ± 125%    2.990m ±  70%        ~ (p=0.393 n=10)
MutexCapture-8     3.328m ±  97%    3.064m ±  96%        ~ (p=0.739 n=10)
MutexCapture-9     3.526m ±  91%    3.081m ±  47%  -12.62% (p=0.015 n=10)
MutexCapture-10    3.641m ±  86%    3.228m ±  90%  -11.34% (p=0.005 n=10)
MutexCapture-11    3.324m ± 109%    3.190m ±  71%        ~ (p=0.481 n=10)
MutexCapture-12    3.519m ±  77%    3.200m ± 106%        ~ (p=0.393 n=10)
MutexCapture-13    3.353m ±  91%    3.368m ±  99%        ~ (p=0.853 n=10)
MutexCapture-14    3.314m ± 101%    3.396m ± 286%        ~ (p=0.353 n=10)
MutexCapture-15    3.534m ±  83%    3.397m ±  91%        ~ (p=0.739 n=10)
MutexCapture-16    3.485m ±  90%    3.436m ± 116%        ~ (p=0.853 n=10)
MutexCapture-17    6.516m ±  48%    3.452m ±  88%        ~ (p=0.190 n=10)
MutexCapture-18    6.645m ± 105%    3.439m ± 108%        ~ (p=0.218 n=10)
MutexCapture-19    6.521m ±  46%    4.907m ±  42%        ~ (p=0.529 n=10)
MutexCapture-20    6.532m ±  47%    3.516m ±  89%        ~ (p=0.089 n=10)
geomean            1.919m           1.783m          -7.06%

Change-Id: I36106e1baf8afd132f1568748d1b83b797fa260e
Reviewed-on: https://go-review.googlesource.com/c/go/+/629415
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Rhys Hiltner <rhys.hiltner@gmail.com>
diff --git a/src/runtime/lock_spinbit.go b/src/runtime/lock_spinbit.go
index 1f9f289..7e84f3e 100644
--- a/src/runtime/lock_spinbit.go
+++ b/src/runtime/lock_spinbit.go
@@ -159,9 +159,8 @@
 
 	k8 := key8(&l.key)
 
-	var v8 uint8
 	// Speculative grab for lock.
-	v8 = atomic.Xchg8(k8, mutexLocked)
+	v8 := atomic.Xchg8(k8, mutexLocked)
 	if v8&mutexLocked == 0 {
 		if v8&mutexSleeping != 0 {
 			atomic.Or8(k8, mutexSleeping)
@@ -183,11 +182,13 @@
 	v := atomic.Loaduintptr(&l.key)
 tryAcquire:
 	for i := 0; ; i++ {
-		for v&mutexLocked == 0 {
+		if v&mutexLocked == 0 {
 			if weSpin {
-				next := (v &^ mutexMMask) | (v & (mutexMMask &^ mutexSpinning)) | mutexLocked
-				if next&^mutexMMask != 0 {
-					next |= mutexSleeping
+				next := (v &^ mutexSpinning) | mutexSleeping | mutexLocked
+				if next&^mutexMMask == 0 {
+					// The fast-path Xchg8 may have cleared mutexSleeping. Fix
+					// the hint so unlock2 knows when to use its slow path.
+					next = next &^ mutexSleeping
 				}
 				if atomic.Casuintptr(&l.key, v, next) {
 					timer.end()
@@ -201,6 +202,7 @@
 				}
 			}
 			v = atomic.Loaduintptr(&l.key)
+			continue tryAcquire
 		}
 
 		if !weSpin && v&mutexSpinning == 0 && atomic.Casuintptr(&l.key, v, v|mutexSpinning) {
@@ -214,35 +216,36 @@
 				v = atomic.Loaduintptr(&l.key)
 				continue tryAcquire
 			} else if i < spin+mutexPassiveSpinCount {
-				osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268
+				osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268.
 				v = atomic.Loaduintptr(&l.key)
 				continue tryAcquire
 			}
 		}
 
 		// Go to sleep
-		for v&mutexLocked != 0 {
-			// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
-			gp.m.mWaitList.next = mutexWaitListHead(v)
-
-			// Pack a (partial) pointer to this M with the current lock state bits
-			next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
-			if weSpin { // If we were spinning, prepare to retire
-				next = next &^ mutexSpinning
-			}
-
-			if atomic.Casuintptr(&l.key, v, next) {
-				weSpin = false
-				// We've pushed ourselves onto the stack of waiters. Wait.
-				semasleep(-1)
-				atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
-				gp.m.mWaitList.next = 0
-				i = 0
-				v = atomic.Loaduintptr(&l.key)
-				continue tryAcquire
-			}
-			v = atomic.Loaduintptr(&l.key)
+		if v&mutexLocked == 0 {
+			throw("runtime·lock: sleeping while lock is available")
 		}
+
+		// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
+		gp.m.mWaitList.next = mutexWaitListHead(v)
+
+		// Pack a (partial) pointer to this M with the current lock state bits
+		next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
+		if weSpin { // If we were spinning, prepare to retire
+			next = next &^ mutexSpinning
+		}
+
+		if atomic.Casuintptr(&l.key, v, next) {
+			weSpin = false
+			// We've pushed ourselves onto the stack of waiters. Wait.
+			semasleep(-1)
+			atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
+			i = 0
+		}
+
+		gp.m.mWaitList.next = 0
+		v = atomic.Loaduintptr(&l.key)
 	}
 }