runtime: static lock ranking for the runtime (enabled by GOEXPERIMENT)

I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.

Static lock ranking establishes a documented total ordering among locks,
and then reports an error if that order is violated. A violation is
reported both when an actual deadlock occurs (a sequence of locks acquired
in conflicting orders) and when only one side of a possible deadlock runs.
As long as the lock ordering is followed, lock-ordering deadlocks cannot
happen.
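
For example, with two hypothetical ranks lockRankA < lockRankB (these names
are illustrative, not part of this CL), the checker allows acquisition in
increasing rank order and throws on the reverse order, even if no deadlock
actually occurs on that particular run:

    var lockA, lockB mutex

    func rankInitExample() {
    	lockInit(&lockA, lockRankA) // hypothetical ranks, lockRankA < lockRankB
    	lockInit(&lockB, lockRankB)
    }

    func orderedOK() {
    	lock(&lockA)
    	lock(&lockB) // rank increases: allowed (given a lockPartialOrder entry)
    	unlock(&lockB)
    	unlock(&lockA)
    }

    func orderedBad() {
    	lock(&lockB)
    	lock(&lockA) // rank decreases: throws "lock ordering problem"
    	unlock(&lockA)
    	unlock(&lockB)
    }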

Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.

The constants at the top of runtime/lockrank.go show the static lock
ranking that I ended up with, along with some comments. This serves as
documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.

I also added an array lockPartialOrder[], which records and enforces the
current partial ordering among the locks (which is embedded within the
total ordering). It is more specific about the actual dependencies among
locks.
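
Roughly, the check done on each acquire amounts to the following lookup (a
simplified sketch of checkRanks in lockrank_on.go; the function name here is
illustrative, and prevRank is the rank of the most recently acquired lock
still held):

    func partialOrderAllows(prevRank, rank lockRank) bool {
    	if rank == lockRankLeafRank {
    		// A leaf lock may follow any lock except another leaf lock.
    		return prevRank < lockRankLeafRank
    	}
    	// Otherwise prevRank must be listed as allowed "above" rank.
    	for _, entry := range lockPartialOrder[rank] {
    		if entry == prevRank {
    			return true
    		}
    	}
    	return false
    }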

I don't try to check the ranking within a lock class whose locks can be
acquired at the same time (e.g. when multiple hchan locks are held
simultaneously).

Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a leaf
lock (a very high rank), which eliminates the need to do anything for a
bunch of locks (including all architecture-dependent locks). For two
locks, root.lock and notifyList.lock (both only in runtime/sema.go), there
is no easy place to do the initialization, so instead I pass the lock rank
with the lock calls.
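
For example (excerpted from the diff below):

    // Rank set once where the lock is created (makechan in chan.go);
    // plain lock()/unlock() calls are used afterwards.
    lockInit(&c.lock, lockRankHchan)

    // No convenient initialization point (runtime/sema.go), so the rank
    // is supplied on each acquire instead.
    lockWithRank(&root.lock, lockRankRoot)
    lockWithRank(&l.lock, lockRankNotifyList)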

For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.

Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.

To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
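
Concretely, the build tag selects between a pair of files in the diff: the
default lockrank_off.go embeds an empty struct in mutex, while
lockrank_on.go carries the rank (condensed excerpt of the two files):

    // lockrank_off.go (+build !goexperiment.staticlockranking):
    // mutex does not grow.
    type lockRankStruct struct{}

    // lockrank_on.go (+build goexperiment.staticlockranking):
    // every mutex carries its rank, plus padding for alignment.
    type lockRankStruct struct {
    	rank lockRank
    	pad  int
    }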

Fixes #38029

Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/cmd/internal/objabi/stack.go b/src/cmd/internal/objabi/stack.go
index 7320dbf..05a1d4a 100644
--- a/src/cmd/internal/objabi/stack.go
+++ b/src/cmd/internal/objabi/stack.go
@@ -18,7 +18,7 @@
 )
 
 // Initialize StackGuard and StackLimit according to target system.
-var StackGuard = 896*stackGuardMultiplier() + StackSystem
+var StackGuard = 928*stackGuardMultiplier() + StackSystem
 var StackLimit = StackGuard - StackSystem - StackSmall
 
 // stackGuardMultiplier returns a multiplier to apply to the default
diff --git a/src/cmd/internal/objabi/util.go b/src/cmd/internal/objabi/util.go
index 4f8ba3d..8d05a6b 100644
--- a/src/cmd/internal/objabi/util.go
+++ b/src/cmd/internal/objabi/util.go
@@ -152,9 +152,10 @@
 }
 
 var (
-	framepointer_enabled     int = 1
-	Fieldtrack_enabled       int
-	Preemptibleloops_enabled int
+	framepointer_enabled      int = 1
+	Fieldtrack_enabled        int
+	Preemptibleloops_enabled  int
+	Staticlockranking_enabled int
 )
 
 // Toolchain experiments.
@@ -168,6 +169,7 @@
 	{"fieldtrack", &Fieldtrack_enabled},
 	{"framepointer", &framepointer_enabled},
 	{"preemptibleloops", &Preemptibleloops_enabled},
+	{"staticlockranking", &Staticlockranking_enabled},
 }
 
 var defaultExpstring = Expstring()
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 1d4599e..f6f4ffd 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -109,6 +109,7 @@
 	c.elemsize = uint16(elem.size)
 	c.elemtype = elem
 	c.dataqsiz = uint(size)
+	lockInit(&c.lock, lockRankHchan)
 
 	if debugChan {
 		print("makechan: chan=", c, "; elemsize=", elem.size, "; dataqsiz=", size, "\n")
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 6737979..4c1150a 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -790,6 +790,7 @@
 
 	// We've got an entry, so initialize the pageAlloc.
 	p.init(new(mutex), nil)
+	lockInit(p.mheapLock, lockRankMheap)
 	p.test = true
 
 	for i, init := range chunks {
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index e4b0b6d..0504b89 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -243,6 +243,7 @@
 }
 
 func itabsinit() {
+	lockInit(&itabLock, lockRankItab)
 	lock(&itabLock)
 	for _, md := range activeModules() {
 		for _, i := range md.itablinks {
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 92873f2..b0395d6 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -44,6 +44,10 @@
 }
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	gp := getg()
 
 	if gp.m.locks < 0 {
@@ -104,6 +108,10 @@
 }
 
 func unlock(l *mutex) {
+	lockRankRelease(l)
+}
+
+func unlock2(l *mutex) {
 	v := atomic.Xchg(key32(&l.key), mutex_unlocked)
 	if v == mutex_unlocked {
 		throw("unlock of unlocked lock")
diff --git a/src/runtime/lock_js.go b/src/runtime/lock_js.go
index 3168c86..7a720f4 100644
--- a/src/runtime/lock_js.go
+++ b/src/runtime/lock_js.go
@@ -26,6 +26,10 @@
 )
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	if l.key == mutex_locked {
 		// js/wasm is single-threaded so we should never
 		// observe this.
@@ -40,6 +44,10 @@
 }
 
 func unlock(l *mutex) {
+	lockRankRelease(l)
+}
+
+func unlock2(l *mutex) {
 	if l.key == mutex_unlocked {
 		throw("unlock of unlocked lock")
 	}
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index af9517d..d79520d 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -33,6 +33,10 @@
 )
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	gp := getg()
 	if gp.m.locks < 0 {
 		throw("runtime·lock: lock count")
@@ -89,9 +93,13 @@
 	}
 }
 
+func unlock(l *mutex) {
+	lockRankRelease(l)
+}
+
 //go:nowritebarrier
 // We might not be holding a p in this code.
-func unlock(l *mutex) {
+func unlock2(l *mutex) {
 	gp := getg()
 	var mp *m
 	for {
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
new file mode 100644
index 0000000..4b7273a
--- /dev/null
+++ b/src/runtime/lockrank.go
@@ -0,0 +1,234 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file records the static ranks of the locks in the runtime. If a lock
+// is not given a rank, then it is assumed to be a leaf lock, which means no other
+// lock can be acquired while it is held. Therefore, leaf locks do not need to be
+// given an explicit rank. We list all of the architecture-independent leaf locks
+// for documentation purposes, but don't list any of the architecture-dependent
+// locks (which are all leaf locks). debugLock is ignored for ranking, since it is used
+// when printing out lock ranking errors.
+//
+// lockInit(l *mutex, rank lockRank) is used to set the rank of a lock before it is used.
+// If there is no clear place to initialize a lock, then the rank of a lock can be
+// specified during the lock call itself via lockWithRank(l *mutex, rank lockRank).
+//
+// Besides the static lock ranking (which is a total ordering of the locks), we
+// also represent and enforce the actual partial order among the locks in the
+// arcs[] array below. That is, if it is possible that lock B can be acquired when
+// lock A is the previous acquired lock that is still held, then there should be
+// an entry for A in arcs[B][]. We will currently fail not only if the total order
+// (the lock ranking) is violated, but also if there is a missing entry in the
+// partial order.
+
+package runtime
+
+type lockRank int
+
+// Constants representing the lock rank of the architecture-independent locks in
+// the runtime.
+const (
+	lockRankDummy lockRank = iota
+
+	// Locks held above sched
+	lockRankScavenge
+	lockRankForcegc
+	lockRankSweepWaiters
+	lockRankAssistQueue
+	lockRankCpuprof
+	lockRankSweep
+
+	lockRankSched
+	lockRankDeadlock
+	lockRankPanic
+	lockRankAllg
+	lockRankAllp
+	lockRankPollDesc
+
+	lockRankTimers // Multiple timers locked simultaneously in destroy()
+	lockRankItab
+	lockRankReflectOffs
+	lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs()
+	lockRankFin
+	lockRankNotifyList
+	lockRankTraceBuf
+	lockRankTraceStrings
+	lockRankMspanSpecial
+	lockRankProf
+	lockRankGcBitsArenas
+	lockRankRoot
+	lockRankTrace
+	lockRankTraceStackTab
+	lockRankNetpollInit
+
+	lockRankRwmutexW
+	lockRankRwmutexR
+
+	lockRankMcentral
+	lockRankSpine
+	lockRankStackpool
+	lockRankStackLarge
+	lockRankDefer
+	lockRankSudog
+
+	// Memory-related non-leaf locks
+	lockRankWbufSpans
+	lockRankMheap
+
+	// Memory-related leaf locks
+	lockRankMheapSpecial
+	lockRankGlobalAlloc
+
+	// Other leaf locks
+	lockRankGFree
+
+	// Leaf locks with no dependencies, so these constants are not actually used anywhere.
+	// There are other architecture-dependent leaf locks as well.
+	lockRankNewmHandoff
+	lockRankDebugPtrmask
+	lockRankFaketimeState
+	lockRankTicks
+	lockRankRaceFini
+	lockRankPollCache
+	lockRankDebug
+)
+
+// lockRankLeafRank is the rank of a lock that does not have a declared rank, and hence is
+// a leaf lock.
+const lockRankLeafRank lockRank = 1000
+
+// lockNames gives the names associated with each of the above ranks
+var lockNames = []string{
+	lockRankDummy: "",
+
+	lockRankScavenge:     "scavenge",
+	lockRankForcegc:      "forcegc",
+	lockRankSweepWaiters: "sweepWaiters",
+	lockRankAssistQueue:  "assistQueue",
+	lockRankCpuprof:      "cpuprof",
+	lockRankSweep:        "sweep",
+
+	lockRankSched:    "sched",
+	lockRankDeadlock: "deadlock",
+	lockRankPanic:    "panic",
+	lockRankAllg:     "allg",
+	lockRankAllp:     "allp",
+	lockRankPollDesc: "pollDesc",
+
+	lockRankTimers:      "timers",
+	lockRankItab:        "itab",
+	lockRankReflectOffs: "reflectOffs",
+
+	lockRankHchan:         "hchan",
+	lockRankFin:           "fin",
+	lockRankNotifyList:    "notifyList",
+	lockRankTraceBuf:      "traceBuf",
+	lockRankTraceStrings:  "traceStrings",
+	lockRankMspanSpecial:  "mspanSpecial",
+	lockRankProf:          "prof",
+	lockRankGcBitsArenas:  "gcBitsArenas",
+	lockRankRoot:          "root",
+	lockRankTrace:         "trace",
+	lockRankTraceStackTab: "traceStackTab",
+	lockRankNetpollInit:   "netpollInit",
+
+	lockRankRwmutexW: "rwmutexW",
+	lockRankRwmutexR: "rwmutexR",
+
+	lockRankMcentral:   "mcentral",
+	lockRankSpine:      "spine",
+	lockRankStackpool:  "stackpool",
+	lockRankStackLarge: "stackLarge",
+	lockRankDefer:      "defer",
+	lockRankSudog:      "sudog",
+
+	lockRankWbufSpans: "wbufSpans",
+	lockRankMheap:     "mheap",
+
+	lockRankMheapSpecial: "mheapSpecial",
+	lockRankGlobalAlloc:  "globalAlloc.mutex",
+
+	lockRankGFree: "gFree",
+
+	lockRankNewmHandoff:   "newmHandoff.lock",
+	lockRankDebugPtrmask:  "debugPtrmask.lock",
+	lockRankFaketimeState: "faketimeState.lock",
+	lockRankTicks:         "ticks.lock",
+	lockRankRaceFini:      "raceFiniLock",
+	lockRankPollCache:     "pollCache.lock",
+	lockRankDebug:         "debugLock",
+}
+
+func (rank lockRank) String() string {
+	if rank == 0 {
+		return "UNKNOWN"
+	}
+	if rank == lockRankLeafRank {
+		return "LEAF"
+	}
+	return lockNames[rank]
+}
+
+// lockPartialOrder is a partial order among the various lock types, listing the immediate
+// ordering that has actually been observed in the runtime. Each entry (which
+// corresponds to a particular lock rank) specifies the list of locks that can
+// already be held immediately "above" it.
+//
+// So, for example, the lockRankSched entry shows that all the locks preceding it in
+// rank can actually be held. The fin lock shows that only the sched, allg, timers, or
+// hchan lock can be held immediately above it when it is acquired.
+var lockPartialOrder [][]lockRank = [][]lockRank{
+	lockRankDummy:         {},
+	lockRankScavenge:      {},
+	lockRankForcegc:       {},
+	lockRankSweepWaiters:  {},
+	lockRankAssistQueue:   {},
+	lockRankCpuprof:       {},
+	lockRankSweep:         {},
+	lockRankSched:         {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
+	lockRankDeadlock:      {lockRankDeadlock},
+	lockRankPanic:         {lockRankDeadlock},
+	lockRankAllg:          {lockRankSched, lockRankPanic},
+	lockRankAllp:          {lockRankSched},
+	lockRankPollDesc:      {},
+	lockRankTimers:        {lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
+	lockRankItab:          {},
+	lockRankReflectOffs:   {lockRankItab},
+	lockRankHchan:         {lockRankScavenge, lockRankSweep, lockRankHchan},
+	lockRankFin:           {lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan},
+	lockRankNotifyList:    {},
+	lockRankTraceBuf:      {},
+	lockRankTraceStrings:  {lockRankTraceBuf},
+	lockRankMspanSpecial:  {lockRankScavenge, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProf:          {lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankGcBitsArenas:  {lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankRoot:          {},
+	lockRankTrace:         {lockRankScavenge, lockRankAssistQueue, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankSweep},
+	lockRankTraceStackTab: {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
+	lockRankNetpollInit:   {lockRankTimers},
+
+	lockRankRwmutexW: {},
+	lockRankRwmutexR: {lockRankRwmutexW},
+
+	lockRankMcentral:     {lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankSpine:        {lockRankScavenge, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankStackpool:    {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankMcentral, lockRankSpine},
+	lockRankStackLarge:   {lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral},
+	lockRankDefer:        {},
+	lockRankSudog:        {lockRankNotifyList, lockRankHchan},
+	lockRankWbufSpans:    {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankDefer, lockRankSudog},
+	lockRankMheap:        {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
+	lockRankMheapSpecial: {lockRankScavenge, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankGlobalAlloc:  {lockRankSpine, lockRankMheap},
+
+	lockRankGFree: {lockRankSched},
+
+	lockRankNewmHandoff:   {},
+	lockRankDebugPtrmask:  {},
+	lockRankFaketimeState: {},
+	lockRankTicks:         {},
+	lockRankRaceFini:      {},
+	lockRankPollCache:     {},
+	lockRankDebug:         {},
+}
diff --git a/src/runtime/lockrank_off.go b/src/runtime/lockrank_off.go
new file mode 100644
index 0000000..fcfcff5
--- /dev/null
+++ b/src/runtime/lockrank_off.go
@@ -0,0 +1,26 @@
+// +build !goexperiment.staticlockranking
+
+package runtime
+
+// lockRankStruct is embedded in mutex, but is empty when staticlockranking is
+// disabled (the default).
+type lockRankStruct struct {
+}
+
+func lockInit(l *mutex, rank lockRank) {
+}
+
+func getLockRank(l *mutex) lockRank {
+	return 0
+}
+
+func lockRankRelease(l *mutex) {
+	unlock2(l)
+}
+
+func lockWithRank(l *mutex, rank lockRank) {
+	lock2(l)
+}
+
+func lockWithRankMayAcquire(l *mutex, rank lockRank) {
+}
diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go
new file mode 100644
index 0000000..fc72a06
--- /dev/null
+++ b/src/runtime/lockrank_on.go
@@ -0,0 +1,160 @@
+// +build goexperiment.staticlockranking
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+// lockRankStruct is embedded in mutex
+type lockRankStruct struct {
+	// static lock ranking of the lock
+	rank lockRank
+	// pad field to make sure lockRankStruct is a multiple of 8 bytes, even on
+	// 32-bit systems.
+	pad int
+}
+
+// init checks that the partial order in lockPartialOrder fits within the total
+// order determined by the order of the lockRank constants.
+func init() {
+	for rank, list := range lockPartialOrder {
+		for _, entry := range list {
+			if entry > lockRank(rank) {
+				println("lockPartialOrder row", lockRank(rank).String(), "entry", entry.String())
+				throw("lockPartialOrder table is inconsistent with total lock ranking order")
+			}
+		}
+	}
+}
+
+func lockInit(l *mutex, rank lockRank) {
+	l.rank = rank
+}
+
+func getLockRank(l *mutex) lockRank {
+	return l.rank
+}
+
+// The following functions are the entry-points to record lock
+// operations.
+// All of these are nosplit and switch to the system stack immediately
+// to avoid stack growths. Since a stack growth could itself have lock
+// operations, this prevents re-entrant calls.
+
+// lockWithRank is like lock(l), but allows the caller to specify a lock rank
+// when acquiring a non-static lock.
+//go:nosplit
+func lockWithRank(l *mutex, rank lockRank) {
+	if l == &debuglock {
+		// debuglock is only used for println/printlock(). Don't do lock rank
+		// recording for it, since print/println are used when printing
+		// out a lock ordering problem below.
+		lock2(l)
+		return
+	}
+	if rank == 0 {
+		rank = lockRankLeafRank
+	}
+	gp := getg()
+	// Log the new class.
+	systemstack(func() {
+		i := gp.m.locksHeldLen
+		if i >= len(gp.m.locksHeld) {
+			throw("too many locks held concurrently for rank checking")
+		}
+		gp.m.locksHeld[i].rank = rank
+		gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l))
+		gp.m.locksHeldLen++
+
+		// i is the index of the lock being acquired
+		if i > 0 {
+			checkRanks(gp, gp.m.locksHeld[i-1].rank, rank)
+		}
+		lock2(l)
+	})
+}
+
+func checkRanks(gp *g, prevRank, rank lockRank) {
+	rankOK := false
+	// If rank < prevRank, then we definitely have a rank error
+	if prevRank <= rank {
+		if rank == lockRankLeafRank {
+			// If new lock is a leaf lock, then the preceding lock can
+			// be anything except another leaf lock.
+			rankOK = prevRank < lockRankLeafRank
+		} else {
+			// We've already verified the total lock ranking, but we
+			// also enforce the partial ordering specified by
+			// lockPartialOrder as well. Two locks with the same rank
+			// can only be acquired at the same time if explicitly
+			// listed in the lockPartialOrder table.
+			list := lockPartialOrder[rank]
+			for _, entry := range list {
+				if entry == prevRank {
+					rankOK = true
+					break
+				}
+			}
+		}
+	}
+	if !rankOK {
+		printlock()
+		println(gp.m.procid, " ======")
+		for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] {
+			println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr))
+		}
+		throw("lock ordering problem")
+	}
+}
+
+//go:nosplit
+func lockRankRelease(l *mutex) {
+	if l == &debuglock {
+		// debuglock is only used for print/println. Don't do lock rank
+		// recording for it, since print/println are used when printing
+		// out a lock ordering problem below.
+		unlock2(l)
+		return
+	}
+	gp := getg()
+	systemstack(func() {
+		found := false
+		for i := gp.m.locksHeldLen - 1; i >= 0; i-- {
+			if gp.m.locksHeld[i].lockAddr == uintptr(unsafe.Pointer(l)) {
+				found = true
+				copy(gp.m.locksHeld[i:gp.m.locksHeldLen-1], gp.m.locksHeld[i+1:gp.m.locksHeldLen])
+				gp.m.locksHeldLen--
+			}
+		}
+		if !found {
+			println(gp.m.procid, ":", l.rank.String(), l.rank, l)
+			throw("unlock without matching lock acquire")
+		}
+		unlock2(l)
+	})
+}
+
+//go:nosplit
+func lockWithRankMayAcquire(l *mutex, rank lockRank) {
+	gp := getg()
+	if gp.m.locksHeldLen == 0 {
+		// No possibility of a lock ordering problem if no other locks are held
+		return
+	}
+
+	systemstack(func() {
+		i := gp.m.locksHeldLen
+		if i >= len(gp.m.locksHeld) {
+			throw("too many locks held concurrently for rank checking")
+		}
+		// Temporarily add this lock to the locksHeld list, so
+		// checkRanks() will print out the list, including this lock, if there
+		// is a lock ordering problem.
+		gp.m.locksHeld[i].rank = rank
+		gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l))
+		gp.m.locksHeldLen++
+		checkRanks(gp, gp.m.locksHeld[i-1].rank, rank)
+		gp.m.locksHeldLen--
+	})
+}
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 346d7f4..5a0d85f 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -469,6 +469,9 @@
 	// Initialize the heap.
 	mheap_.init()
 	mcache0 = allocmcache()
+	lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas)
+	lockInit(&proflock, lockRankProf)
+	lockInit(&globalAlloc.mutex, lockRankGlobalAlloc)
 
 	// Create initial arena growth hints.
 	if sys.PtrSize == 8 {
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 78a3ae6..fd0035b 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -34,6 +34,7 @@
 	c.spanclass = spc
 	c.nonempty.init()
 	c.empty.init()
+	lockInit(&c.lock, lockRankMcentral)
 }
 
 // Allocate a span to use in an mcache.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 7a8ab53..08159e2 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -191,6 +191,9 @@
 
 	work.startSema = 1
 	work.markDoneSema = 1
+	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
+	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
+	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 }
 
 func readgogc() int32 {
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index c262509..5a85505 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -225,6 +225,7 @@
 func bgscavenge(c chan int) {
 	scavenge.g = getg()
 
+	lockInit(&scavenge.lock, lockRankScavenge)
 	lock(&scavenge.lock)
 	scavenge.parked = true
 
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index fd9bf8f..c075f66 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -64,6 +64,7 @@
 func bgsweep(c chan int) {
 	sweep.g = getg()
 
+	lockInit(&sweep.lock, lockRankSweep)
 	lock(&sweep.lock)
 	sweep.parked = true
 	c <- 1
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 927b06c..4610165 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -178,6 +178,10 @@
 
 	flushed := false
 	wbuf := w.wbuf1
+	// Record that this may acquire the wbufSpans or heap lock to
+	// allocate a workbuf.
+	lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans)
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if wbuf == nil {
 		w.init()
 		wbuf = w.wbuf1
@@ -423,6 +427,10 @@
 			b.checkempty()
 		}
 	}
+	// Record that this may acquire the wbufSpans or heap lock to
+	// allocate a workbuf.
+	lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans)
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if b == nil {
 		// Allocate more workbufs.
 		var s *mspan
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 86ecf33..9774dfb 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -670,6 +670,10 @@
 
 // Initialize the heap.
 func (h *mheap) init() {
+	lockInit(&h.lock, lockRankMheap)
+	lockInit(&h.sweepSpans[0].spineLock, lockRankSpine)
+	lockInit(&h.sweepSpans[1].spineLock, lockRankSpine)
+
 	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
 	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
 	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
@@ -1474,6 +1478,7 @@
 	span.allocBits = nil
 	span.gcmarkBits = nil
 	span.state.set(mSpanDead)
+	lockInit(&span.speciallock, lockRankMspanSpecial)
 }
 
 func (span *mspan) inList() bool {
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index a332045..34ea82a 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -116,6 +116,7 @@
 
 func netpollGenericInit() {
 	if atomic.Load(&netpollInited) == 0 {
+		lockInit(&netpollInitLock, lockRankNetpollInit)
 		lock(&netpollInitLock)
 		if netpollInited == 0 {
 			netpollinit()
@@ -542,6 +543,7 @@
 	}
 	pd := c.first
 	c.first = pd.link
+	lockInit(&pd.lock, lockRankPollDesc)
 	unlock(&c.lock)
 	return pd
 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index c7097e2..202c300 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -245,6 +245,7 @@
 
 func forcegchelper() {
 	forcegc.g = getg()
+	lockInit(&forcegc.lock, lockRankForcegc)
 	for {
 		lock(&forcegc.lock)
 		if forcegc.idle != 0 {
@@ -531,6 +532,21 @@
 //
 // The new G calls runtime·main.
 func schedinit() {
+	lockInit(&sched.lock, lockRankSched)
+	lockInit(&sched.deferlock, lockRankDefer)
+	lockInit(&sched.sudoglock, lockRankSudog)
+	lockInit(&deadlock, lockRankDeadlock)
+	lockInit(&paniclk, lockRankPanic)
+	lockInit(&allglock, lockRankAllg)
+	lockInit(&allpLock, lockRankAllp)
+	lockInit(&reflectOffs.lock, lockRankReflectOffs)
+	lockInit(&finlock, lockRankFin)
+	lockInit(&trace.bufLock, lockRankTraceBuf)
+	lockInit(&trace.stringsLock, lockRankTraceStrings)
+	lockInit(&trace.lock, lockRankTrace)
+	lockInit(&cpuprof.lock, lockRankCpuprof)
+	lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+
 	// raceinit must be the first call to race detector.
 	// In particular, it must be done before mallocinit below calls racemapshadow.
 	_g_ := getg()
@@ -4120,6 +4136,7 @@
 			pp.raceprocctx = raceproccreate()
 		}
 	}
+	lockInit(&pp.timersLock, lockRankTimers)
 }
 
 // destroy releases all of the resources associated with pp and
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 1a98927..15e24c8 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -158,7 +158,10 @@
 // as fast as spin locks (just a few user-level instructions),
 // but on the contention path they sleep in the kernel.
 // A zeroed Mutex is unlocked (no need to initialize each lock).
+// Initialization is helpful for static lock ranking, but not required.
 type mutex struct {
+	// Empty struct if lock ranking is disabled, otherwise includes the lock rank
+	lockRankStruct
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
 	// Used to be a union, but unions break precise GC.
@@ -392,6 +395,12 @@
 	hi uintptr
 }
 
+// heldLockInfo gives info on a held lock and the rank of that lock
+type heldLockInfo struct {
+	lockAddr uintptr
+	rank     lockRank
+}
+
 type g struct {
 	// Stack parameters.
 	// stack describes the actual stack memory: [stack.lo, stack.hi).
@@ -546,6 +555,10 @@
 	dlogPerM
 
 	mOS
+
+	// Up to 10 locks held by this m, maintained by the lock ranking code.
+	locksHeldLen int
+	locksHeld    [10]heldLockInfo
 }
 
 type p struct {
diff --git a/src/runtime/rwmutex.go b/src/runtime/rwmutex.go
index a6da4c9..7713c3f 100644
--- a/src/runtime/rwmutex.go
+++ b/src/runtime/rwmutex.go
@@ -39,7 +39,7 @@
 	if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 {
 		// A writer is pending. Park on the reader queue.
 		systemstack(func() {
-			lock(&rw.rLock)
+			lockWithRank(&rw.rLock, lockRankRwmutexR)
 			if rw.readerPass > 0 {
 				// Writer finished.
 				rw.readerPass -= 1
@@ -67,7 +67,7 @@
 		// A writer is pending.
 		if atomic.Xadd(&rw.readerWait, -1) == 0 {
 			// The last reader unblocks the writer.
-			lock(&rw.rLock)
+			lockWithRank(&rw.rLock, lockRankRwmutexR)
 			w := rw.writer.ptr()
 			if w != nil {
 				notewakeup(&w.park)
@@ -81,12 +81,12 @@
 // lock locks rw for writing.
 func (rw *rwmutex) lock() {
 	// Resolve competition with other writers and stick to our P.
-	lock(&rw.wLock)
+	lockWithRank(&rw.wLock, lockRankRwmutexW)
 	m := getg().m
 	// Announce that there is a pending writer.
 	r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders
 	// Wait for any active readers to complete.
-	lock(&rw.rLock)
+	lockWithRank(&rw.rLock, lockRankRwmutexR)
 	if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 {
 		// Wait for reader to wake us up.
 		systemstack(func() {
@@ -108,7 +108,7 @@
 		throw("unlock of unlocked rwmutex")
 	}
 	// Unblock blocked readers.
-	lock(&rw.rLock)
+	lockWithRank(&rw.rLock, lockRankRwmutexR)
 	for rw.readers.ptr() != nil {
 		reader := rw.readers.ptr()
 		rw.readers = reader.schedlink
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 9bfd4f9..f94c1aa 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -129,7 +129,7 @@
 		s.acquiretime = t0
 	}
 	for {
-		lock(&root.lock)
+		lockWithRank(&root.lock, lockRankRoot)
 		// Add ourselves to nwait to disable "easy case" in semrelease.
 		atomic.Xadd(&root.nwait, 1)
 		// Check cansemacquire to avoid missed wakeup.
@@ -168,7 +168,7 @@
 	}
 
 	// Harder case: search for a waiter and wake it.
-	lock(&root.lock)
+	lockWithRank(&root.lock, lockRankRoot)
 	if atomic.Load(&root.nwait) == 0 {
 		// The count is already consumed by another goroutine,
 		// so no need to wake up another goroutine.
@@ -486,7 +486,7 @@
 // notifyListAdd was called, it returns immediately. Otherwise, it blocks.
 //go:linkname notifyListWait sync.runtime_notifyListWait
 func notifyListWait(l *notifyList, t uint32) {
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 
 	// Return right away if this ticket has already been notified.
 	if less(t, l.notify) {
@@ -528,7 +528,7 @@
 
 	// Pull the list out into a local variable, waiters will be readied
 	// outside the lock.
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 	s := l.head
 	l.head = nil
 	l.tail = nil
@@ -558,7 +558,7 @@
 		return
 	}
 
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 
 	// Re-check under the lock if we need to do anything.
 	t := l.notify
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index e72a75c..b5efac0 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -91,7 +91,7 @@
 
 	// The stack guard is a pointer this many bytes above the
 	// bottom of the stack.
-	_StackGuard = 896*sys.StackGuardMultiplier + _StackSystem
+	_StackGuard = 928*sys.StackGuardMultiplier + _StackSystem
 
 	// After a stack split check the SP is allowed to be this
 	// many bytes below the stack guard. This saves an instruction
@@ -161,9 +161,11 @@
 	}
 	for i := range stackpool {
 		stackpool[i].item.span.init()
+		lockInit(&stackpool[i].item.mu, lockRankStackpool)
 	}
 	for i := range stackLarge.free {
 		stackLarge.free[i].init()
+		lockInit(&stackLarge.lock, lockRankStackLarge)
 	}
 }
 
@@ -182,6 +184,7 @@
 func stackpoolalloc(order uint8) gclinkptr {
 	list := &stackpool[order].item.span
 	s := list.first
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if s == nil {
 		// no free stacks. Allocate another span worth.
 		s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse)
@@ -389,6 +392,8 @@
 		}
 		unlock(&stackLarge.lock)
 
+		lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
+
 		if s == nil {
 			// Allocate a new stack from the heap.
 			s = mheap_.allocManual(npage, &memstats.stacks_inuse)
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 9aa9fac..33062da 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -869,6 +869,7 @@
 
 	tab.mem.drop()
 	*tab = traceStackTable{}
+	lockInit(&((*tab).lock), lockRankTraceStackTab)
 }
 
 type traceFrame struct {
diff --git a/src/sync/runtime.go b/src/sync/runtime.go
index 3ad44e7..de2b0a3 100644
--- a/src/sync/runtime.go
+++ b/src/sync/runtime.go
@@ -28,16 +28,6 @@
 // runtime_Semrelease's caller.
 func runtime_Semrelease(s *uint32, handoff bool, skipframes int)
 
-// Approximation of notifyList in runtime/sema.go. Size and alignment must
-// agree.
-type notifyList struct {
-	wait   uint32
-	notify uint32
-	lock   uintptr
-	head   unsafe.Pointer
-	tail   unsafe.Pointer
-}
-
 // See runtime/sema.go for documentation.
 func runtime_notifyListAdd(l *notifyList) uint32
 
diff --git a/src/sync/runtime2.go b/src/sync/runtime2.go
new file mode 100644
index 0000000..931edad
--- /dev/null
+++ b/src/sync/runtime2.go
@@ -0,0 +1,15 @@
+// +build !goexperiment.staticlockranking
+
+package sync
+
+import "unsafe"
+
+// Approximation of notifyList in runtime/sema.go. Size and alignment must
+// agree.
+type notifyList struct {
+	wait   uint32
+	notify uint32
+	lock   uintptr // key field of the mutex
+	head   unsafe.Pointer
+	tail   unsafe.Pointer
+}
diff --git a/src/sync/runtime2_lockrank.go b/src/sync/runtime2_lockrank.go
new file mode 100644
index 0000000..5a68e90
--- /dev/null
+++ b/src/sync/runtime2_lockrank.go
@@ -0,0 +1,18 @@
+// +build goexperiment.staticlockranking
+
+package sync
+
+import "unsafe"
+
+// Approximation of notifyList in runtime/sema.go. Size and alignment must
+// agree.
+type notifyList struct {
+	wait   uint32
+	notify uint32
+	rank   int     // rank field of the mutex
+	pad    int     // pad field of the mutex
+	lock   uintptr // key field of the mutex
+
+	head unsafe.Pointer
+	tail unsafe.Pointer
+}
diff --git a/test/nosplit.go b/test/nosplit.go
index ad19d8a..a3f2a9f 100644
--- a/test/nosplit.go
+++ b/test/nosplit.go
@@ -312,17 +312,17 @@
 				name := m[1]
 				size, _ := strconv.Atoi(m[2])
 
-				// The limit was originally 128 but is now 768 (896-128).
+				// The limit was originally 128 but is now 800 (928-128).
 				// Instead of rewriting the test cases above, adjust
 				// the first stack frame to use up the extra bytes.
 				if i == 0 {
-					size += (896 - 128) - 128
+					size += (928 - 128) - 128
 					// Noopt builds have a larger stackguard.
 					// See ../src/cmd/dist/buildruntime.go:stackGuardMultiplier
 					// This increase is included in objabi.StackGuard
 					for _, s := range strings.Split(os.Getenv("GO_GCFLAGS"), " ") {
 						if s == "-N" {
-							size += 896
+							size += 928
 						}
 					}
 				}