runtime: support preemption on windows/{386,amd64}

This implements preemptM on Windows using SuspendThread and
ResumeThread.
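
In outline, preemptM suspends the target M's thread, injects a call
to asyncPreempt if the thread is stopped at an async-safe point, and
resumes it. Condensed from the os_windows.go hunk below (handle
management, the context buffer setup, and error handling are elided):

	lock(&suspendLock)               // SuspendThread calls must not overlap
	stdcall1(_SuspendThread, thread)
	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(c))) // blocks until suspended
	unlock(&suspendLock)

	gp := gFromTLS(mp)
	if wantAsyncPreempt(gp) && isAsyncSafePoint(gp, c.ip(), c.sp(), c.lr()) {
		// Rewrite SP/IP so it looks like the thread called asyncPreempt.
		stdcall2(_SetThreadContext, thread, uintptr(unsafe.Pointer(c)))
	}

	atomic.Xadd(&mp.preemptGen, 1)   // acknowledge the request
	stdcall1(_ResumeThread, thread)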

Unlike on POSIX platforms, preemptM on Windows happens synchronously.
This means we need to make a few other tweaks to suspendG:

1. We need to CAS the G back to _Grunning before doing the preemptM,
   or there's a good chance we'll just catch the G spinning on its
   status in the runtime, which won't be preemptible.

2. We need to rate-limit preemptM attempts. Otherwise, if the first
   attempt catches the G at a non-preemptible point, the busy loop in
   suspendG may hammer it so hard that it never makes it past that
   non-preemptible point.
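
Condensed, the new retry path in suspendG looks like this (taken from
the preempt.go hunk below, with the surrounding status handling
elided):

	// (1) CAS the G back to _Grunning before sending the preemption...
	casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)

	// (2) ...and rate-limit preemptM so that repeated attempts at a
	// non-preemptible point can't live-lock the target.
	if preemptMSupported && debug.asyncpreemptoff == 0 && needAsync {
		now := nanotime()
		if now >= nextPreemptM {
			nextPreemptM = now + yieldDelay/2
			preemptM(asyncM)
		}
	}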

Updates #10958, #24543.

Change-Id: Ie53b098811096f7e45d864afd292dc9e999ce226
Reviewed-on: https://go-review.googlesource.com/c/go/+/204340
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 00f4c6e..cf5837c 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -6,6 +6,7 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -32,6 +33,7 @@
 //go:cgo_import_dynamic runtime._GetSystemDirectoryA GetSystemDirectoryA%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetThreadContext SetThreadContext%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._PostQueuedCompletionStatus PostQueuedCompletionStatus%4 "kernel32.dll"
@@ -79,6 +81,7 @@
 	_GetSystemInfo,
 	_GetSystemTimeAsFileTime,
 	_GetThreadContext,
+	_SetThreadContext,
 	_LoadLibraryW,
 	_LoadLibraryA,
 	_PostQueuedCompletionStatus,
@@ -539,6 +542,11 @@
 
 //go:nosplit
 func exit(code int32) {
+	// Disallow thread suspension for preemption. Otherwise,
+	// ExitProcess and SuspendThread can race: SuspendThread
+	// queues a suspension request for this thread, ExitProcess
+	// kills the suspending thread, and then this thread suspends.
+	lock(&suspendLock)
 	atomic.Store(&exiting, 1)
 	stdcall1(_ExitProcess, uintptr(code))
 }
@@ -1003,21 +1011,24 @@
 	r.contextflags = _CONTEXT_CONTROL
 	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(r)))
 
-	var gp *g
-	switch GOARCH {
-	default:
-		panic("unsupported architecture")
-	case "arm":
-		tls := &mp.tls[0]
-		gp = **((***g)(unsafe.Pointer(tls)))
-	case "386", "amd64":
-		tls := &mp.tls[0]
-		gp = *((**g)(unsafe.Pointer(tls)))
-	}
+	gp := gFromTLS(mp)
 
 	sigprof(r.ip(), r.sp(), r.lr(), gp, mp)
 }
 
+func gFromTLS(mp *m) *g {
+	switch GOARCH {
+	case "arm":
+		tls := &mp.tls[0]
+		return **((***g)(unsafe.Pointer(tls)))
+	case "386", "amd64":
+		tls := &mp.tls[0]
+		return *((**g)(unsafe.Pointer(tls)))
+	}
+	throw("unsupported architecture")
+	return nil
+}
+
 func profileloop1(param uintptr) uint32 {
 	stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
 
@@ -1081,10 +1092,95 @@
 	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
 }
 
-const preemptMSupported = false
+const preemptMSupported = GOARCH != "arm"
+
+// suspendLock protects simultaneous SuspendThread operations from
+// suspending each other.
+var suspendLock mutex
 
 func preemptM(mp *m) {
-	// Not currently supported.
-	//
-	// TODO: Use SuspendThread/GetThreadContext/ResumeThread
+	if GOARCH == "arm" {
+		// TODO: Implement call injection
+		return
+	}
+
+	if mp == getg().m {
+		throw("self-preempt")
+	}
+
+	// Acquire our own handle to mp's thread.
+	lock(&mp.threadLock)
+	if mp.thread == 0 {
+		// The M hasn't been minit'd yet (or was just unminit'd).
+		unlock(&mp.threadLock)
+		atomic.Xadd(&mp.preemptGen, 1)
+		return
+	}
+	var thread uintptr
+	stdcall7(_DuplicateHandle, currentProcess, mp.thread, currentProcess, uintptr(unsafe.Pointer(&thread)), 0, 0, _DUPLICATE_SAME_ACCESS)
+	unlock(&mp.threadLock)
+
+	// Prepare thread context buffer.
+	var c *context
+	cbuf := make([]byte, unsafe.Sizeof(*c)+15)
+	// Align Context to 16 bytes.
+	c = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&cbuf[15]))) &^ 15))
+	c.contextflags = _CONTEXT_CONTROL
+
+	// Serialize thread suspension. SuspendThread is asynchronous,
+	// so it's otherwise possible for two threads to suspend each
+	// other and deadlock. We must hold this lock until after
+	// GetThreadContext, since that blocks until the thread is
+	// actually suspended.
+	lock(&suspendLock)
+
+	// Suspend the thread.
+	if int32(stdcall1(_SuspendThread, thread)) == -1 {
+		unlock(&suspendLock)
+		stdcall1(_CloseHandle, thread)
+		// The thread no longer exists. This shouldn't be
+		// possible, but just acknowledge the request.
+		atomic.Xadd(&mp.preemptGen, 1)
+		return
+	}
+
+	// We have to be very careful between this point and once
+	// we've shown mp is at an async safe-point. This is like a
+	// signal handler in the sense that mp could have been doing
+	// anything when we stopped it, including holding arbitrary
+	// locks.
+
+	// We have to get the thread context before inspecting the M
+	// because SuspendThread only requests a suspend.
+	// GetThreadContext actually blocks until it's suspended.
+	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(c)))
+
+	unlock(&suspendLock)
+
+	// Does it want a preemption and is it safe to preempt?
+	gp := gFromTLS(mp)
+	if wantAsyncPreempt(gp) && isAsyncSafePoint(gp, c.ip(), c.sp(), c.lr()) {
+		// Inject call to asyncPreempt
+		targetPC := funcPC(asyncPreempt)
+		switch GOARCH {
+		default:
+			throw("unsupported architecture")
+		case "386", "amd64":
+			// Make it look like the thread called targetPC.
+			pc := c.ip()
+			sp := c.sp()
+			sp -= sys.PtrSize
+			*(*uintptr)(unsafe.Pointer(sp)) = pc
+			c.set_sp(sp)
+			c.set_ip(targetPC)
+		}
+
+		stdcall2(_SetThreadContext, thread, uintptr(unsafe.Pointer(c)))
+	}
+
+	// Acknowledge the preemption.
+	atomic.Xadd(&mp.preemptGen, 1)
+
+	stdcall1(_ResumeThread, thread)
+	stdcall1(_CloseHandle, thread)
 }
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
index f154614..60e1bce 100644
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -118,6 +118,7 @@
 	stopped := false
 	var asyncM *m
 	var asyncGen uint32
+	var nextPreemptM int64
 	for i := 0; ; i++ {
 		switch s := readgstatus(gp); s {
 		default:
@@ -205,14 +206,32 @@
 			gp.preempt = true
 			gp.stackguard0 = stackPreempt
 
-			// Send asynchronous preemption.
-			asyncM = gp.m
-			asyncGen = atomic.Load(&asyncM.preemptGen)
-			if preemptMSupported && debug.asyncpreemptoff == 0 {
-				preemptM(asyncM)
-			}
+			// Prepare for asynchronous preemption.
+			asyncM2 := gp.m
+			asyncGen2 := atomic.Load(&asyncM2.preemptGen)
+			needAsync := asyncM != asyncM2 || asyncGen != asyncGen2
+			asyncM = asyncM2
+			asyncGen = asyncGen2
 
 			casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+
+			// Send asynchronous preemption. We do this
+			// after CASing the G back to _Grunning
+			// because preemptM may be synchronous and we
+			// don't want to catch the G just spinning on
+			// its status.
+			if preemptMSupported && debug.asyncpreemptoff == 0 && needAsync {
+				// Rate limit preemptM calls. This is
+				// particularly important on Windows
+				// where preemptM is actually
+				// synchronous and the spin loop here
+				// can lead to live-lock.
+				now := nanotime()
+				if now >= nextPreemptM {
+					nextPreemptM = now + yieldDelay/2
+					preemptM(asyncM)
+				}
+			}
 		}
 
 		// TODO: Don't busy wait. This loop should really only