runtime: fix profiling on windows/ARM

Fix profiling handler to get the correct g for the m being profiled.
Store a pointer to the TLS slot holding g in the thread's m. This
enables the profiling handler to get the current g for the thread,
even if the thread is executing external code or system code.

Updates #26148

Signed-off-by: Jordan Rhee <jordanrh@microsoft.com>

Change-Id: Ie061284c12341c76c7d96cc0c2d5bac969230829
Reviewed-on: https://go-review.googlesource.com/c/153718
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 03dd95b..5870a34 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -872,24 +872,14 @@
 	default:
 		panic("unsupported architecture")
 	case "arm":
-		// TODO(jordanrh1): this is incorrect when Go is executing
-		// on the system or signal stacks because curg returns
-		// the current user g. The true g is stored in thread
-		// local storage, which we cannot access from another CPU.
-		// We cannot pull R10 from the thread context because
-		// it might be executing C code, in which case R10
-		// would not be g.
-		gp = mp.curg
+		tls := &mp.tls[0]
+		gp = **((***g)(unsafe.Pointer(tls)))
 	case "386", "amd64":
 		tls := &mp.tls[0]
 		gp = *((**g)(unsafe.Pointer(tls)))
 	}
 
-	if gp == nil {
-		sigprofNonGoPC(r.ip())
-	} else {
-		sigprof(r.ip(), r.sp(), 0, gp, mp)
-	}
+	sigprof(r.ip(), r.sp(), 0, gp, mp)
 }
 
 func profileloop1(param uintptr) uint32 {
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index 60a85b8..60be74b 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -362,6 +362,9 @@
 	MOVW	R0, g_m(g)
 	BL	runtime·save_g(SB)
 
+	// do per-thread TLS initialization
+	BL	runtime·init_thread_tls(SB)
+
 	// Layout new m scheduler stack on os stack.
 	MOVW	R13, R0
 	MOVW	R0, g_stack+stack_hi(g)
@@ -595,3 +598,95 @@
 	B	runtime·nanotimeQPC(SB)		// tail call
 	RET
 
+// save_g saves the g register (R10) into thread local memory
+// so that we can call externally compiled
+// ARM code that will overwrite those registers.
+// NOTE: runtime.gogo assumes that R1 is preserved by this function.
+//       runtime.mcall assumes this function only clobbers R0 and R11.
+// Returns with g in R0.
+// Save the value in the _TEB->TlsSlots array.
+// Effectively implements TlsSetValue().
+// tls_g stores the TLS slot allocated TlsAlloc().
+TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0
+	MRC	15, 0, R0, C13, C0, 2
+	ADD	$0xe10, R0
+	MOVW 	$runtime·tls_g(SB), R11
+	MOVW	(R11), R11
+	MOVW	g, R11<<2(R0)
+	MOVW	g, R0	// preserve R0 across call to setg<>
+	RET
+
+// load_g loads the g register from thread-local memory,
+// for use after calling externally compiled
+// ARM code that overwrote those registers.
+// Get the value from the _TEB->TlsSlots array.
+// Effectively implements TlsGetValue().
+TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0
+	MRC	15, 0, R0, C13, C0, 2
+	ADD	$0xe10, R0
+	MOVW 	$runtime·tls_g(SB), g
+	MOVW	(g), g
+	MOVW	g<<2(R0), g
+	RET
+
+// This is called from rt0_go, which runs on the system stack
+// using the initial stack allocated by the OS.
+// It calls back into standard C using the BL below.
+// To do that, the stack pointer must be 8-byte-aligned.
+TEXT runtime·_initcgo(SB),NOSPLIT|NOFRAME,$0
+	MOVM.DB.W [R4, R14], (R13)	// push {r4, lr}
+
+	// Ensure stack is 8-byte aligned before calling C code
+	MOVW	R13, R4
+	BIC	$0x7, R13
+
+	// Allocate a TLS slot to hold g across calls to external code
+	MOVW 	$runtime·_TlsAlloc(SB), R0
+	MOVW	(R0), R0
+	BL	(R0)
+
+	// Assert that slot is less than 64 so we can use _TEB->TlsSlots
+	CMP	$64, R0
+	MOVW	$runtime·abort(SB), R1
+	BL.GE	(R1)
+
+	// Save Slot into tls_g
+	MOVW 	$runtime·tls_g(SB), R1
+	MOVW	R0, (R1)
+
+	BL	runtime·init_thread_tls(SB)
+
+	MOVW	R4, R13
+	MOVM.IA.W (R13), [R4, R15]	// pop {r4, pc}
+
+// void init_thread_tls()
+//
+// Does per-thread TLS initialization. Saves a pointer to the TLS slot
+// holding G, in the current m.
+//
+//     g->m->tls[0] = &_TEB->TlsSlots[tls_g]
+//
+// The purpose of this is to enable the profiling handler to get the
+// current g associated with the thread. We cannot use m->curg because curg
+// only holds the current user g. If the thread is executing system code or
+// external code, m->curg will be NULL. The thread's TLS slot always holds
+// the current g, so save a reference to this location so the profiling
+// handler can get the real g from the thread's m.
+//
+// Clobbers R0-R3
+TEXT runtime·init_thread_tls(SB),NOSPLIT|NOFRAME,$0
+	// compute &_TEB->TlsSlots[tls_g]
+	MRC	15, 0, R0, C13, C0, 2
+	ADD	$0xe10, R0
+	MOVW 	$runtime·tls_g(SB), R1
+	MOVW	(R1), R1
+	MOVW	R1<<2, R1
+	ADD	R1, R0
+
+	// save in g->m->tls[0]
+	MOVW	g_m(g), R1
+	MOVW	R0, m_tls(R1)
+	RET
+
+// Holds the TLS Slot, which was allocated by TlsAlloc()
+GLOBL runtime·tls_g+0(SB), NOPTR, $4
diff --git a/src/runtime/tls_arm.s b/src/runtime/tls_arm.s
index e2c945d..400c16a 100644
--- a/src/runtime/tls_arm.s
+++ b/src/runtime/tls_arm.s
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !windows
+
 #include "go_asm.h"
 #include "go_tls.h"
 #include "funcdata.h"
@@ -23,9 +25,6 @@
 #ifdef GOOS_darwin
 #define TLSG_IS_VARIABLE
 #endif
-#ifdef GOOS_windows
-#define TLSG_IS_VARIABLE
-#endif
 
 // save_g saves the g register into pthread-provided
 // thread-local memory, so that we can call externally compiled
@@ -39,17 +38,6 @@
 	MOVW	g, R0 // preserve R0 across call to setg<>
 	RET
 #else
-#ifdef GOOS_windows
-	// Save the value in the _TEB->TlsSlots array.
-	// Effectively implements TlsSetValue().
-	MRC	15, 0, R0, C13, C0, 2
-	ADD	$0xe10, R0
-	MOVW 	$runtime·tls_g(SB), R11
-	MOVW	(R11), R11
-	MOVW	g, R11<<2(R0)
-	MOVW	g, R0	// preserve R0 accross call to setg<>
-	RET
-#else
 	// If the host does not support MRC the linker will replace it with
 	// a call to runtime.read_tls_fallback which jumps to __kuser_get_tls.
 	// The replacement function saves LR in R11 over the call to read_tls_fallback.
@@ -61,7 +49,6 @@
 	MOVW	g, R0 // preserve R0 across call to setg<>
 	RET
 #endif
-#endif
 
 // load_g loads the g register from pthread-provided
 // thread-local memory, for use after calling externally compiled
@@ -71,16 +58,6 @@
 	// nothing to do as nacl/arm does not use TLS at all.
 	RET
 #else
-#ifdef GOOS_windows
-	// Get the value from the _TEB->TlsSlots array.
-	// Effectively implements TlsGetValue().
-	MRC	15, 0, R0, C13, C0, 2
-	ADD	$0xe10, R0
-	MOVW 	$runtime·tls_g(SB), g
-	MOVW	(g), g
-	MOVW	g<<2(R0), g
-	RET
-#else
 	// See save_g
 	MRC	15, 0, R0, C13, C0, 3 // fetch TLS base pointer
 	BIC $3, R0 // Darwin/ARM might return unaligned pointer
@@ -89,7 +66,6 @@
 	MOVW	0(R0), g
 	RET
 #endif
-#endif
 
 // This is called from rt0_go, which runs on the system stack
 // using the initial stack allocated by the OS.
@@ -102,20 +78,6 @@
 // Declare a dummy word ($4, not $0) to make sure the
 // frame is 8 bytes and stays 8-byte-aligned.
 TEXT runtime·_initcgo(SB),NOSPLIT,$4
-#ifdef GOOS_windows
-	MOVW	R13, R4
-	BIC	$0x7, R13
-	MOVW 	$runtime·_TlsAlloc(SB), R0
-	MOVW	(R0), R0
-	BL	(R0)
-	// Assert that slot is less than 64 so we can use _TEB->TlsSlots
-	CMP	$64, R0
-	MOVW	$runtime·abort(SB), R1
-	BL.GE	(R1)
-	MOVW 	$runtime·tls_g(SB), R1
-	MOVW	R0, (R1)
-	MOVW	R4, R13
-#else
 #ifndef GOOS_nacl
 	// if there is an _cgo_init, call it.
 	MOVW	_cgo_init(SB), R4
@@ -131,8 +93,7 @@
 	MOVW	$setg_gcc<>(SB), R1 	// arg 1: setg
 	MOVW	g, R0 			// arg 0: G
 	BL	(R4) // will clobber R0-R3
-#endif // GOOS_nacl
-#endif // GOOS_windows
+#endif
 nocgo:
 	RET