runtime: remove write barriers from newstack, gogo

Currently, newstack and gogo have write barriers for maintaining the
context register saved in g.sched.ctxt. This is troublesome, because
newstack can be called from go:nowritebarrierrec places that can't
allow write barriers. It happens to be benign because g.sched.ctxt
will always be nil on entry to newstack *and* it so happens the
incoming ctxt will also always be nil in these contexts (I
think/hope), but this is playing with fire. It's also desirable to
mark newstack go:nowritebarrierrec to prevent any other, non-benign
write barriers from creeping in, but we can't do that right now
because of this one write barrier.

Fix all of this by observing that g.sched.ctxt is really just a saved
live pointer register. Hence, we can shade it when we scan g's stack
and otherwise move it back and forth between the actual context
register and g.sched.ctxt without write barriers. This means we can
save it in morestack along with all of the other g.sched, eliminate
the save from newstack along with its troublesome write barrier, and
eliminate the shenanigans in gogo to invoke the write barrier when
restoring it.

Once we've done all of this, we can mark newstack
go:nowritebarrierrec.

Fixes #22385.
For #22460.

Change-Id: I43c24958e3f6785b53c1350e1e83c2844e0d1522
Reviewed-on: https://go-review.googlesource.com/72553
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index ef82756..15d9ce9 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -362,18 +362,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $8-4
 	MOVL	buf+0(FP), BX		// gobuf
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVL	gobuf_ctxt(BX), DX
-	TESTL	DX, DX
-	JZ	nilctxt
-	LEAL	gobuf_ctxt(BX), AX
-	MOVL	AX, 0(SP)
-	MOVL	$0, 4(SP)
-	CALL	runtime·writebarrierptr_prewrite(SB)
-	MOVL	buf+0(FP), BX
-
-nilctxt:
 	MOVL	gobuf_g(BX), DX
 	MOVL	0(DX), CX		// make sure g != nil
 	get_tls(CX)
@@ -536,7 +524,7 @@
 	MOVL	SI, (g_sched+gobuf_g)(SI)
 	LEAL	4(SP), AX	// f's SP
 	MOVL	AX, (g_sched+gobuf_sp)(SI)
-	// newstack will fill gobuf.ctxt.
+	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
 
 	// Call newstack on m->g0's stack.
 	MOVL	m_g0(BX), BP
@@ -544,10 +532,8 @@
 	MOVL	(g_sched+gobuf_sp)(BP), AX
 	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
 	MOVL	AX, SP
-	PUSHL	DX	// ctxt argument
 	CALL	runtime·newstack(SB)
 	MOVL	$0, 0x1003	// crash if newstack returns
-	POPL	DX	// keep balance check happy
 	RET
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 7c5e8e9..2ac879c 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -304,18 +304,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $16-8
 	MOVQ	buf+0(FP), BX		// gobuf
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVQ	gobuf_ctxt(BX), AX
-	TESTQ	AX, AX
-	JZ	nilctxt
-	LEAQ	gobuf_ctxt(BX), AX
-	MOVQ	AX, 0(SP)
-	MOVQ	$0, 8(SP)
-	CALL	runtime·writebarrierptr_prewrite(SB)
-	MOVQ	buf+0(FP), BX
-
-nilctxt:
 	MOVQ	gobuf_g(BX), DX
 	MOVQ	0(DX), CX		// make sure g != nil
 	get_tls(CX)
@@ -482,16 +470,14 @@
 	LEAQ	8(SP), AX // f's SP
 	MOVQ	AX, (g_sched+gobuf_sp)(SI)
 	MOVQ	BP, (g_sched+gobuf_bp)(SI)
-	// newstack will fill gobuf.ctxt.
+	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
 
 	// Call newstack on m->g0's stack.
 	MOVQ	m_g0(BX), BX
 	MOVQ	BX, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(BX), SP
-	PUSHQ	DX	// ctxt argument
 	CALL	runtime·newstack(SB)
 	MOVQ	$0, 0x1003	// crash if newstack returns
-	POPQ	DX	// keep balance check happy
 	RET
 
 // morestack but not preserving ctxt.
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index c80a563..b7fcf23 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -198,18 +198,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $8-4
 	MOVL	buf+0(FP), BX		// gobuf
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVL	gobuf_ctxt(BX), DX
-	TESTL	DX, DX
-	JZ	nilctxt
-	LEAL	gobuf_ctxt(BX), AX
-	MOVL	AX, 0(SP)
-	MOVL	$0, 4(SP)
-	CALL	runtime·writebarrierptr_prewrite(SB)
-	MOVL	buf+0(FP), BX
-
-nilctxt:
 	MOVL	gobuf_g(BX), DX
 	MOVL	0(DX), CX		// make sure g != nil
 	get_tls(CX)
@@ -368,16 +356,14 @@
 	MOVL	SI, (g_sched+gobuf_g)(SI)
 	LEAL	8(SP), AX // f's SP
 	MOVL	AX, (g_sched+gobuf_sp)(SI)
-	// newstack will fill gobuf.ctxt.
+	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
 
 	// Call newstack on m->g0's stack.
 	MOVL	m_g0(BX), BX
 	MOVL	BX, g(CX)
 	MOVL	(g_sched+gobuf_sp)(BX), SP
-	PUSHQ	DX	// ctxt argument
 	CALL	runtime·newstack(SB)
 	MOVL	$0, 0x1003	// crash if newstack returns
-	POPQ	DX	// keep balance check happy
 	RET
 
 // morestack trampolines
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 47fa565..caa96cc 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -227,19 +227,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB),NOSPLIT,$8-4
 	MOVW	buf+0(FP), R1
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVW	gobuf_ctxt(R1), R0
-	CMP	$0, R0
-	B.EQ	nilctxt
-	MOVW	$gobuf_ctxt(R1), R0
-	MOVW	R0, 4(R13)
-	MOVW	$0, R0
-	MOVW	R0, 8(R13)
-	BL	runtime·writebarrierptr_prewrite(SB)
-	MOVW	buf+0(FP), R1
-
-nilctxt:
 	MOVW	gobuf_g(R1), R0
 	BL	setg<>(SB)
 
@@ -412,7 +399,7 @@
 	MOVW	R13, (g_sched+gobuf_sp)(g)
 	MOVW	LR, (g_sched+gobuf_pc)(g)
 	MOVW	R3, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVW	R7, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
@@ -426,8 +413,7 @@
 	BL	setg<>(SB)
 	MOVW	(g_sched+gobuf_sp)(g), R13
 	MOVW	$0, R0
-	MOVW.W	R0, -8(R13)	// create a call frame on g0
-	MOVW	R7, 4(R13)	// ctxt argument
+	MOVW.W  R0, -4(R13)	// create a call frame on g0 (saved LR)
 	BL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index e4b2c37..b2aff1a 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -122,18 +122,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $24-8
 	MOVD	buf+0(FP), R5
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVD	gobuf_ctxt(R5), R0
-	CMP	$0, R0
-	BEQ	nilctxt
-	MOVD	$gobuf_ctxt(R5), R0
-	MOVD	R0, 8(RSP)
-	MOVD	ZR, 16(RSP)
-	BL	runtime·writebarrierptr_prewrite(SB)
-	MOVD	buf+0(FP), R5
-
-nilctxt:
 	MOVD	gobuf_g(R5), g
 	BL	runtime·save_g(SB)
 
@@ -289,7 +277,7 @@
 	MOVD	R0, (g_sched+gobuf_sp)(g)
 	MOVD	LR, (g_sched+gobuf_pc)(g)
 	MOVD	R3, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVD	R26, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's callers.
@@ -303,8 +291,7 @@
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R0
 	MOVD	R0, RSP
-	MOVD.W	$0, -16(RSP)	// create a call frame on g0
-	MOVD	R26, 8(RSP)	// ctxt argument
+	MOVD.W	$0, -16(RSP)	// create a call frame on g0 (saved LR; keep 16-aligned)
 	BL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index 4902d04..3510853 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -108,17 +108,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $16-8
 	MOVV	buf+0(FP), R3
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVV	gobuf_ctxt(R3), R1
-	BEQ	R1, nilctxt
-	MOVV	$gobuf_ctxt(R3), R1
-	MOVV	R1, 8(R29)
-	MOVV	R0, 16(R29)
-	JAL	runtime·writebarrierptr_prewrite(SB)
-	MOVV	buf+0(FP), R3
-
-nilctxt:
 	MOVV	gobuf_g(R3), g	// make sure g is not nil
 	JAL	runtime·save_g(SB)
 
@@ -260,7 +249,7 @@
 	MOVV	R29, (g_sched+gobuf_sp)(g)
 	MOVV	R31, (g_sched+gobuf_pc)(g)
 	MOVV	R3, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVV	REGCTXT, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
@@ -273,9 +262,8 @@
 	JAL	runtime·save_g(SB)
 	MOVV	(g_sched+gobuf_sp)(g), R29
 	// Create a stack frame on g0 to call newstack.
-	MOVV	R0, -16(R29)	// Zero saved LR in frame
-	ADDV	$-16, R29
-	MOVV	REGCTXT, 8(R29)	// ctxt argument
+	MOVV	R0, -8(R29)	// Zero saved LR in frame
+	ADDV	$-8, R29
 	JAL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 82e01b0..334f259 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -109,17 +109,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB),NOSPLIT,$8-4
 	MOVW	buf+0(FP), R3
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVW	gobuf_ctxt(R3), R1
-	BEQ	R1, nilctxt
-	MOVW	$gobuf_ctxt(R3), R1
-	MOVW	R1, 4(R29)
-	MOVW	R0, 8(R29)
-	JAL	runtime·writebarrierptr_prewrite(SB)
-	MOVW	buf+0(FP), R3
-
-nilctxt:
 	MOVW	gobuf_g(R3), g	// make sure g is not nil
 	JAL	runtime·save_g(SB)
 
@@ -261,7 +250,7 @@
 	MOVW	R29, (g_sched+gobuf_sp)(g)
 	MOVW	R31, (g_sched+gobuf_pc)(g)
 	MOVW	R3, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVW	REGCTXT, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
@@ -274,9 +263,8 @@
 	JAL	runtime·save_g(SB)
 	MOVW	(g_sched+gobuf_sp)(g), R29
 	// Create a stack frame on g0 to call newstack.
-	MOVW	R0, -8(R29)	// Zero saved LR in frame
-	ADDU	$-8, R29
-	MOVW	REGCTXT, 4(R29)	// ctxt argument
+	MOVW	R0, -4(R29)	// Zero saved LR in frame
+	ADDU	$-4, R29
 	JAL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 40ad101..2f2a4a7 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -133,18 +133,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $16-8
 	MOVD	buf+0(FP), R5
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVD	gobuf_ctxt(R5), R3
-	CMP	R0, R3
-	BEQ	nilctxt
-	MOVD	$gobuf_ctxt(R5), R3
-	MOVD	R3, FIXED_FRAME+0(R1)
-	MOVD	R0, FIXED_FRAME+8(R1)
-	BL	runtime·writebarrierptr_prewrite(SB)
-	MOVD	buf+0(FP), R5
-
-nilctxt:
 	MOVD	gobuf_g(R5), g	// make sure g is not nil
 	BL	runtime·save_g(SB)
 
@@ -317,7 +305,7 @@
 	MOVD	LR, R8
 	MOVD	R8, (g_sched+gobuf_pc)(g)
 	MOVD	R5, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVD	R11, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
@@ -329,8 +317,7 @@
 	MOVD	m_g0(R7), g
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R1
-	MOVDU   R0, -(FIXED_FRAME+8)(R1)	// create a call frame on g0
-	MOVD	R11, FIXED_FRAME+0(R1)	// ctxt argument
+	MOVDU   R0, -(FIXED_FRAME+0)(R1)	// create a call frame on g0
 	BL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 241be17..524b866 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -116,17 +116,6 @@
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $16-8
 	MOVD	buf+0(FP), R5
-
-	// If ctxt is not nil, invoke deletion barrier before overwriting.
-	MOVD	gobuf_ctxt(R5), R1
-	CMPBEQ	R1, $0, nilctxt
-	MOVD	$gobuf_ctxt(R5), R1
-	MOVD	R1, 8(R15)
-	MOVD	R0, 16(R15)
-	BL	runtime·writebarrierptr_prewrite(SB)
-	MOVD	buf+0(FP), R5
-
-nilctxt:
 	MOVD	gobuf_g(R5), g	// make sure g is not nil
 	BL	runtime·save_g(SB)
 
@@ -272,7 +261,7 @@
 	MOVD	LR, R8
 	MOVD	R8, (g_sched+gobuf_pc)(g)
 	MOVD	R5, (g_sched+gobuf_lr)(g)
-	// newstack will fill gobuf.ctxt.
+	MOVD	R12, (g_sched+gobuf_ctxt)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
@@ -285,9 +274,8 @@
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R15
 	// Create a stack frame on g0 to call newstack.
-	MOVD	$0, -16(R15)	// Zero saved LR in frame
-	SUB	$16, R15
-	MOVD	R12, 8(R15)	// ctxt argument
+	MOVD	$0, -8(R15)	// Zero saved LR in frame
+	SUB	$8, R15
 	BL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index ed256ef..ce697e5 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -770,6 +770,13 @@
 		shrinkstack(gp)
 	}
 
+	// Scan the saved context register. This is effectively a live
+	// register that gets moved back and forth between the
+	// register and sched.ctxt without a write barrier.
+	if gp.sched.ctxt != nil {
+		scanblock(uintptr(unsafe.Pointer(&gp.sched.ctxt)), sys.PtrSize, &oneptrmask[0], gcw)
+	}
+
 	// Scan the stack.
 	var cache pcvalueCache
 	scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index ca79616..a79faba 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -254,17 +254,19 @@
 	// The offsets of sp, pc, and g are known to (hard-coded in) libmach.
 	//
 	// ctxt is unusual with respect to GC: it may be a
-	// heap-allocated funcval so write require a write barrier,
-	// but gobuf needs to be cleared from assembly. We take
-	// advantage of the fact that the only path that uses a
-	// non-nil ctxt is morestack. As a result, gogo is the only
-	// place where it may not already be nil, so gogo uses an
-	// explicit write barrier. Everywhere else that resets the
-	// gobuf asserts that ctxt is already nil.
+	// heap-allocated funcval, so GC needs to track it, but it
+	// needs to be set and cleared from assembly, where it's
+	// difficult to have write barriers. However, ctxt is really a
+	// saved, live register, and we only ever exchange it between
+	// the real register and the gobuf. Hence, we treat it as a
+	// root during stack scanning, which means assembly that saves
+	// and restores it doesn't need write barriers. It's still
+	// typed as a pointer so that any other writes from Go get
+	// write barriers.
 	sp   uintptr
 	pc   uintptr
 	g    guintptr
-	ctxt unsafe.Pointer // this has to be a pointer so that gc scans it
+	ctxt unsafe.Pointer
 	ret  sys.Uintreg
 	lr   uintptr
 	bp   uintptr // for GOEXPERIMENT=framepointer
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 4e60e80..89458b7 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -913,9 +913,12 @@
 // g->atomicstatus will be Grunning or Gscanrunning upon entry.
 // If the GC is trying to stop this g then it will set preemptscan to true.
 //
-// ctxt is the value of the context register on morestack. newstack
-// will write it to g.sched.ctxt.
-func newstack(ctxt unsafe.Pointer) {
+// This must be nowritebarrierrec because it can be called as part of
+// stack growth from other nowritebarrierrec functions, but the
+// compiler doesn't check this.
+//
+//go:nowritebarrierrec
+func newstack() {
 	thisg := getg()
 	// TODO: double check all gp. shouldn't be getg().
 	if thisg.m.morebuf.g.ptr().stackguard0 == stackFork {
@@ -929,9 +932,6 @@
 	}
 
 	gp := thisg.m.curg
-	// Write ctxt to gp.sched. We do this here instead of in
-	// morestack so it has the necessary write barrier.
-	gp.sched.ctxt = ctxt
 
 	if thisg.m.curg.throwsplit {
 		// Update syscallsp, syscallpc in case traceback uses them.