reflect: add write barriers

Use typedmemmove and typedslicecopy, and adjust reflect.call,
to execute the necessary write barriers.
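
Concretely, a plain memmove hides pointer writes from the garbage
collector. A barrier-aware copy has to walk the type's pointer bitmap
and route each pointer-sized word through writebarrierptr. A rough
sketch (typedCopySketch is an illustrative name, not code in this CL;
it mirrors the typedmemmovepartial loop added to mgc0.go below, minus
the unaligned head and tail handling):

	func typedCopySketch(typ *_type, dst, src unsafe.Pointer) {
		if !needwb() || typ.kind&kindNoPointers != 0 {
			// No barrier needed: a plain copy is safe.
			memmove(dst, src, typ.size)
			return
		}
		mask := loadPtrMask(typ)
		for i := uintptr(0); i < typ.size/ptrSize; i++ {
			bits := mask[i/2] >> ((i & 1) << 2)
			if (bits>>2)&_BitsMask == _BitsPointer {
				// Pointer word: copy through the write barrier.
				writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
			} else {
				// Scalar word: copy directly.
				*(*uintptr)(dst) = *(*uintptr)(src)
			}
			dst = add(dst, ptrSize)
			src = add(src, ptrSize)
		}
	}

Passing the frame type to reflect.call serves the same purpose for
call frames: it lets the assembly stubs invoke runtime·callwritebarrier
after copying results back into the caller's frame.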

Found with GODEBUG=wbshadow=2 mode.
Eventually that mode will run automatically, but right now
it still detects other missing write barriers.

Change-Id: Iec5b5b0c1be5589295e28e5228e37f1a92e07742
Reviewed-on: https://go-review.googlesource.com/2312
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/reflect/value.go b/src/reflect/value.go
index c34679d..652dee2 100644
--- a/src/reflect/value.go
+++ b/src/reflect/value.go
@@ -107,7 +107,7 @@
 			// TODO: pass safe boolean from valueInterface so
 			// we don't need to copy if safe==true?
 			c := unsafe_New(t)
-			memmove(c, ptr, t.size)
+			typedmemmove(t, c, ptr)
 			ptr = c
 		}
 		e.word = ptr
@@ -412,7 +412,7 @@
 		addr := unsafe.Pointer(uintptr(args) + off)
 		v = v.assignTo("reflect.Value.Call", targ, addr)
 		if v.flag&flagIndir != 0 {
-			memmove(addr, v.ptr, n)
+			typedmemmove(targ, addr, v.ptr)
 		} else {
 			*(*unsafe.Pointer)(addr) = v.ptr
 		}
@@ -420,7 +420,7 @@
 	}
 
 	// Call.
-	call(fn, args, uint32(frametype.size), uint32(retOffset))
+	call(frametype, fn, args, uint32(frametype.size), uint32(retOffset))
 
 	// For testing; see TestCallMethodJump.
 	if callGC {
@@ -473,7 +473,7 @@
 			// and we cannot let f keep a reference to the stack frame
 			// after this function returns, not even a read-only reference.
 			v.ptr = unsafe_New(typ)
-			memmove(v.ptr, addr, typ.size)
+			typedmemmove(typ, v.ptr, addr)
 			v.flag |= flagIndir
 		} else {
 			v.ptr = *(*unsafe.Pointer)(addr)
@@ -509,7 +509,7 @@
 			off += -off & uintptr(typ.align-1)
 			addr := unsafe.Pointer(uintptr(ptr) + off)
 			if v.flag&flagIndir != 0 {
-				memmove(addr, v.ptr, typ.size)
+				typedmemmove(typ, addr, v.ptr)
 			} else {
 				*(*unsafe.Pointer)(addr) = v.ptr
 			}
@@ -603,10 +603,10 @@
 
 	// Copy in receiver and rest of args.
 	storeRcvr(rcvr, args)
-	memmove(unsafe.Pointer(uintptr(args)+ptrSize), frame, argSize-ptrSize)
+	typedmemmovepartial(frametype, unsafe.Pointer(uintptr(args)+ptrSize), frame, ptrSize, argSize-ptrSize)
 
 	// Call.
-	call(fn, args, uint32(frametype.size), uint32(retOffset))
+	call(frametype, fn, args, uint32(frametype.size), uint32(retOffset))
 
 	// Copy return values. On amd64p32, the beginning of return values
 	// is 64-bit aligned, so the caller's frame layout (which doesn't have
@@ -617,8 +617,11 @@
 	if runtime.GOARCH == "amd64p32" {
 		callerRetOffset = align(argSize-ptrSize, 8)
 	}
-	memmove(unsafe.Pointer(uintptr(frame)+callerRetOffset),
-		unsafe.Pointer(uintptr(args)+retOffset), frametype.size-retOffset)
+	typedmemmovepartial(frametype,
+		unsafe.Pointer(uintptr(frame)+callerRetOffset),
+		unsafe.Pointer(uintptr(args)+retOffset),
+		retOffset,
+		frametype.size-retOffset)
 }
 
 // funcName returns the name of f, for use in error messages.
@@ -1017,7 +1020,7 @@
 		// Copy result so future changes to the map
 		// won't change the underlying value.
 		c := unsafe_New(typ)
-		memmove(c, e, typ.size)
+		typedmemmove(typ, c, e)
 		return Value{typ, c, fl | flagIndir}
 	} else {
 		return Value{typ, *(*unsafe.Pointer)(e), fl}
@@ -1055,7 +1058,7 @@
 			// Copy result so future changes to the map
 			// won't change the underlying value.
 			c := unsafe_New(keyType)
-			memmove(c, key, keyType.size)
+			typedmemmove(keyType, c, key)
 			a[i] = Value{keyType, c, fl | flagIndir}
 		} else {
 			a[i] = Value{keyType, *(*unsafe.Pointer)(key), fl}
@@ -1301,7 +1304,7 @@
 	}
 	x = x.assignTo("reflect.Set", v.typ, target)
 	if x.flag&flagIndir != 0 {
-		memmove(v.ptr, x.ptr, v.typ.size)
+		typedmemmove(v.typ, v.ptr, x.ptr)
 	} else {
 		*(*unsafe.Pointer)(v.ptr) = x.ptr
 	}
@@ -1815,27 +1818,23 @@
 	se := src.typ.Elem()
 	typesMustMatch("reflect.Copy", de, se)
 
-	n := dst.Len()
-	if sn := src.Len(); n > sn {
-		n = sn
+	var ds, ss sliceHeader
+	if dk == Array {
+		ds.Data = dst.ptr
+		ds.Len = dst.Len()
+		ds.Cap = ds.Len
+	} else {
+		ds = *(*sliceHeader)(dst.ptr)
+	}
+	if sk == Array {
+		ss.Data = src.ptr
+		ss.Len = src.Len()
+		ss.Cap = ss.Len
+	} else {
+		ss = *(*sliceHeader)(src.ptr)
 	}
 
-	// Copy via memmove.
-	var da, sa unsafe.Pointer
-	if dk == Array {
-		da = dst.ptr
-	} else {
-		da = (*sliceHeader)(dst.ptr).Data
-	}
-	if src.flag&flagIndir == 0 {
-		sa = unsafe.Pointer(&src.ptr)
-	} else if sk == Array {
-		sa = src.ptr
-	} else {
-		sa = (*sliceHeader)(src.ptr).Data
-	}
-	memmove(da, sa, uintptr(n)*de.Size())
-	return n
+	return typedslicecopy(de.common(), ds, ss)
 }
 
 // A runtimeSelect is a single case passed to rselect.
@@ -2376,7 +2375,7 @@
 	if f&flagAddr != 0 {
 		// indirect, mutable word - make a copy
 		c := unsafe_New(t)
-		memmove(c, ptr, t.size)
+		typedmemmove(t, c, ptr)
 		ptr = c
 		f &^= flagAddr
 	}
@@ -2425,12 +2424,29 @@
 func mapiterkey(it unsafe.Pointer) (key unsafe.Pointer)
 func mapiternext(it unsafe.Pointer)
 func maplen(m unsafe.Pointer) int
-func call(fn, arg unsafe.Pointer, n uint32, retoffset uint32)
+
+// call calls fn with a copy of the n argument bytes pointed at by arg.
+// After fn returns, call copies n-retoffset result bytes
+// back into arg+retoffset before returning. If copying result bytes back,
+// the caller must pass the argument frame type as argtype, so that
+// call can execute appropriate write barriers during the copy.
+func call(argtype *rtype, fn, arg unsafe.Pointer, n uint32, retoffset uint32)
 
 func ifaceE2I(t *rtype, src interface{}, dst unsafe.Pointer)
 
+// typedmemmove copies a value of type t to dst from src.
 //go:noescape
-func memmove(adst, asrc unsafe.Pointer, n uintptr)
+func typedmemmove(t *rtype, dst, src unsafe.Pointer)
+
+// typedmemmovepartial is like typedmemmove but assumes that
+// dst and src point off bytes into the value and only copies size bytes.
+//go:noescape
+func typedmemmovepartial(t *rtype, dst, src unsafe.Pointer, off, size uintptr)
+
+// typedslicecopy copies a slice of elemType values from src to dst,
+// returning the number of elements copied.
+//go:noescape
+func typedslicecopy(elemType *rtype, dst, src sliceHeader) int
 
 // Dummy annotation marking that the value x escapes,
 // for use in cases where the reflect code is so clever that
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 14e4360..4de7c43 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -329,7 +329,7 @@
 	JMP runtime·morestack(SB)
 
 // reflectcall: call a function with the given argument list
-// func call(f *FuncVal, arg *byte, argsize, retoffset uint32).
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
 // we don't have variable-sized frames, so we use a small number
 // of constant-sized-frame functions to encode a few bits of size in the pc.
 // Caution: ugly multiline assembly macros in your future!
@@ -344,8 +344,8 @@
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	JMP	·reflectcall(SB)
 
-TEXT ·reflectcall(SB), NOSPLIT, $0-16
-	MOVL	argsize+8(FP), CX
+TEXT ·reflectcall(SB), NOSPLIT, $0-20
+	MOVL	argsize+12(FP), CX
 	DISPATCH(runtime·call16, 16)
 	DISPATCH(runtime·call32, 32)
 	DISPATCH(runtime·call64, 64)
@@ -377,27 +377,37 @@
 	JMP	AX
 
 #define CALLFN(NAME,MAXSIZE)			\
-TEXT NAME(SB), WRAPPER, $MAXSIZE-16;		\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVL	argptr+4(FP), SI;		\
-	MOVL	argsize+8(FP), CX;		\
+	MOVL	argptr+8(FP), SI;		\
+	MOVL	argsize+12(FP), CX;		\
 	MOVL	SP, DI;				\
 	REP;MOVSB;				\
 	/* call function */			\
-	MOVL	f+0(FP), DX;			\
+	MOVL	f+4(FP), DX;			\
 	MOVL	(DX), AX; 			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	CALL	AX;				\
 	/* copy return values back */		\
-	MOVL	argptr+4(FP), DI;		\
-	MOVL	argsize+8(FP), CX;		\
-	MOVL	retoffset+12(FP), BX;		\
+	MOVL	argptr+8(FP), DI;		\
+	MOVL	argsize+12(FP), CX;		\
+	MOVL	retoffset+16(FP), BX;		\
 	MOVL	SP, SI;				\
 	ADDL	BX, DI;				\
 	ADDL	BX, SI;				\
 	SUBL	BX, CX;				\
 	REP;MOVSB;				\
+	/* execute write barrier updates */	\
+	MOVL	argtype+0(FP), DX;		\
+	MOVL	argptr+8(FP), DI;		\
+	MOVL	argsize+12(FP), CX;		\
+	MOVL	retoffset+16(FP), BX;		\
+	MOVL	DX, 0(SP);			\
+	MOVL	DI, 4(SP);			\
+	MOVL	CX, 8(SP);			\
+	MOVL	BX, 12(SP);			\
+	CALL	runtime·callwritebarrier(SB);	\
 	RET
 
 CALLFN(·call16, 16)
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 5a94e11..3e8ccca 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -318,7 +318,7 @@
 	JMP	runtime·morestack(SB)
 
 // reflectcall: call a function with the given argument list
-// func call(f *FuncVal, arg *byte, argsize, retoffset uint32).
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
 // we don't have variable-sized frames, so we use a small number
 // of constant-sized-frame functions to encode a few bits of size in the pc.
 // Caution: ugly multiline assembly macros in your future!
@@ -333,9 +333,10 @@
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	JMP	·reflectcall(SB)
 
-TEXT ·reflectcall(SB), NOSPLIT, $0-24
-	MOVLQZX argsize+16(FP), CX
-	DISPATCH(runtime·call16, 16)
+TEXT ·reflectcall(SB), NOSPLIT, $0-32
+	MOVLQZX argsize+24(FP), CX
+	// NOTE(rsc): No call16, because CALLFN needs four words
+	// of argument space to invoke callwritebarrier.
 	DISPATCH(runtime·call32, 32)
 	DISPATCH(runtime·call64, 64)
 	DISPATCH(runtime·call128, 128)
@@ -366,29 +367,38 @@
 	JMP	AX
 
 #define CALLFN(NAME,MAXSIZE)			\
-TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVQ	argptr+8(FP), SI;		\
-	MOVLQZX argsize+16(FP), CX;		\
+	MOVQ	argptr+16(FP), SI;		\
+	MOVLQZX argsize+24(FP), CX;		\
 	MOVQ	SP, DI;				\
 	REP;MOVSB;				\
 	/* call function */			\
-	MOVQ	f+0(FP), DX;			\
+	MOVQ	f+8(FP), DX;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	CALL	(DX);				\
 	/* copy return values back */		\
-	MOVQ	argptr+8(FP), DI;		\
-	MOVLQZX	argsize+16(FP), CX;		\
-	MOVLQZX retoffset+20(FP), BX;		\
+	MOVQ	argptr+16(FP), DI;		\
+	MOVLQZX	argsize+24(FP), CX;		\
+	MOVLQZX retoffset+28(FP), BX;		\
 	MOVQ	SP, SI;				\
 	ADDQ	BX, DI;				\
 	ADDQ	BX, SI;				\
 	SUBQ	BX, CX;				\
 	REP;MOVSB;				\
+	/* execute write barrier updates */	\
+	MOVQ	argtype+0(FP), DX;		\
+	MOVQ	argptr+16(FP), DI;		\
+	MOVLQZX	argsize+24(FP), CX;		\
+	MOVLQZX retoffset+28(FP), BX;		\
+	MOVQ	DX, 0(SP);			\
+	MOVQ	DI, 8(SP);			\
+	MOVQ	CX, 16(SP);			\
+	MOVQ	BX, 24(SP);			\
+	CALL	runtime·callwritebarrier(SB);	\
 	RET
 
-CALLFN(·call16, 16)
 CALLFN(·call32, 32)
 CALLFN(·call64, 64)
 CALLFN(·call128, 128)
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index 20fb5df..f3752e7 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -290,7 +290,7 @@
 	JMP	runtime·morestack(SB)
 
 // reflectcall: call a function with the given argument list
-// func call(f *FuncVal, arg *byte, argsize, retoffset uint32).
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
 // we don't have variable-sized frames, so we use a small number
 // of constant-sized-frame functions to encode a few bits of size in the pc.
 // Caution: ugly multiline assembly macros in your future!
@@ -305,8 +305,8 @@
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	JMP	·reflectcall(SB)
 
-TEXT ·reflectcall(SB), NOSPLIT, $0-16
-	MOVLQZX argsize+8(FP), CX
+TEXT ·reflectcall(SB), NOSPLIT, $0-20
+	MOVLQZX argsize+12(FP), CX
 	DISPATCH(runtime·call16, 16)
 	DISPATCH(runtime·call32, 32)
 	DISPATCH(runtime·call64, 64)
@@ -338,26 +338,36 @@
 	JMP	AX
 
 #define CALLFN(NAME,MAXSIZE)			\
-TEXT NAME(SB), WRAPPER, $MAXSIZE-16;		\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVL	argptr+4(FP), SI;		\
-	MOVL	argsize+8(FP), CX;		\
+	MOVL	argptr+8(FP), SI;		\
+	MOVL	argsize+12(FP), CX;		\
 	MOVL	SP, DI;				\
 	REP;MOVSB;				\
 	/* call function */			\
-	MOVL	f+0(FP), DX;			\
+	MOVL	f+4(FP), DX;			\
 	MOVL	(DX), AX;			\
 	CALL	AX;				\
 	/* copy return values back */		\
-	MOVL	argptr+4(FP), DI;		\
-	MOVL	argsize+8(FP), CX;		\
-	MOVL	retoffset+12(FP), BX;		\
+	MOVL	argptr+8(FP), DI;		\
+	MOVL	argsize+12(FP), CX;		\
+	MOVL	retoffset+16(FP), BX;		\
 	MOVL	SP, SI;				\
 	ADDL	BX, DI;				\
 	ADDL	BX, SI;				\
 	SUBL	BX, CX;				\
 	REP;MOVSB;				\
+	/* execute write barrier updates */	\
+	MOVL	argtype+0(FP), DX;		\
+	MOVL	argptr+8(FP), DI;		\
+	MOVL	argsize+12(FP), CX;		\
+	MOVL	retoffset+16(FP), BX;		\
+	MOVL	DX, 0(SP);			\
+	MOVL	DI, 4(SP);			\
+	MOVL	CX, 8(SP);			\
+	MOVL	BX, 12(SP);			\
+	CALL	runtime·callwritebarrier(SB);	\
 	RET
 
 CALLFN(·call16, 16)
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index fdcc0e6..3253942 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -322,7 +322,7 @@
 	B runtime·morestack(SB)
 
 // reflectcall: call a function with the given argument list
-// func call(f *FuncVal, arg *byte, argsize, retoffset uint32).
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
 // we don't have variable-sized frames, so we use a small number
 // of constant-sized-frame functions to encode a few bits of size in the pc.
 // Caution: ugly multiline assembly macros in your future!
@@ -336,8 +336,8 @@
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	B	·reflectcall(SB)
 
-TEXT ·reflectcall(SB),NOSPLIT,$-4-16
-	MOVW	argsize+8(FP), R0
+TEXT ·reflectcall(SB),NOSPLIT,$-4-20
+	MOVW	argsize+12(FP), R0
 	DISPATCH(runtime·call16, 16)
 	DISPATCH(runtime·call32, 32)
 	DISPATCH(runtime·call64, 64)
@@ -369,11 +369,11 @@
 	B	(R1)
 
 #define CALLFN(NAME,MAXSIZE)			\
-TEXT NAME(SB), WRAPPER, $MAXSIZE-16;		\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVW	argptr+4(FP), R0;		\
-	MOVW	argsize+8(FP), R2;		\
+	MOVW	argptr+8(FP), R0;		\
+	MOVW	argsize+12(FP), R2;		\
 	ADD	$4, SP, R1;			\
 	CMP	$0, R2;				\
 	B.EQ	5(PC);				\
@@ -382,24 +382,37 @@
 	SUB	$1, R2, R2;			\
 	B	-5(PC);				\
 	/* call function */			\
-	MOVW	f+0(FP), R7;			\
+	MOVW	f+4(FP), R7;			\
 	MOVW	(R7), R0;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	BL	(R0);				\
 	/* copy return values back */		\
-	MOVW	argptr+4(FP), R0;		\
-	MOVW	argsize+8(FP), R2;		\
-	MOVW	retoffset+12(FP), R3;		\
+	MOVW	argptr+8(FP), R0;		\
+	MOVW	argsize+12(FP), R2;		\
+	MOVW	retoffset+16(FP), R3;		\
 	ADD	$4, SP, R1;			\
 	ADD	R3, R1;				\
 	ADD	R3, R0;				\
 	SUB	R3, R2;				\
+loop:						\
 	CMP	$0, R2;				\
-	RET.EQ	;				\
+	B.EQ	end;				\
 	MOVBU.P	1(R1), R5;			\
 	MOVBU.P R5, 1(R0);			\
 	SUB	$1, R2, R2;			\
-	B	-5(PC)				\
+	B	loop;				\
+end:						\
+	/* execute write barrier updates */	\
+	MOVW	argtype+0(FP), R1;		\
+	MOVW	argptr+8(FP), R0;		\
+	MOVW	argsize+12(FP), R2;		\
+	MOVW	retoffset+16(FP), R3;		\
+	MOVW	R1, 4(R13);			\
+	MOVW	R0, 8(R13);			\
+	MOVW	R2, 12(R13);			\
+	MOVW	R3, 16(R13);			\
+	BL	runtime·callwritebarrier(SB);	\
+	RET	
 
 CALLFN(·call16, 16)
 CALLFN(·call32, 32)
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 1360c6e..8ec051d 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -271,7 +271,7 @@
 	BR	runtime·morestack(SB)
 
 // reflectcall: call a function with the given argument list
-// func call(f *FuncVal, arg *byte, argsize, retoffset uint32).
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
 // we don't have variable-sized frames, so we use a small number
 // of constant-sized-frame functions to encode a few bits of size in the pc.
 // Caution: ugly multiline assembly macros in your future!
@@ -288,9 +288,10 @@
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	BR	·reflectcall(SB)
 
-TEXT ·reflectcall(SB), NOSPLIT, $-8-24
-	MOVWZ n+16(FP), R3
-	DISPATCH(runtime·call16, 16)
+TEXT ·reflectcall(SB), NOSPLIT, $-8-32
+	MOVWZ argsize+24(FP), R3
+	// NOTE(rsc): No call16, because CALLFN needs four words
+	// of argument space to invoke callwritebarrier.
 	DISPATCH(runtime·call32, 32)
 	DISPATCH(runtime·call64, 64)
 	DISPATCH(runtime·call128, 128)
@@ -325,8 +326,8 @@
-TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
 	NO_LOCAL_POINTERS;			\
 	/* copy arguments to stack */		\
-	MOVD	arg+8(FP), R3;			\
-	MOVWZ	n+16(FP), R4;			\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	argsize+24(FP), R4;			\
 	MOVD	R1, R5;				\
 	ADD	$(8-1), R5;			\
 	SUB	$1, R3;				\
@@ -337,15 +338,15 @@
 	MOVBZU	R6, 1(R5);			\
 	BR	-4(PC);				\
 	/* call function */			\
-	MOVD	f+0(FP), R11;			\
+	MOVD	f+8(FP), R11;			\
 	MOVD	(R11), R31;			\
 	MOVD	R31, CTR;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	BL	(CTR);				\
 	/* copy return values back */		\
-	MOVD	arg+8(FP), R3;			\
-	MOVWZ	n+16(FP), R4;			\
-	MOVWZ	retoffset+20(FP), R6;		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	argsize+24(FP), R4;		\
+	MOVWZ	retoffset+28(FP), R6;		\
 	MOVD	R1, R5;				\
 	ADD	R6, R5; 			\
 	ADD	R6, R3;				\
@@ -353,11 +354,22 @@
 	ADD	$(8-1), R5;			\
 	SUB	$1, R3;				\
 	ADD	R5, R4;				\
+loop:						\
 	CMP	R5, R4;				\
-	BEQ	4(PC);				\
+	BEQ	end;				\
 	MOVBZU	1(R5), R6;			\
 	MOVBZU	R6, 1(R3);			\
-	BR	-4(PC);				\
+	BR	loop;				\
+end:						\
+	/* execute write barrier updates */	\
+	MOVD	argtype+0(FP), R7;		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	argsize+24(FP), R4;		\
+	MOVWZ	retoffset+28(FP), R6;		\
+	MOVD	R7, 8(R1);			\
+	MOVD	R3, 16(R1);			\
+	MOVD	R4, 24(R1);			\
+	MOVD	R6, 32(R1);			\
+	BL	runtime·callwritebarrier(SB);	\
 	RETURN
 
-CALLFN(·call16, 16)
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 52b48c46..dbeea20 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -232,7 +232,12 @@
 	}
 
 	// Invoke callback.
-	reflectcall(unsafe.Pointer(cb.fn), unsafe.Pointer(cb.arg), uint32(cb.argsize), 0)
+	// NOTE(rsc): passing nil for argtype means that the copying of the
+	// results back into cb.arg happens without any corresponding write barriers.
+	// For cgo, cb.arg points into a C stack frame and therefore doesn't
+	// hold any pointers that the GC can find anyway - the write barrier
+	// would be a no-op.
+	reflectcall(nil, unsafe.Pointer(cb.fn), unsafe.Pointer(cb.arg), uint32(cb.argsize), 0)
 
 	if raceenabled {
 		racereleasemerge(unsafe.Pointer(&racecgosync))
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 58e7702..22c0dfe 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -942,7 +942,7 @@
 				default:
 					throw("bad kind in runfinq")
 				}
-				reflectcall(unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz))
+				reflectcall(nil, unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz))
 
 				// drop finalizer queue references to finalized object
 				f.fn = nil
diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go
index 2833aa7..b9718cb 100644
--- a/src/runtime/mgc0.go
+++ b/src/runtime/mgc0.go
@@ -104,6 +104,7 @@
 	_PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1)
 )
 
+//go:nosplit
 func needwb() bool {
 	return gcphase == _GCmark || gcphase == _GCmarktermination || mheap_.shadow_enabled
 }
@@ -232,7 +233,7 @@
 	// Apply changes to shadow.
 	// Since *dst has been overwritten already, we cannot check
 	// whether there were any missed updates, but writebarrierptr_nostore
-	// is only rarely used (right now there is just one call, in newstack).
+	// is only rarely used.
 	if mheap_.shadow_enabled {
 		systemstack(func() {
 			addr := uintptr(unsafe.Pointer(dst))
@@ -287,6 +288,12 @@
 // all the combinations of ptr+scalar up to four words.
 // The implementations are written to wbfat.go.
 
+//go:linkname reflect_typedmemmove reflect.typedmemmove
+func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
+	typedmemmove(typ, dst, src)
+}
+
+// typedmemmove copies a value of type t to dst from src.
 //go:nosplit
 func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
 	if !needwb() || (typ.kind&kindNoPointers) != 0 {
@@ -322,6 +329,79 @@
 	})
 }
 
+// typedmemmovepartial is like typedmemmove but assumes that
+// dst and src point off bytes into the value and only copies size bytes.
+//go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
+func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
+	if !needwb() || (typ.kind&kindNoPointers) != 0 || size < ptrSize {
+		memmove(dst, src, size)
+		return
+	}
+
+	if off&(ptrSize-1) != 0 {
+		frag := -off & (ptrSize - 1)
+		// frag < size, because size >= ptrSize, checked above.
+		memmove(dst, src, frag)
+		size -= frag
+		dst = add(noescape(dst), frag)
+		src = add(noescape(src), frag)
+		off += frag
+	}
+
+	mask := loadPtrMask(typ)
+	nptr := (off + size) / ptrSize
+	for i := uintptr(off / ptrSize); i < nptr; i++ {
+		bits := mask[i/2] >> ((i & 1) << 2)
+		if (bits>>2)&_BitsMask == _BitsPointer {
+			writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
+		} else {
+			*(*uintptr)(dst) = *(*uintptr)(src)
+		}
+		// TODO(rsc): The noescape calls should be unnecessary.
+		dst = add(noescape(dst), ptrSize)
+		src = add(noescape(src), ptrSize)
+	}
+	size &= ptrSize - 1
+	if size > 0 {
+		memmove(dst, src, size)
+	}
+}
+
+// callwritebarrier is invoked at the end of reflectcall, to execute
+// write barrier operations to record the fact that a call's return
+// values have just been copied to frame, starting at retoffset
+// and continuing to framesize. The entire frame (not just the return
+// values) is described by typ. Because the copy has already
+// happened, we call writebarrierptr_nostore, and we must be careful
+// not to be preempted before the write barriers have been run.
+//go:nosplit
+func callwritebarrier(typ *_type, frame unsafe.Pointer, framesize, retoffset uintptr) {
+	if !needwb() || typ == nil || (typ.kind&kindNoPointers) != 0 || framesize-retoffset < ptrSize {
+		return
+	}
+
+	systemstack(func() {
+		mask := loadPtrMask(typ)
+		// retoffset is known to be pointer-aligned (at least).
+		// TODO(rsc): The noescape call should be unnecessary.
+		dst := add(noescape(frame), retoffset)
+		nptr := framesize / ptrSize
+		for i := uintptr(retoffset / ptrSize); i < nptr; i++ {
+			bits := mask[i/2] >> ((i & 1) << 2)
+			if (bits>>2)&_BitsMask == _BitsPointer {
+				writebarrierptr_nostore((*uintptr)(dst), *(*uintptr)(dst))
+			}
+			// TODO(rsc): The noescape call should be unnecessary.
+			dst = add(noescape(dst), ptrSize)
+		}
+	})
+}
+
+//go:linkname reflect_typedslicecopy reflect.typedslicecopy
+func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
+	return typedslicecopy(elemType, dst, src)
+}
+
 //go:nosplit
 func typedslicecopy(typ *_type, dst, src slice) int {
 	n := dst.len
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 393c769..09278af 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -301,7 +301,7 @@
 			continue
 		}
 		d.started = true
-		reflectcall(unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
+		reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
 		if gp._defer != d {
 			throw("bad defer entry in Goexit")
 		}
@@ -401,7 +401,7 @@
 		d._panic = (*_panic)(noescape((unsafe.Pointer)(&p)))
 
 		p.argp = unsafe.Pointer(getargp(0))
-		reflectcall(unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
+		reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
 		p.argp = nil
 
 		// reflectcall did not panic. Remove d.
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 67f78bd..1114a09 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -110,7 +110,17 @@
 func setg(gg *g)
 func breakpoint()
 
-func reflectcall(fn, arg unsafe.Pointer, n uint32, retoffset uint32)
+// reflectcall calls fn with a copy of the argsize argument bytes pointed at by arg.
+// After fn returns, reflectcall copies argsize-retoffset result bytes
+// back into arg+retoffset before returning. If copying result bytes back,
+// the caller should pass the argument frame type as argtype, so that
+// reflectcall can execute appropriate write barriers during the copy.
+// Package reflect passes a frame type. In package runtime, there is only
+// one call that copies results back, in cgocallbackg1, and it does NOT pass a
+// frame type, meaning there are no write barriers invoked. See that call
+// site for justification.
+func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32)
+
 func procyield(cycles uint32)
 func cgocallback_gofunc(fv *funcval, frame unsafe.Pointer, framesize uintptr)
 func goexit()
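
Though not part of this change, ordinary reflect usage exercises the
new paths. A minimal example that drives both the reflectcall result
copy-back (now followed by callwritebarrier) and the typedslicecopy
path:

	package main

	import (
		"fmt"
		"reflect"
	)

	func main() {
		// Value.Call copies the *int result back through reflectcall,
		// so the returned pointer is recorded by the write barrier.
		f := func() *int { x := 42; return &x }
		out := reflect.ValueOf(f).Call(nil)
		fmt.Println(*out[0].Interface().(*int)) // 42

		// Copying pointer-typed elements now goes through
		// typedslicecopy rather than a raw memmove.
		src := []*int{new(int), new(int)}
		dst := make([]*int, len(src))
		n := reflect.Copy(reflect.ValueOf(dst), reflect.ValueOf(src))
		fmt.Println(n) // 2
	}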