runtime: get rid of most uses of REP for copying/zeroing.

REP MOVSQ and REP STOSQ have a really high startup overhead.
Use a Duff's device to do the repetition instead.

benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat32       7.20          1.60          -77.78%
BenchmarkCopyFat32        6.88          2.38          -65.41%
BenchmarkClearFat64       7.15          3.20          -55.24%
BenchmarkCopyFat64        6.88          3.44          -50.00%
BenchmarkClearFat128      9.53          5.34          -43.97%
BenchmarkCopyFat128       9.27          5.56          -40.02%
BenchmarkClearFat256      13.8          9.53          -30.94%
BenchmarkCopyFat256       13.5          10.3          -23.70%
BenchmarkClearFat512      22.3          18.0          -19.28%
BenchmarkCopyFat512       22.0          19.7          -10.45%
BenchmarkCopyFat1024      36.5          38.4          +5.21%
BenchmarkClearFat1024     35.1          35.0          -0.28%

TODO: use for stack frame zeroing
TODO: REP prefixes are still used for "reverse" copying when src/dst
regions overlap.  Might be worth fixing.

LGTM=rsc
R=golang-codereviews, rsc
CC=golang-codereviews, r
https://golang.org/cl/81370046
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index bb3bcaf..ee9697c 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -1348,3 +1348,797 @@
 	SETEQ	CX	// 1 if alen == blen
 	LEAL	-1(CX)(AX*2), AX	// 1,0,-1 result
 	RET
+
+// A Duff's device for zeroing memory.
+// The compiler jumps to computed addresses within
+// this routine to zero chunks of memory.  Do not
+// change this code without also changing the code
+// in ../../cmd/8g/ggen.c:clearfat.
+// AX: zero
+// DI: ptr to memory to be zeroed
+// DI is updated as a side effect.
+TEXT runtime·duffzero(SB), NOSPLIT, $0-0
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	STOSL
+	RET
+
+// A Duff's device for copying memory.
+// The compiler jumps to computed addresses within
+// this routine to copy chunks of memory.  Source
+// and destination must not overlap.  Do not
+// change this code without also changing the code
+// in ../../cmd/6g/cgen.c:sgen.
+// SI: ptr to source memory
+// DI: ptr to destination memory
+// SI and DI are updated as a side effect.
+
+// NOTE: this is equivalent to a sequence of MOVSL but
+// for some reason MOVSL is really slow.
+TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	MOVL	(SI),CX
+	ADDL	$4,SI
+	MOVL	CX,(DI)
+	ADDL	$4,DI
+	
+	RET