cmd/compile,cmd/internal/obj/riscv,runtime: use Duff's devices on riscv64

Implement runtime.duffzero and runtime.duffcopy for riscv64.
Use obj.ADUFFZERO/obj.ADUFFCOPY for medium-sized, word-aligned
zeroing/moving.
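
Each duffzero iteration is two instructions (8 bytes of code) that
zero 8 bytes of memory, and each duffcopy iteration is four
instructions (16 bytes of code) that copy 8 bytes; both routines
consist of 128 such iterations. To handle an n-byte region the
compiler enters the routine at byte offset 8*(128 - n/8) for
duffzero, or 16*(128 - n/8) for duffcopy, skipping the iterations
it does not need. For example, a 64 byte zero enters duffzero at
offset 8*(128-8) = 960, leaving exactly eight stores before the RET.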

Change-Id: I42ec622055630c94cb77e286d8d33dbe7c9f846c
Reviewed-on: https://go-review.googlesource.com/c/go/+/237797
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/riscv64/ggen.go b/src/cmd/compile/internal/riscv64/ggen.go
index be31fad..f7c03fe 100644
--- a/src/cmd/compile/internal/riscv64/ggen.go
+++ b/src/cmd/compile/internal/riscv64/ggen.go
@@ -25,7 +25,15 @@
 		return p
 	}
 
-	// TODO(jsing): Add a duff zero implementation for medium sized ranges.
+	if cnt <= int64(128*gc.Widthptr) {
+		p = pp.Appendpp(p, riscv.AADDI, obj.TYPE_CONST, 0, off, obj.TYPE_REG, riscv.REG_A0, 0)
+		p.Reg = riscv.REG_SP
+		p = pp.Appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_MEM, 0, 0)
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffzero
+		p.To.Offset = 8 * (128 - cnt/int64(gc.Widthptr))
+		return p
+	}
 
 	// Loop, zeroing pointer width bytes at a time.
 	// ADD	$(off), SP, T0
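
[Commentary, not part of the patch: for a frame region of cnt bytes the
two Progs appended above correspond roughly to

	ADDI	$off, SP, A0
	DUFFZERO	runtime·duffzero+8*(128-cnt/8)(SB)

so the address to be zeroed arrives in A0 (X10), which duffzero then
advances as a side effect.]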
diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go
index 064a1ca..0beb5b4 100644
--- a/src/cmd/compile/internal/riscv64/ssa.go
+++ b/src/cmd/compile/internal/riscv64/ssa.go
@@ -608,6 +608,20 @@
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
 
+	case ssa.OpRISCV64DUFFZERO:
+		p := s.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffzero
+		p.To.Offset = v.AuxInt
+
+	case ssa.OpRISCV64DUFFCOPY:
+		p := s.Prog(obj.ADUFFCOPY)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffcopy
+		p.To.Offset = v.AuxInt
+
 	default:
 		v.Fatalf("Unhandled op %v", v.Op)
 	}
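
[Commentary, not part of the patch: v.AuxInt here is the entry offset
already computed by the rewrite rules, so the emitted pseudo-instruction
calls into the middle of the runtime routine. For instance, a 16-byte
zero arrives as DUFFZERO [1008] (8*(128-2)), and 1008 becomes
p.To.Offset on the call to runtime.duffzero.]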
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64.rules b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
index 3bc2e84..325cbeb 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64.rules
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
@@ -360,6 +360,13 @@
 (Zero [4] ptr mem) => (MOVWstore ptr (MOVWconst) mem)
 (Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst) mem)
 
+// Medium zeroing uses a Duff's device
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+	&& s%8 == 0 && s >= 16 && s <= 8*128
+	&& t.Alignment()%8 == 0 && !config.noDuffDevice =>
+	(DUFFZERO [8 * (128 - s/8)] ptr mem)
+
 // Generic zeroing uses a loop
 (Zero [s] {t} ptr mem) =>
 	(LoweredZero [t.Alignment()]
@@ -395,6 +402,13 @@
 (Move [4] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
 (Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
 
+// Medium move uses a Duff's device
+// 16 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+	&& s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0
+	&& !config.noDuffDevice && logLargeCopy(v, s) =>
+	(DUFFCOPY [16 * (128 - s/8)] dst src mem)
+
 // Generic move uses a loop
 (Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
 	(LoweredMove [t.Alignment()]
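
[Commentary, not part of the patch: a sketch of code that should now take
the Duff path on riscv64, assuming the thresholds above:

	var a, b [8]int64 // 64 bytes each, 8-byte aligned
	a = [8]int64{}    // Zero [64] => DUFFZERO [960]
	b = a             // Move [64] => DUFFCOPY [1920]

Sizes below 16 bytes keep the single-store rules above, while unaligned
sizes or sizes over 8*128 = 1024 bytes fall through to the
LoweredZero/LoweredMove loop lowerings.]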
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
index ebd515b..f643192 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
@@ -240,6 +240,44 @@
 		{name: "CALLclosure", argLength: 3, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
 		{name: "CALLinter", argLength: 2, reg: callInter, aux: "CallOff", call: true},     // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
 
+		// duffzero
+		// arg0 = address of memory to zero (in X10, changed as side effect)
+		// arg1 = mem
+		// auxint = offset into duffzero code to start executing
+		// X1 (link register) changed because of function call
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:   []regMask{regNamed["X10"]},
+				clobbers: regNamed["X1"] | regNamed["X10"],
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+		},
+
+		// duffcopy
+		// arg0 = address of dst memory (in X11, changed as side effect)
+		// arg1 = address of src memory (in X10, changed as side effect)
+		// arg2 = mem
+		// auxint = offset into duffcopy code to start executing
+		// X1 (link register) changed because of function call
+		// returns mem
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{regNamed["X11"], regNamed["X10"]},
+				clobbers: regNamed["X1"] | regNamed["X10"] | regNamed["X11"],
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+		},
+
 		// Generic moves and zeros
 
 		// general unaligned zeroing
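
[Commentary, not part of the patch: X1 appears in both clobber sets
because these pseudo-ops are calls; reaching the runtime routine
overwrites the link register, exactly as the comments above note.]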
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index bb1cbc0..96aa3ad 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2111,6 +2111,8 @@
 	OpRISCV64CALLstatic
 	OpRISCV64CALLclosure
 	OpRISCV64CALLinter
+	OpRISCV64DUFFZERO
+	OpRISCV64DUFFCOPY
 	OpRISCV64LoweredZero
 	OpRISCV64LoweredMove
 	OpRISCV64LoweredAtomicLoad8
@@ -28163,6 +28165,32 @@
 		},
 	},
 	{
+		name:           "DUFFZERO",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 512}, // X10
+			},
+			clobbers: 513, // X1 X10
+		},
+	},
+	{
+		name:           "DUFFCOPY",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1024}, // X11
+				{1, 512},  // X10
+			},
+			clobbers: 1537, // X1 X10 X11
+		},
+	},
+	{
 		name:           "LoweredZero",
 		auxType:        auxInt64,
 		argLen:         3,
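
[Commentary, not part of the patch: opGen.go is regenerated from
RISCV64Ops.go. With X1 at bit 0 of the register mask, X10 is bit 9
(mask 512) and X11 is bit 10 (mask 1024), so the clobber sets above come
out to 1|512 = 513 (X1 X10) and 1|512|1024 = 1537 (X1 X10 X11).]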
diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
index ac92945..c337ffb 100644
--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go
+++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
@@ -2017,6 +2017,23 @@
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
+	// cond: s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
+	// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		t := auxToType(v.Aux)
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
+			break
+		}
+		v.reset(OpRISCV64DUFFCOPY)
+		v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+		v.AddArg3(dst, src, mem)
+		return true
+	}
+	// match: (Move [s] {t} dst src mem)
 	// cond: (s <= 16 || logLargeCopy(v, s))
 	// result: (LoweredMove [t.Alignment()] dst src (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src) mem)
 	for {
@@ -5650,6 +5667,22 @@
 		return true
 	}
 	// match: (Zero [s] {t} ptr mem)
+	// cond: s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice
+	// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mem := v_1
+		if !(s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice) {
+			break
+		}
+		v.reset(OpRISCV64DUFFZERO)
+		v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] {t} ptr mem)
 	// result: (LoweredZero [t.Alignment()] ptr (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
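
[Commentary, not part of the patch: rewriteRISCV64.go is likewise
generated code; the two new match blocks are mechanical translations of
the DUFFZERO and DUFFCOPY rules added to RISCV64.rules above.]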
diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
index da49f67..5301e44 100644
--- a/src/cmd/internal/obj/riscv/obj.go
+++ b/src/cmd/internal/obj/riscv/obj.go
@@ -33,7 +33,7 @@
 // lr is the link register to use for the JALR.
-// p must be a CALL, JMP or RET.
+// p must be a CALL, JMP, RET, DUFFZERO or DUFFCOPY.
 func jalrToSym(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc, lr int16) *obj.Prog {
-	if p.As != obj.ACALL && p.As != obj.AJMP && p.As != obj.ARET {
+	if p.As != obj.ACALL && p.As != obj.AJMP && p.As != obj.ARET && p.As != obj.ADUFFZERO && p.As != obj.ADUFFCOPY {
 		ctxt.Diag("unexpected Prog in jalrToSym: %v", p)
 		return p
 	}
@@ -417,7 +417,7 @@
 	// CALLs are CALL or JAL(R) with link register LR.
 	for p := sym.Func().Text; p != nil; p = p.Link {
 		switch p.As {
-		case obj.ACALL:
+		case obj.ACALL, obj.ADUFFZERO, obj.ADUFFCOPY:
 			return true
 		case AJAL, AJALR:
 			if p.From.Type == obj.TYPE_REG && p.From.Reg == REG_LR {
@@ -656,7 +656,7 @@
 				p.From.Reg = REG_SP
 			}
 
-		case obj.ACALL:
+		case obj.ACALL, obj.ADUFFZERO, obj.ADUFFCOPY:
 			switch p.To.Type {
 			case obj.TYPE_MEM:
 				jalrToSym(ctxt, p, newprog, REG_LR)
@@ -1696,6 +1696,8 @@
 	obj.APCDATA:   pseudoOpEncoding,
 	obj.ATEXT:     pseudoOpEncoding,
 	obj.ANOP:      pseudoOpEncoding,
+	obj.ADUFFZERO: pseudoOpEncoding,
+	obj.ADUFFCOPY: pseudoOpEncoding,
 }
 
 // encodingForAs returns the encoding for an obj.As.
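
[Commentary, not part of the patch: ADUFFZERO and ADUFFCOPY are treated
as pseudo-ops and lowered exactly like ACALL, so jalrToSym expands them
into an AUIPC+JALR pair whose target is an offset within
runtime.duffzero or runtime.duffcopy.]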
diff --git a/src/runtime/duff_riscv64.s b/src/runtime/duff_riscv64.s
new file mode 100644
index 0000000..f7bd3f3
--- /dev/null
+++ b/src/runtime/duff_riscv64.s
@@ -0,0 +1,907 @@
+// Code generated by mkduff.go; DO NOT EDIT.
+// Run go generate from src/runtime to update.
+// See mkduff.go for comments.
+
+#include "textflag.h"
+
+TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	RET
+
+TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	RET
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 8859ed6..6ddf025 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -38,6 +38,7 @@
 	gen("arm64", notags, zeroARM64, copyARM64)
 	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
 	gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
+	gen("riscv64", notags, zeroRISCV64, copyRISCV64)
 }
 
 func gen(arch string, tags, zero, copy func(io.Writer)) {
@@ -227,3 +228,30 @@
 	}
 	fmt.Fprintln(w, "\tRET")
 }
+
+func zeroRISCV64(w io.Writer) {
+	// ZERO: always zero
+	// X10: ptr to memory to be zeroed
+	// X10 is updated as a side effect.
+	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\tZERO, (X10)")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+	}
+	fmt.Fprintln(w, "\tRET")
+}
+
+func copyRISCV64(w io.Writer) {
+	// X10: ptr to source memory
+	// X11: ptr to destination memory
+	// X10 and X11 are updated as a side effect
+	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\t(X10), X31")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+		fmt.Fprintln(w, "\tMOV\tX31, (X11)")
+		fmt.Fprintln(w, "\tADD\t$8, X11")
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintln(w, "\tRET")
+}
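
[Commentary, not part of the patch: a minimal program for eyeballing the
new lowering with a riscv64 toolchain; all names here are illustrative:

	package main

	type block struct{ a [32]int64 } // 256 bytes, 8-byte aligned

	//go:noinline
	func zeroBlock(b *block) { *b = block{} } // expect DUFFZERO

	//go:noinline
	func copyBlock(dst, src *block) { *dst = *src } // expect DUFFCOPY

	func main() {
		var s, d block
		zeroBlock(&d)
		copyBlock(&d, &s)
	}

Building with GOARCH=riscv64 go build -gcflags=-S should show calls into
runtime.duffzero and runtime.duffcopy at nonzero offsets.]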