runtime: buffered write barrier for 386
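
This wires up the compiler's buffered write barrier on 386 and adds
the runtime·gcWriteBarrier assembly fast path, mirroring the existing
amd64 support. Instead of calling into the runtime's Go code for
every pointer write, compiled code calls gcWriteBarrier, which
records the write in a small per-P buffer and only calls wbBufFlush
when that buffer fills.

For orientation, a Go-flavored sketch of the buffer the new assembly
manipulates, assuming the wbBuf layout already defined in
runtime/mwbbuf.go (the buf size shown is illustrative, not the real
constant):

	type wbBuf struct {
		next uintptr          // pointer into buf; advances 8 bytes per record on 386
		end  uintptr          // the buffer is full when next reaches end
		buf  [2 * 512]uintptr // illustrative size; each record is two words
	}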

Updates #22460.

Change-Id: I3c8e90fd6bcda7e28911036591873d63665aaca7
Reviewed-on: https://go-review.googlesource.com/92696
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/gc/main.go b/src/cmd/compile/internal/gc/main.go
index b651c9a..34ab297 100644
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@@ -407,9 +407,11 @@
 		Debug['l'] = 1 - Debug['l']
 	}
 
-	// The buffered write barrier is only implemented on amd64
-	// right now.
-	if objabi.GOARCH != "amd64" {
+	switch objabi.GOARCH {
+	case "amd64", "386":
+	default:
+		// Other architectures don't support the buffered
+		// write barrier yet.
 		Debug_eagerwb = 1
 	}
 
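The gate above, restated as a predicate (a sketch for illustration;
the real code just sets Debug_eagerwb, which the rest of the compiler
consults to fall back to the eager write barrier):

	// haveBufferedWB is a hypothetical helper, not part of this CL.
	// After this change only amd64 and 386 implement the buffered
	// write barrier; every other architecture keeps the eager one.
	func haveBufferedWB(goarch string) bool {
		switch goarch {
		case "amd64", "386":
			return true
		}
		return false
	}
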
diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go
index ae6caee..a4f4d7e 100644
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -294,6 +294,17 @@
 
 		// runtime call clobber R12 on nacl
 		opcodeTable[OpARMCALLudiv].reg.clobbers |= 1 << 12 // R12
+
+	// Returns clobber BP on nacl/386, so the write
+	// barrier does too.
+		opcodeTable[Op386LoweredWB].reg.clobbers |= 1 << 5 // BP
+	}
+
+	if ctxt.Flag_shared {
+	// LoweredWB is secretly a CALL, and CALLs on 386 in
+		// shared mode get rewritten by obj6.go to go through
+		// the GOT, which clobbers BX.
+		opcodeTable[Op386LoweredWB].reg.clobbers |= 1 << 3 // BX
 	}
 
 	// cutoff is compared with product of numblocks and numvalues,
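
The bit positions in these masks follow the backend's 386 register
numbering, which matches the x86 encoding order. A reference sketch
(the numbering is assumed from the "1 << 5 // BP" and "1 << 3 // BX"
comments above, plus the DI/AX inputInfo in opGen.go below):

	// Assumed SSA register numbering for 386 (x86 encoding order).
	const (
		regAX = iota // bit 0
		regCX        // bit 1
		regDX        // bit 2
		regBX        // bit 3: 1<<3 == 8, the shared-mode clobber
		regSP        // bit 4
		regBP        // bit 5: 1<<5 == 32, the nacl clobber
		regSI        // bit 6
		regDI        // bit 7
	)
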
diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules
index e012891..8fc7d0d 100644
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
@@ -407,6 +407,9 @@
 
 (If cond yes no) -> (NE (TESTB cond cond) yes no)
 
+// Write barrier.
+(WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)
+
 // ***************************
 // Above: lowering rules
 // Below: optimizations
diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go
index f5f46fa..d9aaf5d 100644
--- a/src/cmd/compile/internal/ssa/gen/386Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@@ -450,6 +450,10 @@
 		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
 		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
 
+		// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+		// It saves all GP registers if necessary, but may clobber others.
+		{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
 		// MOVLconvert converts between pointers and integers.
 		// We have a special op for this so as to not confuse GC
 		// (particularly stack maps).  It takes a memory arg so it
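
The clobber set callerSave &^ gp says LoweredWB clobbers every
caller-saved register except the general-purpose ones, which the
assembly saves itself; on 386 that leaves the eight SSE registers.
A quick self-contained check against the generated value in opGen.go
below, assuming X0..X7 occupy bits 8..15 of the mask:

	package main

	import "fmt"

	func main() {
		var mask uint32
		for bit := uint(8); bit <= 15; bit++ { // X0..X7, by assumption
			mask |= 1 << bit
		}
		fmt.Println(mask) // 65280, matching the clobbers field below
	}
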
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 7569545..200298d 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -399,6 +399,7 @@
 	Op386LoweredGetCallerPC
 	Op386LoweredGetCallerSP
 	Op386LoweredNilCheck
+	Op386LoweredWB
 	Op386MOVLconvert
 	Op386FlagEQ
 	Op386FlagLT_ULT
@@ -4390,6 +4391,20 @@
 		},
 	},
 	{
+		name:         "LoweredWB",
+		auxType:      auxSym,
+		argLen:       3,
+		clobberFlags: true,
+		symEffect:    SymNone,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 128}, // DI
+				{1, 1},   // AX
+			},
+			clobbers: 65280, // X0 X1 X2 X3 X4 X5 X6 X7
+		},
+	},
+	{
 		name:         "MOVLconvert",
 		argLen:       2,
 		resultInArg0: true,
diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go
index 32e8608..286df9d 100644
--- a/src/cmd/compile/internal/ssa/rewrite386.go
+++ b/src/cmd/compile/internal/ssa/rewrite386.go
@@ -577,6 +577,8 @@
 		return rewriteValue386_OpTrunc32to16_0(v)
 	case OpTrunc32to8:
 		return rewriteValue386_OpTrunc32to8_0(v)
+	case OpWB:
+		return rewriteValue386_OpWB_0(v)
 	case OpXor16:
 		return rewriteValue386_OpXor16_0(v)
 	case OpXor32:
@@ -17922,6 +17924,24 @@
 		return true
 	}
 }
+func rewriteValue386_OpWB_0(v *Value) bool {
+	// match: (WB {fn} destptr srcptr mem)
+	// cond:
+	// result: (LoweredWB {fn} destptr srcptr mem)
+	for {
+		fn := v.Aux
+		_ = v.Args[2]
+		destptr := v.Args[0]
+		srcptr := v.Args[1]
+		mem := v.Args[2]
+		v.reset(Op386LoweredWB)
+		v.Aux = fn
+		v.AddArg(destptr)
+		v.AddArg(srcptr)
+		v.AddArg(mem)
+		return true
+	}
+}
 func rewriteValue386_OpXor16_0(v *Value) bool {
 	// match: (Xor16 x y)
 	// cond:
diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go
index 69217f2..17ce803 100644
--- a/src/cmd/compile/internal/x86/ssa.go
+++ b/src/cmd/compile/internal/x86/ssa.go
@@ -691,6 +691,12 @@
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
 
+	case ssa.Op386LoweredWB:
+		p := s.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = v.Aux.(*obj.LSym)
+
 	case ssa.Op386CALLstatic, ssa.Op386CALLclosure, ssa.Op386CALLinter:
 		s.Call(v)
 	case ssa.Op386NEGL,
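
Since the regInfo pins the inputs to DI and AX, emission is just a
bare CALL; register allocation has already placed the operands. For
a pointer store *p = v, the generated 386 code looks roughly like
this (a sketch; the exact MOVs and the runtime.writeBarrier guard
emitted by the writebarrier pass depend on the surrounding code):

	MOVL	v, AX                      // value being written
	MOVL	p, DI                      // destination slot
	CALL	runtime·gcWriteBarrier(SB) // performs the MOVL AX, (DI) itself
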
diff --git a/src/cmd/vet/all/whitelist/386.txt b/src/cmd/vet/all/whitelist/386.txt
index 505856f..744ac65 100644
--- a/src/cmd/vet/all/whitelist/386.txt
+++ b/src/cmd/vet/all/whitelist/386.txt
@@ -25,3 +25,5 @@
 runtime/asm_386.s: [386] float64touint32: function float64touint32 missing Go declaration
 
 runtime/asm_386.s: [386] stackcheck: function stackcheck missing Go declaration
+
+runtime/asm_ARCHSUFF.s: [GOARCH] gcWriteBarrier: function gcWriteBarrier missing Go declaration
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 80a1451..ee6d768 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -1695,3 +1695,61 @@
 	MOVL	4(SP), AX
 	MOVL	AX, ret+8(FP)
 	RET
+
+// gcWriteBarrier performs a heap pointer write and informs the GC.
+//
+// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
+// - DI is the destination of the write
+// - AX is the value being written at DI
+// It clobbers FLAGS. It does not clobber any general-purpose registers,
+// but may clobber others (e.g., SSE registers).
+TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$28
+	// Save the registers clobbered by the fast path. This is slightly
+	// faster than having the caller spill these.
+	MOVL	CX, 20(SP)
+	MOVL	BX, 24(SP)
+	// TODO: Consider passing g.m.p in as an argument so it can be shared
+	// across a sequence of write barriers.
+	get_tls(BX)
+	MOVL	g(BX), BX
+	MOVL	g_m(BX), BX
+	MOVL	m_p(BX), BX
+	MOVL	(p_wbBuf+wbBuf_next)(BX), CX
+	// Increment wbBuf.next position.
+	LEAL	8(CX), CX
+	MOVL	CX, (p_wbBuf+wbBuf_next)(BX)
+	CMPL	CX, (p_wbBuf+wbBuf_end)(BX)
+	// Record the write.
+	MOVL	AX, -8(CX)	// Record value
+	MOVL	(DI), BX	// TODO: This turns bad writes into bad reads.
+	MOVL	BX, -4(CX)	// Record *slot
+	// Is the buffer full? (flags set in CMPL above)
+	JEQ	flush
+ret:
+	MOVL	20(SP), CX
+	MOVL	24(SP), BX
+	// Do the write.
+	MOVL	AX, (DI)
+	RET
+
+flush:
+	// Save all general purpose registers since these could be
+	// clobbered by wbBufFlush and were not saved by the caller.
+	MOVL	DI, 0(SP)	// Also first argument to wbBufFlush
+	MOVL	AX, 4(SP)	// Also second argument to wbBufFlush
+	// BX already saved
+	// CX already saved
+	MOVL	DX, 8(SP)
+	MOVL	BP, 12(SP)
+	MOVL	SI, 16(SP)
+	// DI already saved
+
+	// This takes arguments DI and AX
+	CALL	runtime·wbBufFlush(SB)
+
+	MOVL	0(SP), DI
+	MOVL	4(SP), AX
+	MOVL	8(SP), DX
+	MOVL	12(SP), BP
+	MOVL	16(SP), SI
+	JMP	ret
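
For readers who prefer Go, the fast path above corresponds to
roughly the following runtime-internal pseudocode (a sketch only:
the real code must be assembly so it can ignore the Go ABI and avoid
stack growth and recursive write barriers; getg, wbBuf, and
wbBufFlush are the existing runtime pieces it builds on):

	func gcWriteBarrier(dst *uintptr, src uintptr) {
		buf := &getg().m.p.ptr().wbBuf
		buf.next += 8 // two 4-byte words per record on 386
		rec := (*[2]uintptr)(unsafe.Pointer(buf.next - 8))
		rec[0] = src  // "Record value"
		rec[1] = *dst // "Record *slot": the pointer being overwritten
		if buf.next == buf.end {
			wbBufFlush(dst, src) // slow path: drain the buffer
		}
		*dst = src // finally, perform the write itself
	}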