cmd/internal/gc: optimize append + write barrier

The code generated for x = append(x, v) is roughly:

	t := x
	if len(t)+1 > cap(t) {
		t = grow(t)
	}
	t[len(t)] = v
	len(t)++
	x = t

We used to generate this code as Go pseudocode during walk.
Generate it instead as actual instructions during gen.

Doing so lets us apply a few optimizations. The most important
is that when, as in the above example, the source slice and the
destination slice are the same, the code can instead do:

	t := x
	if len(t)+1 > cap(t) {
		t = grow(t)
		x = {base(t), len(t)+1, cap(t)}
	} else {
		len(x)++
	}
	t[len(t)] = v

That is, in the fast path that does not reallocate the array,
only the updated length needs to be written back to x,
not the array pointer and not the capacity. This is more like
what you'd write by hand in C. It's faster in general, since
the fast path elides two of the three stores, but it's especially
faster when the form of x is such that the base pointer write
would turn into a write barrier. No write, no barrier.
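
For illustration only (this example is not part of the CL, and the type
and function names are invented): the slice below is reached through a
heap pointer, so the base pointer store in the old code needed a write
barrier, while the len-only fast path above needs none.

	package main

	// buf holds a slice reachable through a heap pointer, so storing a
	// new base pointer into b.data requires a write barrier.
	type buf struct {
		data []byte
	}

	//go:noinline
	func add(b *buf, v byte) {
		// Fast path (enough capacity): only the length word of b.data
		// is written back; no pointer store, no write barrier.
		// Slow path (growth): growslice runs and the whole slice
		// header, including the base pointer, is written back.
		b.data = append(b.data, v)
	}

	func main() {
		b := &buf{data: make([]byte, 0, 16)}
		for i := 0; i < 32; i++ {
			add(b, byte(i))
		}
	}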

name                   old mean              new mean              delta
BinaryTree17            5.68s × (0.97,1.04)   5.81s × (0.98,1.03)   +2.35% (p=0.023)
Fannkuch11              4.41s × (0.98,1.03)   4.35s × (1.00,1.00)     ~    (p=0.090)
FmtFprintfEmpty        92.7ns × (0.91,1.16)  86.0ns × (0.94,1.11)   -7.31% (p=0.038)
FmtFprintfString        281ns × (0.96,1.08)   276ns × (0.98,1.04)     ~    (p=0.219)
FmtFprintfInt           288ns × (0.97,1.06)   274ns × (0.98,1.06)   -4.94% (p=0.002)
FmtFprintfIntInt        493ns × (0.97,1.04)   506ns × (0.99,1.01)   +2.65% (p=0.009)
FmtFprintfPrefixedInt   423ns × (0.97,1.04)   391ns × (0.99,1.01)   -7.52% (p=0.000)
FmtFprintfFloat         598ns × (0.99,1.01)   566ns × (0.99,1.01)   -5.27% (p=0.000)
FmtManyArgs            1.89µs × (0.98,1.05)  1.91µs × (0.99,1.01)     ~    (p=0.231)
GobDecode              14.8ms × (0.98,1.03)  15.3ms × (0.99,1.02)   +3.01% (p=0.000)
GobEncode              12.3ms × (0.98,1.01)  11.5ms × (0.97,1.03)   -5.93% (p=0.000)
Gzip                    656ms × (0.99,1.05)   645ms × (0.99,1.01)     ~    (p=0.055)
Gunzip                  142ms × (1.00,1.00)   142ms × (1.00,1.00)   -0.32% (p=0.034)
HTTPClientServer       91.2µs × (0.97,1.04)  90.5µs × (0.97,1.04)     ~    (p=0.468)
JSONEncode             32.6ms × (0.97,1.08)  32.0ms × (0.98,1.03)     ~    (p=0.190)
JSONDecode              114ms × (0.97,1.05)   114ms × (0.99,1.01)     ~    (p=0.887)
Mandelbrot200          6.11ms × (0.98,1.04)  6.04ms × (1.00,1.01)     ~    (p=0.167)
GoParse                6.66ms × (0.97,1.04)  6.47ms × (0.97,1.05)   -2.81% (p=0.014)
RegexpMatchEasy0_32     159ns × (0.99,1.00)   171ns × (0.93,1.07)   +7.19% (p=0.002)
RegexpMatchEasy0_1K     538ns × (1.00,1.01)   550ns × (0.98,1.01)   +2.30% (p=0.000)
RegexpMatchEasy1_32     138ns × (1.00,1.00)   135ns × (0.99,1.02)   -1.60% (p=0.000)
RegexpMatchEasy1_1K     869ns × (0.99,1.01)   879ns × (1.00,1.01)   +1.08% (p=0.000)
RegexpMatchMedium_32    252ns × (0.99,1.01)   243ns × (1.00,1.00)   -3.71% (p=0.000)
RegexpMatchMedium_1K   72.7µs × (1.00,1.00)  70.3µs × (1.00,1.00)   -3.34% (p=0.000)
RegexpMatchHard_32     3.85µs × (1.00,1.00)  3.82µs × (1.00,1.01)   -0.81% (p=0.000)
RegexpMatchHard_1K      118µs × (1.00,1.00)   117µs × (1.00,1.00)   -0.56% (p=0.000)
Revcomp                 920ms × (0.97,1.07)   917ms × (0.97,1.04)     ~    (p=0.808)
Template                129ms × (0.98,1.03)   114ms × (0.99,1.01)  -12.06% (p=0.000)
TimeParse               619ns × (0.99,1.01)   622ns × (0.99,1.01)     ~    (p=0.062)
TimeFormat              661ns × (0.98,1.04)   665ns × (0.99,1.01)     ~    (p=0.524)

See the next CL for the combination with a similar optimization for
slice expressions. The benchmarks that are slower in this CL are still
faster overall once the two are combined.

Change-Id: I2a7421658091b2488c64741b4db15ab6c3b4cb7e
Reviewed-on: https://go-review.googlesource.com/9812
Reviewed-by: David Chase <drchase@google.com>
diff --git a/src/cmd/internal/gc/cgen.go b/src/cmd/internal/gc/cgen.go
index 92a670d..0c847c2 100644
--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@@ -67,6 +67,10 @@
 	case ODOTTYPE:
 		cgen_dottype(n, res, nil, wb)
 		return
+
+	case OAPPEND:
+		cgen_append(n, res)
+		return
 	}
 
 	if n.Ullman >= UINF {
@@ -2786,3 +2790,182 @@
 		n.Xoffset = 0
 	}
 }
+
+func cgen_append(n, res *Node) {
+	if Debug['g'] != 0 {
+		Dump("cgen_append-n", n)
+		Dump("cgen_append-res", res)
+	}
+	if res.Op != ONAME && !samesafeexpr(res, n.List.N) {
+		Dump("cgen_append-n", n)
+		Dump("cgen_append-res", res)
+		Fatal("append not lowered")
+	}
+	for l := n.List; l != nil; l = l.Next {
+		if l.N.Ullman >= UINF {
+			Fatal("append with function call arguments")
+		}
+	}
+
+	// res = append(src, x, y, z)
+	//
+	// If res and src are the same, we can avoid writing to base and cap
+	// unless we grow the underlying array.
+	needFullUpdate := !samesafeexpr(res, n.List.N)
+
+	// Copy src triple into base, len, cap.
+	base := temp(Types[Tptr])
+	len := temp(Types[TUINT])
+	cap := temp(Types[TUINT])
+
+	var src Node
+	Igen(n.List.N, &src, nil)
+	src.Type = Types[Tptr]
+	Thearch.Gmove(&src, base)
+	src.Type = Types[TUINT]
+	src.Xoffset += int64(Widthptr)
+	Thearch.Gmove(&src, len)
+	src.Xoffset += int64(Widthptr)
+	Thearch.Gmove(&src, cap)
+
+	// if len+argc <= cap goto L1
+	var rlen Node
+	Regalloc(&rlen, Types[TUINT], nil)
+	Thearch.Gmove(len, &rlen)
+	Thearch.Ginscon(Thearch.Optoas(OADD, Types[TUINT]), int64(count(n.List)-1), &rlen)
+	p := Thearch.Ginscmp(OLE, Types[TUINT], &rlen, cap, +1)
+	// Note: rlen and src are Regrealloc'ed below at the target of the
+	// branch we just emitted; do not reuse these Go variables for
+	// other purposes. They need to still describe the same things
+	// below that they describe right here.
+	Regfree(&src)
+
+	// base, len, cap = growslice(type, base, len, cap, newlen)
+	var arg Node
+	arg.Op = OINDREG
+	arg.Reg = int16(Thearch.REGSP)
+	arg.Addable = true
+	arg.Xoffset = 0
+	if HasLinkRegister() {
+		arg.Xoffset = int64(Ctxt.Arch.Ptrsize)
+	}
+	arg.Type = Ptrto(Types[TUINT8])
+	Cgen(typename(res.Type), &arg)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[Tptr]
+	Cgen(base, &arg)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[TUINT]
+	Cgen(len, &arg)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[TUINT]
+	Cgen(cap, &arg)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[TUINT]
+	Cgen(&rlen, &arg)
+	arg.Xoffset += int64(Widthptr)
+	Regfree(&rlen)
+
+	fn := syslook("growslice", 1)
+	substArgTypes(fn, res.Type.Type, res.Type.Type)
+	Ginscall(fn, 0)
+
+	if Widthptr == 4 && Widthreg == 8 {
+		arg.Xoffset += 4
+	}
+
+	arg.Type = Types[Tptr]
+	Cgen(&arg, base)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[TUINT]
+	Cgen(&arg, len)
+	arg.Xoffset += int64(Widthptr)
+
+	arg.Type = Types[TUINT]
+	Cgen(&arg, cap)
+
+	// Update res with base, len+argc, cap.
+	if needFullUpdate {
+		if Debug_append > 0 {
+			Warn("append: full update")
+		}
+		Patch(p, Pc)
+	}
+	if res.Op == ONAME {
+		Gvardef(res)
+	}
+	var dst, r1 Node
+	Igen(res, &dst, nil)
+	dst.Type = Types[TUINT]
+	dst.Xoffset += int64(Widthptr)
+	Regalloc(&r1, Types[TUINT], nil)
+	Thearch.Gmove(len, &r1)
+	Thearch.Ginscon(Thearch.Optoas(OADD, Types[TUINT]), int64(count(n.List)-1), &r1)
+	Thearch.Gmove(&r1, &dst)
+	Regfree(&r1)
+	dst.Xoffset += int64(Widthptr)
+	Thearch.Gmove(cap, &dst)
+	dst.Type = Types[Tptr]
+	dst.Xoffset -= 2 * int64(Widthptr)
+	cgen_wb(base, &dst, needwritebarrier(&dst, base))
+	Regfree(&dst)
+
+	if !needFullUpdate {
+		if Debug_append > 0 {
+			Warn("append: len-only update")
+		}
+		// goto L2;
+		// L1:
+		//	update len only
+		// L2:
+		q := Gbranch(obj.AJMP, nil, 0)
+		Patch(p, Pc)
+		// At the goto above, src refers to cap and rlen holds the new len
+		if src.Op == OREGISTER || src.Op == OINDREG {
+			Regrealloc(&src)
+		}
+		Regrealloc(&rlen)
+		src.Xoffset -= int64(Widthptr)
+		Thearch.Gmove(&rlen, &src)
+		Regfree(&src)
+		Regfree(&rlen)
+		Patch(q, Pc)
+	}
+
+	// Copy data into place.
+	// Could do write barrier check around entire copy instead of each element.
+	// Could avoid reloading registers on each iteration if we know the cgen_wb
+	// is not going to use a write barrier.
+	i := 0
+	var r2 Node
+	for l := n.List.Next; l != nil; l = l.Next {
+		Regalloc(&r1, Types[Tptr], nil)
+		Thearch.Gmove(base, &r1)
+		Regalloc(&r2, Types[TUINT], nil)
+		Thearch.Gmove(len, &r2)
+		if i > 0 {
+			Thearch.Gins(Thearch.Optoas(OADD, Types[TUINT]), Nodintconst(int64(i)), &r2)
+		}
+		w := res.Type.Type.Width
+		if Thearch.AddIndex != nil && Thearch.AddIndex(&r2, w, &r1) {
+			// r1 updated by back end
+		} else if w == 1 {
+			Thearch.Gins(Thearch.Optoas(OADD, Types[Tptr]), &r2, &r1)
+		} else {
+			Thearch.Ginscon(Thearch.Optoas(OMUL, Types[TUINT]), int64(w), &r2)
+			Thearch.Gins(Thearch.Optoas(OADD, Types[Tptr]), &r2, &r1)
+		}
+		Regfree(&r2)
+
+		r1.Op = OINDREG
+		r1.Type = res.Type.Type
+		cgen_wb(l.N, &r1, needwritebarrier(&r1, l.N))
+		Regfree(&r1)
+		i++
+	}
+}
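
Usage note (not part of the CL): the Warn calls above report which path
cgen_append chose. Assuming Debug_append is wired into the compiler's
-d debug list as "append" (an assumption here, matching the other
Debug_* options), a program like the following should report
"append: len-only update" for the in-place append and
"append: full update" when the destination differs from the source.
The names below are invented for the example.

	package main

	var events []byte

	// record appends in place: destination and source are the same
	// expression, so the non-growing path updates only the length.
	func record(v byte) {
		events = append(events, v)
	}

	// snapshot stores the result somewhere other than the source, so
	// the full slice header (base, len, cap) is written back.
	func snapshot(x []byte, v byte) []byte {
		return append(x, v)
	}

	func main() {
		record(1)
		record(2)
		_ = snapshot(events, 3)
	}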