cmd/compile: avoid a spill in append fast path

Instead of spilling newlen, recalculate it.
This removes a spill from the fast path,
at the cost of a cheap recalculation
on the (rare) growth path.
This uses 8 bytes less of stack space.
It generates two more bytes of code,
but that is due to suboptimal register allocation;
see far below.

Runtime append microbenchmarks are all over the map,
presumably due to incidental code movement.

Sample code:

func s(b []byte) []byte {
	b = append(b, 1, 2, 3)
	return b
}

Before:

"".s t=1 size=160 args=0x30 locals=0x48
	0x0000 00000 (append.go:8)	TEXT	"".s(SB), $72-48
	0x0000 00000 (append.go:8)	MOVQ	(TLS), CX
	0x0009 00009 (append.go:8)	CMPQ	SP, 16(CX)
	0x000d 00013 (append.go:8)	JLS	149
	0x0013 00019 (append.go:8)	SUBQ	$72, SP
	0x0017 00023 (append.go:8)	FUNCDATA	$0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB)
	0x0017 00023 (append.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0017 00023 (append.go:9)	MOVQ	"".b+88(FP), CX
	0x001c 00028 (append.go:9)	LEAQ	3(CX), DX
	0x0020 00032 (append.go:9)	MOVQ	DX, "".autotmp_0+64(SP)
	0x0025 00037 (append.go:9)	MOVQ	"".b+96(FP), BX
	0x002a 00042 (append.go:9)	CMPQ	DX, BX
	0x002d 00045 (append.go:9)	JGT	$0, 86
	0x002f 00047 (append.go:8)	MOVQ	"".b+80(FP), AX
	0x0034 00052 (append.go:9)	MOVB	$1, (AX)(CX*1)
	0x0038 00056 (append.go:9)	MOVB	$2, 1(AX)(CX*1)
	0x003d 00061 (append.go:9)	MOVB	$3, 2(AX)(CX*1)
	0x0042 00066 (append.go:10)	MOVQ	AX, "".~r1+104(FP)
	0x0047 00071 (append.go:10)	MOVQ	DX, "".~r1+112(FP)
	0x004c 00076 (append.go:10)	MOVQ	BX, "".~r1+120(FP)
	0x0051 00081 (append.go:10)	ADDQ	$72, SP
	0x0055 00085 (append.go:10)	RET
	0x0056 00086 (append.go:9)	LEAQ	type.[]uint8(SB), AX
	0x005d 00093 (append.go:9)	MOVQ	AX, (SP)
	0x0061 00097 (append.go:9)	MOVQ	"".b+80(FP), BP
	0x0066 00102 (append.go:9)	MOVQ	BP, 8(SP)
	0x006b 00107 (append.go:9)	MOVQ	CX, 16(SP)
	0x0070 00112 (append.go:9)	MOVQ	BX, 24(SP)
	0x0075 00117 (append.go:9)	MOVQ	DX, 32(SP)
	0x007a 00122 (append.go:9)	PCDATA	$0, $0
	0x007a 00122 (append.go:9)	CALL	runtime.growslice(SB)
	0x007f 00127 (append.go:9)	MOVQ	40(SP), AX
	0x0084 00132 (append.go:9)	MOVQ	56(SP), BX
	0x0089 00137 (append.go:8)	MOVQ	"".b+88(FP), CX
	0x008e 00142 (append.go:9)	MOVQ	"".autotmp_0+64(SP), DX
	0x0093 00147 (append.go:9)	JMP	52
	0x0095 00149 (append.go:9)	NOP
	0x0095 00149 (append.go:8)	CALL	runtime.morestack_noctxt(SB)
	0x009a 00154 (append.go:8)	JMP	0

After:

"".s t=1 size=176 args=0x30 locals=0x40
	0x0000 00000 (append.go:8)	TEXT	"".s(SB), $64-48
	0x0000 00000 (append.go:8)	MOVQ	(TLS), CX
	0x0009 00009 (append.go:8)	CMPQ	SP, 16(CX)
	0x000d 00013 (append.go:8)	JLS	151
	0x0013 00019 (append.go:8)	SUBQ	$64, SP
	0x0017 00023 (append.go:8)	FUNCDATA	$0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB)
	0x0017 00023 (append.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0017 00023 (append.go:9)	MOVQ	"".b+80(FP), CX
	0x001c 00028 (append.go:9)	LEAQ	3(CX), DX
	0x0020 00032 (append.go:9)	MOVQ	"".b+88(FP), BX
	0x0025 00037 (append.go:9)	CMPQ	DX, BX
	0x0028 00040 (append.go:9)	JGT	$0, 81
	0x002a 00042 (append.go:8)	MOVQ	"".b+72(FP), AX
	0x002f 00047 (append.go:9)	MOVB	$1, (AX)(CX*1)
	0x0033 00051 (append.go:9)	MOVB	$2, 1(AX)(CX*1)
	0x0038 00056 (append.go:9)	MOVB	$3, 2(AX)(CX*1)
	0x003d 00061 (append.go:10)	MOVQ	AX, "".~r1+96(FP)
	0x0042 00066 (append.go:10)	MOVQ	DX, "".~r1+104(FP)
	0x0047 00071 (append.go:10)	MOVQ	BX, "".~r1+112(FP)
	0x004c 00076 (append.go:10)	ADDQ	$64, SP
	0x0050 00080 (append.go:10)	RET
	0x0051 00081 (append.go:9)	LEAQ	type.[]uint8(SB), AX
	0x0058 00088 (append.go:9)	MOVQ	AX, (SP)
	0x005c 00092 (append.go:9)	MOVQ	"".b+72(FP), BP
	0x0061 00097 (append.go:9)	MOVQ	BP, 8(SP)
	0x0066 00102 (append.go:9)	MOVQ	CX, 16(SP)
	0x006b 00107 (append.go:9)	MOVQ	BX, 24(SP)
	0x0070 00112 (append.go:9)	MOVQ	DX, 32(SP)
	0x0075 00117 (append.go:9)	PCDATA	$0, $0
	0x0075 00117 (append.go:9)	CALL	runtime.growslice(SB)
	0x007a 00122 (append.go:9)	MOVQ	40(SP), AX
	0x007f 00127 (append.go:9)	MOVQ	48(SP), CX
	0x0084 00132 (append.go:9)	MOVQ	56(SP), BX
	0x0089 00137 (append.go:9)	ADDQ	$3, CX
	0x008d 00141 (append.go:9)	MOVQ	CX, DX
	0x0090 00144 (append.go:8)	MOVQ	"".b+80(FP), CX
	0x0095 00149 (append.go:9)	JMP	47
	0x0097 00151 (append.go:9)	NOP
	0x0097 00151 (append.go:8)	CALL	runtime.morestack_noctxt(SB)
	0x009c 00156 (append.go:8)	JMP	0

Observe that in the following sequence,
we should use DX directly instead of using
CX as a temporary register, which would make
the new code a strict improvement on the old:

	0x007f 00127 (append.go:9)	MOVQ	48(SP), CX
	0x0084 00132 (append.go:9)	MOVQ	56(SP), BX
	0x0089 00137 (append.go:9)	ADDQ	$3, CX
	0x008d 00141 (append.go:9)	MOVQ	CX, DX
	0x0090 00144 (append.go:8)	MOVQ	"".b+80(FP), CX

Change-Id: I4ee50b18fa53865901d2d7f86c2cbb54c6fa6924
Reviewed-on: https://go-review.googlesource.com/21812
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 7c5f906..d69559d9 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -337,12 +337,13 @@
 	memVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "mem"}}
 
 	// dummy nodes for temporary variables
-	ptrVar   = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}}
-	capVar   = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}}
-	typVar   = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}}
-	idataVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "idata"}}
-	okVar    = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ok"}}
-	deltaVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "delta"}}
+	ptrVar    = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}}
+	newlenVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "newlen"}}
+	capVar    = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}}
+	typVar    = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}}
+	idataVar  = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "idata"}}
+	okVar     = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ok"}}
+	deltaVar  = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "delta"}}
 )
 
 // startBlock sets the current block we're generating code in to b.
@@ -2089,15 +2090,16 @@
 // exprAppend converts an OAPPEND node n to an ssa.Value, adds it to s, and returns the Value.
 func (s *state) exprAppend(n *Node) *ssa.Value {
 	// append(s, e1, e2, e3).  Compile like:
-	// ptr,len,cap := s
+	// ptr, len, cap := s
 	// newlen := len + 3
 	// if newlen > s.cap {
-	//     ptr,_,cap = growslice(s, newlen)
+	//     ptr, len, cap = growslice(s, newlen)
+	//     newlen = len + 3 // recalculate to avoid a spill
 	// }
 	// *(ptr+len) = e1
 	// *(ptr+len+1) = e2
 	// *(ptr+len+2) = e3
-	// makeslice(ptr,newlen,cap)
+	// makeslice(ptr, newlen, cap)
 
 	et := n.Type.Elem()
 	pt := Ptrto(et)
@@ -2117,6 +2119,7 @@
 	nl := s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], l, s.constInt(Types[TINT], nargs))
 	cmp := s.newValue2(s.ssaOp(OGT, Types[TINT]), Types[TBOOL], nl, c)
 	s.vars[&ptrVar] = p
+	s.vars[&newlenVar] = nl
 	s.vars[&capVar] = c
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
@@ -2132,8 +2135,7 @@
 	r := s.rtcall(growslice, true, []*Type{pt, Types[TINT], Types[TINT]}, taddr, p, l, c, nl)
 
 	s.vars[&ptrVar] = r[0]
-	// Note: we don't need to read r[1], the result's length. It will be nl.
-	// (or maybe we should, we just have to spill/restore nl otherwise?)
+	s.vars[&newlenVar] = s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], r[1], s.constInt(Types[TINT], nargs))
 	s.vars[&capVar] = r[2]
 	b = s.endBlock()
 	b.AddEdgeTo(assign)
@@ -2154,8 +2156,9 @@
 		}
 	}
 
-	p = s.variable(&ptrVar, pt)          // generates phi for ptr
-	c = s.variable(&capVar, Types[TINT]) // generates phi for cap
+	p = s.variable(&ptrVar, pt)              // generates phi for ptr
+	nl = s.variable(&newlenVar, Types[TINT]) // generates phi for nl
+	c = s.variable(&capVar, Types[TINT])     // generates phi for cap
 	p2 := s.newValue2(ssa.OpPtrIndex, pt, p, l)
 	// TODO: just one write barrier call for all of these writes?
 	// TODO: maybe just one writeBarrier.enabled check?
@@ -2178,6 +2181,7 @@
 
 	// make result
 	delete(s.vars, &ptrVar)
+	delete(s.vars, &newlenVar)
 	delete(s.vars, &capVar)
 	return s.newValue3(ssa.OpSliceMake, n.Type, p, nl, c)
 }