cmd/internal/gc: inline runtime.getg

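The compiler now rewrites calls to runtime.getg into a direct read of g
(from TLS on x86, from the dedicated g register on arm, arm64, and ppc64)
instead of calling an assembly stub.

This more closely restores what the old C runtime did.
(In C, g was an 'extern register' with the same effective
implementation as in this CL.)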

On a late 2012 MacBookPro10,2, best of 5 old vs best of 5 new:

benchmark                          old ns/op      new ns/op      delta
BenchmarkBinaryTree17              4981312777     4463426605     -10.40%
BenchmarkFannkuch11                3046495712     3006819428     -1.30%
BenchmarkFmtFprintfEmpty           89.3           79.8           -10.64%
BenchmarkFmtFprintfString          284            262            -7.75%
BenchmarkFmtFprintfInt             282            262            -7.09%
BenchmarkFmtFprintfIntInt          480            448            -6.67%
BenchmarkFmtFprintfPrefixedInt     382            358            -6.28%
BenchmarkFmtFprintfFloat           529            486            -8.13%
BenchmarkFmtManyArgs               1849           1773           -4.11%
BenchmarkGobDecode                 12835963       11794385       -8.11%
BenchmarkGobEncode                 10527170       10288422       -2.27%
BenchmarkGzip                      436109569      438422516      +0.53%
BenchmarkGunzip                    110121663      109843648      -0.25%
BenchmarkHTTPClientServer          81930          85446          +4.29%
BenchmarkJSONEncode                24638574       24280603       -1.45%
BenchmarkJSONDecode                93022423       85753546       -7.81%
BenchmarkMandelbrot200             4703899        4735407        +0.67%
BenchmarkGoParse                   5319853        5086843        -4.38%
BenchmarkRegexpMatchEasy0_32       151            151            +0.00%
BenchmarkRegexpMatchEasy0_1K       452            453            +0.22%
BenchmarkRegexpMatchEasy1_32       131            132            +0.76%
BenchmarkRegexpMatchEasy1_1K       761            722            -5.12%
BenchmarkRegexpMatchMedium_32      228            224            -1.75%
BenchmarkRegexpMatchMedium_1K      63751          64296          +0.85%
BenchmarkRegexpMatchHard_32        3188           3238           +1.57%
BenchmarkRegexpMatchHard_1K        95396          96756          +1.43%
BenchmarkRevcomp                   661587262      687107364      +3.86%
BenchmarkTemplate                  108312598      104008540      -3.97%
BenchmarkTimeParse                 453            459            +1.32%
BenchmarkTimeFormat                475            441            -7.16%

The garbage benchmark from the benchmarks subrepo gets 2.6% faster as well.

Change-Id: I320aeda332db81012688b26ffab23f6581c59cfa
Reviewed-on: https://go-review.googlesource.com/8460
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
diff --git a/src/cmd/5g/galign.go b/src/cmd/5g/galign.go
index 1b349e1..0a6c655 100644
--- a/src/cmd/5g/galign.go
+++ b/src/cmd/5g/galign.go
@@ -64,6 +64,7 @@
 	gc.Thearch.Defframe = defframe
 	gc.Thearch.Excise = excise
 	gc.Thearch.Expandchecks = expandchecks
+	gc.Thearch.Getg = getg
 	gc.Thearch.Gins = gins
 	gc.Thearch.Ginscon = ginscon
 	gc.Thearch.Ginsnop = ginsnop
diff --git a/src/cmd/5g/ggen.go b/src/cmd/5g/ggen.go
index 753c6e0..edad7af 100644
--- a/src/cmd/5g/ggen.go
+++ b/src/cmd/5g/ggen.go
@@ -494,3 +494,10 @@
 	}
 	return false
 }
+
+// res = runtime.getg()
+func getg(res *gc.Node) {
+	var n1 gc.Node
+	gc.Nodreg(&n1, res.Type, arm.REGG)
+	gmove(&n1, res)
+}
diff --git a/src/cmd/6g/galign.go b/src/cmd/6g/galign.go
index a73ddc6..74be60e 100644
--- a/src/cmd/6g/galign.go
+++ b/src/cmd/6g/galign.go
@@ -96,6 +96,7 @@
 	gc.Thearch.Dodiv = dodiv
 	gc.Thearch.Excise = excise
 	gc.Thearch.Expandchecks = expandchecks
+	gc.Thearch.Getg = getg
 	gc.Thearch.Gins = gins
 	gc.Thearch.Ginscon = ginscon
 	gc.Thearch.Ginsnop = ginsnop
diff --git a/src/cmd/6g/ggen.go b/src/cmd/6g/ggen.go
index e609d0e..ceeec25 100644
--- a/src/cmd/6g/ggen.go
+++ b/src/cmd/6g/ggen.go
@@ -671,3 +671,20 @@
 	}
 	return false
 }
+
+// res = runtime.getg()
+func getg(res *gc.Node) {
+	var n1 gc.Node
+	gc.Regalloc(&n1, res.Type, res)
+	mov := optoas(gc.OAS, gc.Types[gc.Tptr])
+	p := gins(mov, nil, &n1)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = x86.REG_TLS
+	p = gins(mov, nil, &n1)
+	p.From = p.To
+	p.From.Type = obj.TYPE_MEM
+	p.From.Index = x86.REG_TLS
+	p.From.Scale = 1
+	gmove(&n1, res)
+	gc.Regfree(&n1)
+}
diff --git a/src/cmd/7g/galign.go b/src/cmd/7g/galign.go
index 1c50c21..36b54aa 100644
--- a/src/cmd/7g/galign.go
+++ b/src/cmd/7g/galign.go
@@ -62,6 +62,7 @@
 	gc.Thearch.Dodiv = dodiv
 	gc.Thearch.Excise = excise
 	gc.Thearch.Expandchecks = expandchecks
+	gc.Thearch.Getg = getg
 	gc.Thearch.Gins = gins
 	gc.Thearch.Ginscon = ginscon
 	gc.Thearch.Ginsnop = ginsnop
diff --git a/src/cmd/7g/ggen.go b/src/cmd/7g/ggen.go
index 0fc5854..94685d7 100644
--- a/src/cmd/7g/ggen.go
+++ b/src/cmd/7g/ggen.go
@@ -532,3 +532,10 @@
 		p2.To.Offset = 0
 	}
 }
+
+// res = runtime.getg()
+func getg(res *gc.Node) {
+	var n1 gc.Node
+	gc.Nodreg(&n1, res.Type, arm64.REGG)
+	gmove(&n1, res)
+}
diff --git a/src/cmd/8g/galign.go b/src/cmd/8g/galign.go
index 1c03df5..2a8e0b7 100644
--- a/src/cmd/8g/galign.go
+++ b/src/cmd/8g/galign.go
@@ -77,6 +77,7 @@
 	gc.Thearch.Dodiv = cgen_div
 	gc.Thearch.Excise = excise
 	gc.Thearch.Expandchecks = expandchecks
+	gc.Thearch.Getg = getg
 	gc.Thearch.Gins = gins
 	gc.Thearch.Ginscon = ginscon
 	gc.Thearch.Ginsnop = ginsnop
diff --git a/src/cmd/8g/ggen.go b/src/cmd/8g/ggen.go
index 115c962..9a551b0 100644
--- a/src/cmd/8g/ggen.go
+++ b/src/cmd/8g/ggen.go
@@ -944,3 +944,20 @@
 	}
 	return false
 }
+
+// res = runtime.getg()
+func getg(res *gc.Node) {
+	var n1 gc.Node
+	gc.Regalloc(&n1, res.Type, res)
+	mov := optoas(gc.OAS, gc.Types[gc.Tptr])
+	p := gins(mov, nil, &n1)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = x86.REG_TLS
+	p = gins(mov, nil, &n1)
+	p.From = p.To
+	p.From.Type = obj.TYPE_MEM
+	p.From.Index = x86.REG_TLS
+	p.From.Scale = 1
+	gmove(&n1, res)
+	gc.Regfree(&n1)
+}
diff --git a/src/cmd/9g/galign.go b/src/cmd/9g/galign.go
index b9e6c32..68eab58 100644
--- a/src/cmd/9g/galign.go
+++ b/src/cmd/9g/galign.go
@@ -71,6 +71,7 @@
 	gc.Thearch.Dodiv = dodiv
 	gc.Thearch.Excise = excise
 	gc.Thearch.Expandchecks = expandchecks
+	gc.Thearch.Getg = getg
 	gc.Thearch.Gins = gins
 	gc.Thearch.Ginscon = ginscon
 	gc.Thearch.Ginsnop = ginsnop
diff --git a/src/cmd/9g/ggen.go b/src/cmd/9g/ggen.go
index a009186..9af36a1 100644
--- a/src/cmd/9g/ggen.go
+++ b/src/cmd/9g/ggen.go
@@ -549,3 +549,10 @@
 		p2.To.Offset = 0
 	}
 }
+
+// res = runtime.getg()
+func getg(res *gc.Node) {
+	var n1 gc.Node
+	gc.Nodreg(&n1, res.Type, ppc64.REGG)
+	gmove(&n1, res)
+}
diff --git a/src/cmd/internal/gc/cgen.go b/src/cmd/internal/gc/cgen.go
index 3b628ac..886a2d1 100644
--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@@ -418,6 +418,10 @@
 		Regfree(&n1)
 		return
 
+	case OGETG:
+		Thearch.Getg(res)
+		return
+
 		// symmetric binary
 	case OAND,
 		OOR,
diff --git a/src/cmd/internal/gc/fmt.go b/src/cmd/internal/gc/fmt.go
index 38e358a..589f20c 100644
--- a/src/cmd/internal/gc/fmt.go
+++ b/src/cmd/internal/gc/fmt.go
@@ -976,6 +976,7 @@
 	OCONV:         8,
 	OCOPY:         8,
 	ODELETE:       8,
+	OGETG:         8,
 	OLEN:          8,
 	OLITERAL:      8,
 	OMAKESLICE:    8,
@@ -1363,7 +1364,7 @@
 		}
 		return fmt.Sprintf("%v(%v)", Oconv(int(n.Op), obj.FmtSharp), Hconv(n.List, obj.FmtComma))
 
-	case OCALL, OCALLFUNC, OCALLINTER, OCALLMETH:
+	case OCALL, OCALLFUNC, OCALLINTER, OCALLMETH, OGETG:
 		var f string
 		f += exprfmt(n.Left, nprec)
 		if n.Isddd {
diff --git a/src/cmd/internal/gc/gen.go b/src/cmd/internal/gc/gen.go
index e52ff65..4b7344a 100644
--- a/src/cmd/internal/gc/gen.go
+++ b/src/cmd/internal/gc/gen.go
@@ -1002,6 +1002,10 @@
 	case ORETURN, ORETJMP:
 		cgen_ret(n)
 
+	// Function calls turned into compiler intrinsics.
+	// At top level the result is unused, so the call itself can be dropped; only the argument, if any, is still evaluated to preserve its side effects.
+	case OGETG:
+		// nothing
 	case OSQRT:
 		cgen_discard(n.Left)
 
diff --git a/src/cmd/internal/gc/go.go b/src/cmd/internal/gc/go.go
index 027ad28..a6faaa5 100644
--- a/src/cmd/internal/gc/go.go
+++ b/src/cmd/internal/gc/go.go
@@ -798,6 +798,7 @@
 	Dodiv        func(int, *Node, *Node, *Node)
 	Excise       func(*Flow)
 	Expandchecks func(*obj.Prog)
+	Getg         func(*Node)
 	Gins         func(int, *Node, *Node) *obj.Prog
 	Ginscon      func(int, int64, *Node)
 	Ginsnop      func()
diff --git a/src/cmd/internal/gc/syntax.go b/src/cmd/internal/gc/syntax.go
index 736c7af..11cdf29 100644
--- a/src/cmd/internal/gc/syntax.go
+++ b/src/cmd/internal/gc/syntax.go
@@ -304,6 +304,7 @@
 	ORETJMP // return to other function
 	OPS     // compare parity set (for x86 NaN check)
 	OSQRT   // sqrt(float64), on systems that have hw support
+	OGETG   // runtime.getg() (read g pointer)
 
 	OEND
 )
diff --git a/src/cmd/internal/gc/typecheck.go b/src/cmd/internal/gc/typecheck.go
index 4399164..08262b1 100644
--- a/src/cmd/internal/gc/typecheck.go
+++ b/src/cmd/internal/gc/typecheck.go
@@ -1366,6 +1366,17 @@
 				t = t.Type
 			}
 			n.Type = t
+
+			if n.Op == OCALLFUNC && n.Left.Op == ONAME && (compiling_runtime != 0 || n.Left.Sym.Pkg == Runtimepkg) && n.Left.Sym.Name == "getg" {
+				// Emit code for runtime.getg() directly instead of calling function.
+				// Most such rewrites (for example the similar one for math.Sqrt) should be done in walk,
+				// so that the ordering pass can make sure to preserve the semantics of the original code
+				// (in particular, the exact time of the function call) by introducing temporaries.
+				// In this case, we know getg() always returns the same result within a given function
+				// and we want to avoid the temporaries, so we do the rewrite earlier than is typical.
+				n.Op = OGETG
+			}
+
 			break OpSwitch
 		}
 
@@ -1376,6 +1387,7 @@
 		}
 
 		n.Type = getoutargx(l.Type)
+
 		break OpSwitch
 
 	case OCAP, OLEN, OREAL, OIMAG:
diff --git a/src/cmd/internal/gc/walk.go b/src/cmd/internal/gc/walk.go
index bf91116..1012aa0 100644
--- a/src/cmd/internal/gc/walk.go
+++ b/src/cmd/internal/gc/walk.go
@@ -179,7 +179,8 @@
 		OPRINTN,
 		OPANIC,
 		OEMPTY,
-		ORECOVER:
+		ORECOVER,
+		OGETG:
 		if n.Typecheck == 0 {
 			Fatal("missing typecheck: %v", Nconv(n, obj.FmtSign))
 		}
@@ -424,7 +425,8 @@
 		ONONAME,
 		OINDREG,
 		OEMPTY,
-		OPARAM:
+		OPARAM,
+		OGETG:
 		goto ret
 
 	case ONOT,
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index bee8b29..f2222d0 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -1639,12 +1639,6 @@
 	// traceback from goexit1 must hit code range of goexit
 	BYTE	$0x90	// NOP
 
-TEXT runtime·getg(SB),NOSPLIT,$0-4
-	get_tls(CX)
-	MOVL	g(CX), AX
-	MOVL	AX, ret+0(FP)
-	RET
-
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
 	MOVL	addr+0(FP), AX
 	PREFETCHT0	(AX)
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 946e151..0e5389f 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1673,12 +1673,6 @@
 	// traceback from goexit1 must hit code range of goexit
 	BYTE	$0x90	// NOP
 
-TEXT runtime·getg(SB),NOSPLIT,$0-8
-	get_tls(CX)
-	MOVQ	g(CX), AX
-	MOVQ	AX, ret+0(FP)
-	RET
-
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
 	MOVQ	addr+0(FP), AX
 	PREFETCHT0	(AX)
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index e144c40..23e2cb9 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -1096,12 +1096,6 @@
 	// traceback from goexit1 must hit code range of goexit
 	BYTE	$0x90	// NOP
 
-TEXT runtime·getg(SB),NOSPLIT,$0-4
-	get_tls(CX)
-	MOVL	g(CX), AX
-	MOVL	AX, ret+0(FP)
-	RET
-
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
 	MOVL	addr+0(FP), AX
 	PREFETCHT0	(AX)
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index a2e1e4b..b7042ea 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -984,10 +984,6 @@
 	// traceback from goexit1 must hit code range of goexit
 	MOVW	R0, R0	// NOP
 
-TEXT runtime·getg(SB),NOSPLIT,$-4-4
-	MOVW	g, ret+0(FP)
-	RET
-
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
 	RET
 
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 3c09d53..0b21a1d 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -903,10 +903,6 @@
 	MOVD	R0, R0	// NOP
 	BL	runtime·goexit1(SB)	// does not return
 
-TEXT runtime·getg(SB),NOSPLIT,$-8-8
-	MOVD	g, ret+0(FP)
-	RET
-
 // TODO(aram): use PRFM here.
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
 	RET
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index ef64050..5b7ad41 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1134,10 +1134,6 @@
 	// traceback from goexit1 must hit code range of goexit
 	MOVD	R0, R0	// NOP
 
-TEXT runtime·getg(SB),NOSPLIT,$-8-8
-	MOVD	g, ret+0(FP)
-	RETURN
-
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
 	RETURN
 
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 99d8dd4..7b6fbb0 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -18,6 +18,9 @@
 	return unsafe.Pointer(uintptr(p) + x)
 }
 
+// getg returns the pointer to the current g.
+// The compiler rewrites calls to this function into instructions
+// that fetch the g directly (from TLS or from the dedicated register).
 func getg() *g
 
 // mcall switches from the g to the g0 stack and invokes fn(g),