cmd/internal/gc: use hardware instruction for math.Sqrt (amd64/arm)

I first prototyped this change in Sept 2011, and I discarded it
because it made no difference in the obvious benchmark loop.
It still makes no difference in the obvious benchmark loop,
but in a less obvious one, doing some extra computation
around the calls to Sqrt, not making the call does have a
significant effect.

benchmark                 old ns/op     new ns/op     delta
BenchmarkSqrt             4.56          4.57          +0.22%
BenchmarkSqrtIndirect     4.56          4.56          +0.00%
BenchmarkSqrtGo           69.4          69.4          +0.00%
BenchmarkSqrtPrime        4417          3647          -17.43%

This is a warmup for using hardware expansions for some
calls to 1-line assembly routines in the runtime (for example getg).

Change-Id: Ie66be23f8c09d0f7dc4ddd7ca8a93cfce28f55a4
Reviewed-on: https://go-review.googlesource.com/8356
Reviewed-by: Rob Pike <r@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/cmd/5g/gsubr.go b/src/cmd/5g/gsubr.go
index 0d22f74..fe4ed8d 100644
--- a/src/cmd/5g/gsubr.go
+++ b/src/cmd/5g/gsubr.go
@@ -1055,6 +1055,9 @@
 
 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = arm.ADIVD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = arm.ASQRTD
 	}
 
 	return a
diff --git a/src/cmd/5g/peep.go b/src/cmd/5g/peep.go
index 5305e4b..9ec3be2 100644
--- a/src/cmd/5g/peep.go
+++ b/src/cmd/5g/peep.go
@@ -1101,6 +1101,7 @@
 		return 0
 
 	case obj.ANOP, /* read,, write */
+		arm.ASQRTD,
 		arm.AMOVW,
 		arm.AMOVF,
 		arm.AMOVD,
diff --git a/src/cmd/5g/prog.go b/src/cmd/5g/prog.go
index bfb703e..c472cdf 100644
--- a/src/cmd/5g/prog.go
+++ b/src/cmd/5g/prog.go
@@ -70,16 +70,17 @@
 	arm.ATST:    {gc.SizeL | gc.LeftRead | gc.RightRead, 0, 0, 0},
 
 	// Floating point.
-	arm.AADDD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AADDF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ACMPD: {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ACMPF: {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ADIVD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ADIVF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ACMPD:  {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ACMPF:  {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ADIVD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ADIVF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASQRTD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 
 	// Conversions.
 	arm.AMOVWD: {gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv, 0, 0, 0},
diff --git a/src/cmd/6g/gsubr.go b/src/cmd/6g/gsubr.go
index b2290af..323ea69 100644
--- a/src/cmd/6g/gsubr.go
+++ b/src/cmd/6g/gsubr.go
@@ -1131,6 +1131,9 @@
 
 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = x86.ADIVSD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = x86.ASQRTSD
 	}
 
 	return a
diff --git a/src/cmd/6g/prog.go b/src/cmd/6g/prog.go
index 0644800..fe9f013 100644
--- a/src/cmd/6g/prog.go
+++ b/src/cmd/6g/prog.go
@@ -204,6 +204,7 @@
 	x86.ASHRL:     {gc.SizeL | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRQ:     {gc.SizeQ | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRW:     {gc.SizeW | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
+	x86.ASQRTSD:   {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 	x86.ASTOSB:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSL:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSQ:    {gc.OK, AX | DI, DI, 0},
diff --git a/src/cmd/internal/gc/cgen.go b/src/cmd/internal/gc/cgen.go
index b3524c2..d3921f7 100644
--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@@ -409,6 +409,15 @@
 		cgen_norm(n, &n1, res)
 		return
 
+	case OSQRT:
+		var n1 Node
+		Regalloc(&n1, nl.Type, res)
+		Cgen(n.Left, &n1)
+		Thearch.Gins(Thearch.Optoas(OSQRT, nl.Type), &n1, &n1)
+		Thearch.Gmove(&n1, res)
+		Regfree(&n1)
+		return
+
 		// symmetric binary
 	case OAND,
 		OOR,
diff --git a/src/cmd/internal/gc/gen.go b/src/cmd/internal/gc/gen.go
index caae2f1..e0659fc 100644
--- a/src/cmd/internal/gc/gen.go
+++ b/src/cmd/internal/gc/gen.go
@@ -1002,6 +1002,9 @@
 	case ORETURN, ORETJMP:
 		cgen_ret(n)
 
+	case OSQRT:
+		cgen_discard(n.Left)
+
 	case OCHECKNIL:
 		Cgen_checknil(n.Left)
 
diff --git a/src/cmd/internal/gc/syntax.go b/src/cmd/internal/gc/syntax.go
index 8f5b85d..671a624 100644
--- a/src/cmd/internal/gc/syntax.go
+++ b/src/cmd/internal/gc/syntax.go
@@ -293,7 +293,7 @@
 	OREGISTER // a register, such as AX.
 	OINDREG   // offset plus indirect of a register, such as 8(SP).
 
-	// 386/amd64-specific opcodes
+	// arch-specific opcodes
 	OCMP    // compare: ACMP.
 	ODEC    // decrement: ADEC.
 	OINC    // increment: AINC.
@@ -303,6 +303,7 @@
 	ORROTC  // right rotate-carry: ARCR.
 	ORETJMP // return to other function
 	OPS     // compare parity set (for x86 NaN check)
+	OSQRT   // sqrt(float64), on systems that have hw support
 
 	OEND
 )
diff --git a/src/cmd/internal/gc/walk.go b/src/cmd/internal/gc/walk.go
index c10201a..a0a29d3 100644
--- a/src/cmd/internal/gc/walk.go
+++ b/src/cmd/internal/gc/walk.go
@@ -622,6 +622,16 @@
 		walkexpr(&n.Left, init)
 		walkexprlist(n.List, init)
 
+		if n.Left.Op == ONAME && n.Left.Sym.Name == "Sqrt" && n.Left.Sym.Pkg.Path == "math" {
+			switch Thearch.Thechar {
+			case '5', '6':
+				n.Op = OSQRT
+				n.Left = n.List.N
+				n.List = nil
+				goto ret
+			}
+		}
+
 		ll := ascompatte(int(n.Op), n, n.Isddd, getinarg(t), n.List, 0, init)
 		n.List = reorder1(ll)
 		goto ret