cmd/compile: added some intrinsics to SSA back end

One intrinsic was needed to help get the very best
performance out of a future GC; as long as that one was
being added, I also added Bswap since that is sometimes
a handy thing to have.  I had intended to fill out the
bit-scan intrinsic family, but the mismatch between the
"scan forward" instruction and "count leading zeroes"
was large enough to cause me to leave it out -- it poses
a dilemma that I'd rather dodge right now.

These intrinsics are not exposed for general use.
That's a separate issue requiring an API proposal change
( https://github.com/golang/proposal )

All intrinsics are tested, both that they are substituted
on the appropriate architecture, and that they produce the
expected result.

Change-Id: I5848037cfd97de4f75bdc33bdd89bba00af4a8ee
Reviewed-on: https://go-review.googlesource.com/20564
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/cmd/compile/internal/gc/go.go b/src/cmd/compile/internal/gc/go.go
index 78c177e..448a0fd 100644
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -55,8 +55,8 @@
 }
 
 type Pkg struct {
-	Name     string // package name
-	Path     string // string literal used in import statement
+	Name     string // package name, e.g. "sys"
+	Path     string // string literal used in import statement, e.g. "runtime/internal/sys"
 	Pathsym  *Sym
 	Prefix   string // escaped path for use in symbol table
 	Imported bool   // export data of this package was parsed
@@ -469,6 +469,9 @@
 
 	// Set, use, or kill of carry bit.
 	// Kill means we never look at the carry bit after this kind of instruction.
+	// Originally for understanding ADC, RCR, and so on, but now also
+	// tracks set, use, and kill of the zero and overflow bits as well.
+	// TODO rename to {Set,Use,Kill}Flags
 	SetCarry  = 1 << 24
 	UseCarry  = 1 << 25
 	KillCarry = 1 << 26
diff --git a/src/cmd/compile/internal/gc/inl.go b/src/cmd/compile/internal/gc/inl.go
index ff0791c..e25ce13 100644
--- a/src/cmd/compile/internal/gc/inl.go
+++ b/src/cmd/compile/internal/gc/inl.go
@@ -453,7 +453,7 @@
 		if Debug['m'] > 3 {
 			fmt.Printf("%v:call to func %v\n", n.Line(), Nconv(n.Left, FmtSign))
 		}
-		if n.Left.Func != nil && len(n.Left.Func.Inl.Slice()) != 0 { // normal case
+		if n.Left.Func != nil && len(n.Left.Func.Inl.Slice()) != 0 && !isIntrinsicCall1(n) { // normal case
 			n = mkinlcall(n, n.Left, n.Isddd)
 		} else if n.Left.Op == ONAME && n.Left.Left != nil && n.Left.Left.Op == OTYPE && n.Left.Right != nil && n.Left.Right.Op == ONAME { // methods called as functions
 			if n.Left.Sym.Def != nil {
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 93b820b..9b8ef20 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -2052,7 +2052,13 @@
 		p, l, c := s.slice(n.Left.Type, v, i, j, k)
 		return s.newValue3(ssa.OpSliceMake, n.Type, p, l, c)
 
-	case OCALLFUNC, OCALLINTER, OCALLMETH:
+	case OCALLFUNC:
+		if isIntrinsicCall1(n) {
+			return s.intrinsicCall1(n)
+		}
+		fallthrough
+
+	case OCALLINTER, OCALLMETH:
 		a := s.call(n, callNormal)
 		return s.newValue2(ssa.OpLoad, n.Type, a, s.mem())
 
@@ -2373,6 +2379,75 @@
 	callGo
 )
 
+// isSSAIntrinsic1 returns true if n is a call to a recognized 1-arg intrinsic
+// that can be handled by the SSA backend.
+// SSA uses this, but so does the front end to see if should not
+// inline a function because it is a candidate for intrinsic
+// substitution.
+func isSSAIntrinsic1(s *Sym) bool {
+	// The test below is not quite accurate -- in the event that
+	// a function is disabled on a per-function basis, for example
+	// because of hash-keyed binary failure search, SSA might be
+	// disabled for that function but it would not be noted here,
+	// and thus an inlining would not occur (in practice, inlining
+	// so far has only been noticed for Bswap32 and the 16-bit count
+	// leading/trailing instructions, but heuristics might change
+	// in the future or on different architectures).
+	if !ssaEnabled || ssa.IntrinsicsDisable || Thearch.Thechar != '6' {
+		return false
+	}
+	if s != nil && s.Pkg != nil && s.Pkg.Path == "runtime/internal/sys" {
+		switch s.Name {
+		case
+			"Ctz64", "Ctz32", "Ctz16",
+			"Bswap64", "Bswap32":
+			return true
+		}
+	}
+	return false
+}
+
+func isIntrinsicCall1(n *Node) bool {
+	if n == nil || n.Left == nil {
+		return false
+	}
+	return isSSAIntrinsic1(n.Left.Sym)
+}
+
+// intrinsicFirstArg extracts arg from n.List and eval
+func (s *state) intrinsicFirstArg(n *Node) *ssa.Value {
+	x := n.List.First()
+	if x.Op == OAS {
+		x = x.Right
+	}
+	return s.expr(x)
+}
+
+// intrinsicCall1 converts a call to a recognized 1-arg intrinsic
+// into the intrinsic
+func (s *state) intrinsicCall1(n *Node) *ssa.Value {
+	var result *ssa.Value
+	switch n.Left.Sym.Name {
+	case "Ctz64":
+		result = s.newValue1(ssa.OpCtz64, Types[TUINT64], s.intrinsicFirstArg(n))
+	case "Ctz32":
+		result = s.newValue1(ssa.OpCtz32, Types[TUINT32], s.intrinsicFirstArg(n))
+	case "Ctz16":
+		result = s.newValue1(ssa.OpCtz16, Types[TUINT16], s.intrinsicFirstArg(n))
+	case "Bswap64":
+		result = s.newValue1(ssa.OpBswap64, Types[TUINT64], s.intrinsicFirstArg(n))
+	case "Bswap32":
+		result = s.newValue1(ssa.OpBswap32, Types[TUINT32], s.intrinsicFirstArg(n))
+	}
+	if result == nil {
+		Fatalf("Unknown special call: %v", n.Left.Sym)
+	}
+	if ssa.IntrinsicsDebug > 0 {
+		Warnl(n.Lineno, "intrinsic substitution for %v with %s", n.Left.Sym.Name, result.LongString())
+	}
+	return result
+}
+
 // Calls the function n using the specified call type.
 // Returns the address of the return value (or nil if none).
 func (s *state) call(n *Node, k callKind) *ssa.Value {