gollvm: support target CPU/features (-march=XXX, etc)

This patch adds real support for the "-march=" command line option, to
allow users to have the back end target a specific CPU (ex: 'nehalem')
within a given architecture (ex: x86-64). Each specific cpu/arch
selection gets translated into a set of "target feature" attributes
that are then attached to each LLVM function; specifying the correct
attributes is key to ensuring that vectorization works properly.

Notes:

- creates a new Go helper program, "capture-fcn-attributes.go", that
  developers can build and run to generate a header containing the
  correct set of fcn attributes to use for specific CPUs/archs. The
  expectation is that building and running this Go program is
  something done by gollvm developers offline (as opposed to having it
  be part of the cmake/ninja build process each time gollvm is built).
  The helper program works by capturing the behavior of clang:
  running clang on an input file for specific -march=... values
  and then looking at the generated IR.

- unlike clang, gollvm won't support (at least initially) flags for
  turning on/off individual features/instructions, since this would
  add a great deal of complexity.

Change-Id: I99abf703608e0d1f5d00f6becc3f43cbdb60b394
Reviewed-on: https://go-review.googlesource.com/118322
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/bridge/go-llvm.cpp b/bridge/go-llvm.cpp
index 9d8db39..a6ffb3e 100644
--- a/bridge/go-llvm.cpp
+++ b/bridge/go-llvm.cpp
@@ -141,6 +141,16 @@
   dibuildhelper_->addDebugPrefix(prefix);
 }
 
+void Llvm_backend::setTargetCpuAttr(const std::string &cpu)
+{
+  targetCpuAttr_ = cpu;
+}
+
+void Llvm_backend::setTargetFeaturesAttr(const std::string &attrs)
+{
+  targetFeaturesAttr_ = attrs;
+}
+
 void
 Llvm_backend::verifyModule()
 {
@@ -2456,6 +2466,10 @@
     if (no_return)
       fcn->addFnAttr(llvm::Attribute::NoReturn);
 
+    // attributes for target CPU and features
+    fcn->addFnAttr("target-cpu", targetCpuAttr_);
+    fcn->addFnAttr("target-features", targetFeaturesAttr_);
+
     fcnValue = fcn;
 
     // Fix up references to declaration of old type.
diff --git a/bridge/go-llvm.h b/bridge/go-llvm.h
index 06bf950..ab02d80 100644
--- a/bridge/go-llvm.h
+++ b/bridge/go-llvm.h
@@ -397,6 +397,10 @@
   // Disable frame pointer elimination if set to true.
   void setNoFpElim(bool b) { noFpElim_ = b; };
 
+  // Target CPU and features
+  void setTargetCpuAttr(const std::string &cpu);
+  void setTargetFeaturesAttr(const std::string &attrs);
+
   // Personality function
   llvm::Function *personalityFunction();
 
@@ -818,6 +822,10 @@
 
   // Personality function
   llvm::Function *personalityFunction_;
+
+  // Target cpu and attributes to be attached to any generated fcns.
+  std::string targetCpuAttr_;
+  std::string targetFeaturesAttr_;
 };
 
 #endif
diff --git a/driver/ArchCpusAttrs.h b/driver/ArchCpusAttrs.h
new file mode 100644
index 0000000..7064d9e
--- /dev/null
+++ b/driver/ArchCpusAttrs.h
@@ -0,0 +1,74 @@
+// DO NOT EDIT: this file auto-generated by the following command:
+//
+//    ./capture-fcn-attributes -o ArchCpusAttrs.h -triples x86_64-unknown-linux-gnu
+//
+// in combination with clang:
+//
+//  clang version 7.0.0 (trunk 333637) (llvm/trunk 333650)
+//
+
+typedef struct {
+  const char *cpu;
+  const char *attrs;
+} CpuAttrs;
+
+typedef struct {
+  const char *triple;
+  const CpuAttrs *cpuattrs;
+} TripleCpus;
+
+// triple: x86_64-unknown-linux-gnu
+static const CpuAttrs attrs0[] = {
+  // first entry is default cpu
+  { "x86-64", "+fxsr,+mmx,+sse,+sse2,+x87" },
+  { "amdfam10", "+3dnow,+3dnowa,+fxsr,+lzcnt,+mmx,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4a,+x87" },
+  { "athlon-fx", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+x87" },
+  { "athlon64", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+x87" },
+  { "athlon64-sse3", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+sse3,+x87" },
+  { "atom", "+cx16,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" },
+  { "barcelona", "+3dnow,+3dnowa,+fxsr,+lzcnt,+mmx,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4a,+x87" },
+  { "bdver1", "+aes,+avx,+cx16,+fma4,+fxsr,+lwp,+lzcnt,+mmx,+pclmul,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+x87,+xop,+xsave" },
+  { "bdver2", "+aes,+avx,+bmi,+cx16,+f16c,+fma,+fma4,+fxsr,+lwp,+lzcnt,+mmx,+pclmul,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+tbm,+x87,+xop,+xsave" },
+  { "bdver3", "+aes,+avx,+bmi,+cx16,+f16c,+fma,+fma4,+fsgsbase,+fxsr,+lwp,+lzcnt,+mmx,+pclmul,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+tbm,+x87,+xop,+xsave,+xsaveopt" },
+  { "bdver4", "+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fma4,+fsgsbase,+fxsr,+lwp,+lzcnt,+mmx,+mwaitx,+pclmul,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+tbm,+x87,+xop,+xsave,+xsaveopt" },
+  { "bonnell", "+cx16,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" },
+  { "broadwell", "+adx,+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "btver1", "+cx16,+fxsr,+lzcnt,+mmx,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4a,+ssse3,+x87" },
+  { "btver2", "+aes,+avx,+bmi,+cx16,+f16c,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "cannonlake", "+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vl,+bmi,+bmi2,+clflushopt,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sgx,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "core-avx-i", "+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "core-avx2", "+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "core2", "+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" },
+  { "corei7", "+cx16,+fxsr,+mmx,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" },
+  { "corei7-avx", "+aes,+avx,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "goldmont", "+aes,+clflushopt,+cx16,+fsgsbase,+fxsr,+mmx,+movbe,+mpx,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "goldmont-plus", "+aes,+clflushopt,+cx16,+fsgsbase,+fxsr,+mmx,+movbe,+mpx,+pclmul,+popcnt,+prfchw,+ptwrite,+rdpid,+rdrnd,+rdseed,+sahf,+sgx,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "haswell", "+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "icelake-client", "+adx,+aes,+avx,+avx2,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdrnd,+rdseed,+rtm,+sahf,+sgx,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "icelake-server", "+adx,+aes,+avx,+avx2,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pconfig,+pku,+popcnt,+prfchw,+rdpid,+rdrnd,+rdseed,+rtm,+sahf,+sgx,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "ivybridge", "+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "k8", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+x87" },
+  { "k8-sse3", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+sse3,+x87" },
+  { "knl", "+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "knm", "+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+avx512vpopcntdq,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "nehalem", "+cx16,+fxsr,+mmx,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" },
+  { "nocona", "+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+x87" },
+  { "opteron", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+x87" },
+  { "opteron-sse3", "+3dnow,+3dnowa,+fxsr,+mmx,+prfchw,+sse,+sse2,+sse3,+x87" },
+  { "penryn", "+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" },
+  { "sandybridge", "+aes,+avx,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" },
+  { "silvermont", "+aes,+cx16,+fxsr,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" },
+  { "skx", "+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "skylake", "+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "skylake-avx512", "+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+rtm,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "slm", "+aes,+cx16,+fxsr,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" },
+  { "tremont", "+aes,+cldemote,+clflushopt,+cx16,+fsgsbase,+fxsr,+gfni,+mmx,+movbe,+movdir64b,+movdiri,+mpx,+pclmul,+popcnt,+prfchw,+ptwrite,+rdpid,+rdrnd,+rdseed,+sahf,+sgx,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+waitpkg,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "westmere", "+aes,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" },
+  { "znver1", "+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clzero,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" },
+  { "", "" } // sentinel
+};
+
+const TripleCpus triples[] = {
+  { "x86_64-unknown-linux-gnu", &attrs0[0] },
+  { "", nullptr } // sentinel
+};
diff --git a/driver/CompileGo.cpp b/driver/CompileGo.cpp
index 95d1696..e86a726 100644
--- a/driver/CompileGo.cpp
+++ b/driver/CompileGo.cpp
@@ -26,6 +26,10 @@
 #include "Driver.h"
 #include "ToolChain.h"
 
+namespace gollvm { namespace arch {
+#include "ArchCpusAttrs.h"
+} }
+
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -99,6 +103,8 @@
   std::string asmOutFileName_;
   std::unique_ptr<ToolOutputFile> asmout_;
   std::unique_ptr<TargetLibraryInfoImpl> tlii_;
+  std::string targetCpuAttr_;
+  std::string targetFeaturesAttr_;
 
   void createPasses(legacy::PassManager &MPM,
                     legacy::FunctionPassManager &FPM);
@@ -374,9 +380,9 @@
     return false;
   Options.AllowFPOpFusion = *dofuse;
 
-  // Support -mcpu
+  // Support -march
   std::string cpuStr;
-  opt::Arg *cpuarg = args_.getLastArg(gollvm::options::OPT_mcpu_EQ);
+  opt::Arg *cpuarg = args_.getLastArg(gollvm::options::OPT_march_EQ);
   if (cpuarg != nullptr) {
     std::string val(cpuarg->getValue());
     if (val == "native")
@@ -385,16 +391,45 @@
       cpuStr = cpuarg->getValue();
   }
 
-  // Features.
-  // FIXME: incorporate command line flags.
-  SubtargetFeatures features;
-  features.getDefaultSubtargetFeatures(triple_);
-  std::string featStr = features.getString();
+  // Locate correct entry in architectures table for this triple
+  const gollvm::arch::CpuAttrs *cpuAttrs = nullptr;
+  for (unsigned i = 0; gollvm::arch::triples[i].cpuattrs != nullptr; i += 1) {
+    if (!strcmp(triple_.str().c_str(), gollvm::arch::triples[i].triple)) {
+      cpuAttrs = gollvm::arch::triples[i].cpuattrs;
+      break;
+    }
+  }
+  if (cpuAttrs == nullptr) {
+    errs() << progname_ << ": unable to determine target CPU features for "
+           << "target " << triple_.str() << "\n";
+    return false;
+  }
+
+  // If no CPU specified, use first entry. Otherwise look for CPU name.
+  if (!cpuStr.empty()) {
+    bool found = false;
+    while (strlen(cpuAttrs->cpu) != 0) {
+      if (!strcmp(cpuAttrs->cpu, cpuStr.c_str())) {
+        // found
+        found = true;
+        break;
+      }
+      cpuAttrs++;
+    }
+    if (!found) {
+      errs() << progname_ << ": invalid setting for -march:"
+             << " -- unable to identify CPU '" << cpuStr << "'\n";
+      return false;
+    }
+  }
+  targetCpuAttr_ = cpuAttrs->cpu;
+  targetFeaturesAttr_ = cpuAttrs->attrs;
 
   // Create target machine
   Optional<llvm::CodeModel::Model> CM = None;
   target_.reset(
-      TheTarget->createTargetMachine(triple_.getTriple(), cpuStr, featStr,
+      TheTarget->createTargetMachine(triple_.getTriple(),
+                                     targetCpuAttr_, targetFeaturesAttr_,
                                      Options, driver_.reconcileRelocModel(),
                                      CM, cgolvl_));
   assert(target_.get() && "Could not allocate target machine!");
@@ -435,6 +470,8 @@
     return false;
   bridge_->setTraceLevel(*tl);
   bridge_->setNoInline(args_.hasArg(gollvm::options::OPT_fno_inline));
+  bridge_->setTargetCpuAttr(targetCpuAttr_);
+  bridge_->setTargetFeaturesAttr(targetFeaturesAttr_);
 
   // -f[no-]omit-frame-pointer
   bool omitFp =
diff --git a/driver/Driver.cpp b/driver/Driver.cpp
index d7893f2..529ca1b 100644
--- a/driver/Driver.cpp
+++ b/driver/Driver.cpp
@@ -289,18 +289,6 @@
   else
     triple_ = Triple(sys::getDefaultTargetTriple());
 
-  // Support -march
-  std::string archStr;
-  opt::Arg *archarg = args_.getLastArg(gollvm::options::OPT_march_EQ);
-  if (archarg != nullptr) {
-    std::string val(archarg->getValue());
-    if (val == "native")
-      archStr = sys::getHostCPUName();
-    else
-      archStr = archarg->getValue();
-    triple_.setArchName(archStr);
-  }
-
   // Honor -dumpmachine
   if (args_.hasArg(gollvm::options::OPT_dumpmachine)) {
     llvm::outs() << triple_.str() << "\n";
diff --git a/driver/capture-fcn-attributes.go b/driver/capture-fcn-attributes.go
new file mode 100644
index 0000000..c3b64f4
--- /dev/null
+++ b/driver/capture-fcn-attributes.go
@@ -0,0 +1,551 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// This bootstrap helper program emits C++ code that encapsulates
+// information about available CPUs for a given architecture. It works
+// by invoking clang and/or llc and inspecting trace output and
+// generated IR.   Note: the expectation is that this program will be
+// built and run "off line" to generate a header file that is then
+// checked in (as opposed to having it build and run as part of the
+// actual gollvm ninja/cmake build).
+//
+// The intent is to allow gollvm to support the "-march=XXX" flag in a
+// basic way without having to recreate/replicate all of the
+// architecture-specific machinery in the clang driver that deals
+// with feature flags and feature attributes for the available targets
+// (since this code is very complex).
+//
+// The general idea is that for a given target triple we want to
+// determine the set of legal values that can be supplied to
+// the -march=XXX command line option, along with the correct set
+// of feature attributes that apply for that cpu/arch (settings for
+// -mattr=YYY,ZZZ,...).
+//
+// The strategy is to first run clang and/or LLC using command line
+// options whose output (or error messages) list out available CPU
+// settings (either "llc -mcpu=help" or "clang -march=Illegal"
+// depending). Once the set of available CPUs is populated, we then
+// run clang with -emit-llvm and inspect the generated IR to collect
+// the set of attributes for each arch/cpu.
+//
+// Notes:
+// - not all versions of clang will produce a list of legal arch/cpu
+//   values when presented with an illegal -march value (this seems to be
+//   a recent development); this trick also doesn't seem to work
+//   when cross compiling (supplying --target=XXX to clang). For the
+//   cross-compile case, we fall back on running "llc".
+// - confusingly, llc's set of available CPUs is different
+//   from clang's available set of CPUs, so there has to be some
+//   weeding out of extra CPU names in some cases.
+//
+// Representative usage:
+//
+// % go build capture-fcn-attributes.go
+// % export PATH=<llvm bin dir>:$PATH
+// % ./capture-fcn-attributes -o HeaderFile.h -triples x86_64-unknown-linux-gnu
+// %
+
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"runtime"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+const prog = `
+#include <inttypes.h>
+typedef struct  {
+  uint64_t a[512];
+  uint64_t b[512];
+  uint64_t c[512];
+} vstuff;
+void Add512(vstuff *v) {
+  for (unsigned i = 0; i < 512; ++i) {
+    v->c[i] = v->a[i] + v->b[i];
+  }
+}
+`
+
+var (
+	noclflag      = flag.Bool("noclean", false, "Don't clean temp dir")
+	verbflag      = flag.Int("v", 0, "Verbose trace output level")
+	cpuflag       = flag.String("cpu", "", "Generate for specified cpu or cpus")
+	triplesflag   = flag.String("triples", "", "Select target triple(s)")
+	outfileflag   = flag.String("o", "", "Output file")
+	exitst        int
+	defaultTriple string
+)
+
+func verb(vlevel int, s string, a ...interface{}) {
+	if *verbflag >= vlevel {
+		fmt.Printf(s, a...)
+		fmt.Printf("\n")
+	}
+}
+
+func warn(s string, a ...interface{}) {
+	fmt.Fprintf(os.Stderr, s, a...)
+	fmt.Fprintf(os.Stderr, "\n")
+	exitst = 1
+}
+
+func fatal(s string, a ...interface{}) {
+	log.Fatalf(s, a...)
+}
+
+func usage(msg string) {
+	if len(msg) > 0 {
+		fmt.Fprintf(os.Stderr, "error: %s\n", msg)
+	}
+	fmt.Fprintf(os.Stderr, "usage: capture-fcn-attributes [flags]\n")
+	flag.PrintDefaults()
+	os.Exit(2)
+}
+
+type result struct {
+	cpu       string
+	attrs     string
+	supported bool
+	def       bool
+}
+
+func tb(x bool) int {
+	if x {
+		return 1
+	}
+	return 0
+}
+
+type ByCpu []result
+
+func (a ByCpu) Len() int      { return len(a) }
+func (a ByCpu) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByCpu) Less(i, j int) bool {
+	if a[i].def != a[j].def {
+		return tb(a[j].def) < tb(a[i].def)
+	}
+	return a[i].cpu < a[j].cpu
+}
+
+var qurx = regexp.MustCompile(`^"(.+)"$`)
+
+func qutrim(s string) string {
+	sl := qurx.FindStringSubmatch(s)
+	if len(sl) == 2 {
+		return string(sl[1])
+	}
+	return s
+}
+
+// Attributes strings are of the form { X Y Z=W A=B "Q"="R" ... }
+
+func parseAttrs(raw string) (string, string) {
+	fields := strings.Fields(raw)
+	features := ""
+	cpu := ""
+	for _, f := range fields {
+		sl := strings.Split(f, "=")
+		if len(sl) != 2 {
+			continue
+		}
+		k, v := qutrim(sl[0]), qutrim(sl[1])
+		if k == "target-features" {
+			features = v
+		} else if k == "target-cpu" {
+			cpu = v
+		}
+	}
+	return cpu, features
+}
+
+// Function definitions in an LLVM IR dump have an attribute tag (#<num>);
+// we then look for an attribute declaration with the same number later
+// in the dump.
+
+// parseClangOut scans clang IR from r; returns Add512's target-cpu and target-features.
+func parseClangOut(r io.Reader) (string, string) {
+	// function def (the attribute group number may have several digits):
+	//    define dso_local void @Add512(%struct.vstuff*) #0 {
+	fcnr := regexp.MustCompile(`^define.*@Add512\(.*\)\s+#(\d+)\s+{\s*$`)
+
+	// function attrs:
+	// attributes #0 = { nounwind uwtable "x"="y" .... }
+	attrr := regexp.MustCompile(`^attributes\s+#(\d+)\s+\=\s+{(.+)\}\s*$`)
+
+	rawattrs := ""
+	attrnum := int64(-1)
+	scanner := bufio.NewScanner(r)
+	for scanner.Scan() {
+		verb(3, "clangline is %s", scanner.Text())
+		sl := fcnr.FindSubmatch(scanner.Bytes())
+		if len(sl) == 2 {
+			at, serr := strconv.ParseInt(string(sl[1]), 10, 64)
+			if serr != nil {
+				fatal("problems matching %s in %s",
+					string(sl[1]), scanner.Text())
+			}
+			attrnum = at
+			verb(3, "=> attrnum is %d", attrnum)
+		}
+		if attrnum != int64(-1) {
+			sl := attrr.FindSubmatch(scanner.Bytes())
+			if len(sl) == 3 {
+				at, serr := strconv.ParseInt(string(sl[1]), 10, 64)
+				if serr != nil {
+					fatal("problems matching %s in %s",
+						string(sl[1]), scanner.Text())
+				}
+				verb(3, "=-= at = %v\n", at)
+				if at == attrnum {
+					rawattrs = string(sl[2])
+					verb(3, "=> found rawattrs %s", rawattrs)
+					break
+				}
+			}
+		}
+	}
+	if scanner.Err() != nil {
+		fatal("error scanning clang output: %v", scanner.Err())
+	}
+
+	if rawattrs == "" {
+		fatal("unable to locate fcn attrs in clang output")
+	}
+	return parseAttrs(rawattrs)
+}
+
+func parseClangOutFile(cloutfile string) (string, string) {
+	infile, err := os.Open(cloutfile)
+	if err != nil {
+		fatal("problems opening clang output file %s: %s", cloutfile, err)
+	}
+	return parseClangOut(infile)
+}
+
+// For debugging (not needed for final output)
+
+func emitClangCmdLine(tdir string, cpu string, clargs []string) {
+	f := filepath.Join(tdir, fmt.Sprintf("%s.clangcmd.txt", cpu))
+	outfile, err := os.OpenFile(f, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
+	if err != nil {
+		fatal("unable to open trace output file %s", f)
+	}
+	bw := bufio.NewWriter(outfile)
+	bw.WriteString("clang")
+	for _, arg := range clargs {
+		bw.WriteString(" ")
+		bw.WriteString(arg)
+	}
+	bw.WriteString("\n")
+	if err := bw.Flush(); err != nil {
+		fatal("error writing file %s: %v", f, err)
+	}
+	if err := outfile.Close(); err != nil {
+		fatal("error closing output file %s: %v", f, err)
+	}
+}
+
+// enumerateAttributes runs clang once per CPU (plus once with no CPU, to find
+// the default), then writes the sorted "attrs<idx>" table for triple to bw.
+func enumerateAttributes(triple string, tdir string, cpus []string, bw *bufio.Writer, tf string, idx int) {
+	verb(1, "enumerating attributes for %d cpus", len(cpus))
+
+	// First entry in the list needs to be the default CPU
+	ecpus := append([]string{""}, cpus...)
+
+	// Process the various CPUs in parallel
+	sema := make(chan struct{}, runtime.NumCPU()) // limit concurrency
+	rchan := make(chan result, runtime.NumCPU())
+	for _, cpu := range ecpus {
+		verb(1, "enumerate for cpu %s", cpu)
+
+		go func(cpu string) {
+			sema <- struct{}{}
+			defer func() {
+				<-sema
+			}()
+
+			// Invoke clang with proper arguments
+			lloutfile := filepath.Join(tdir, fmt.Sprintf("%s.ll", cpu))
+			clargs := []string{"-emit-llvm", "-S", "-o", lloutfile,
+				"-O3", "-Xclang", "-disable-llvm-passes", tf}
+			cpuarch := "arch"
+			if triple != "" {
+				clargs = append(clargs, fmt.Sprintf("--target=%s", triple))
+			}
+			if triple != defaultTriple {
+				cpuarch = "cpu"
+			}
+			if cpu != "" {
+				clargs = append(clargs, fmt.Sprintf("-m%s=%s", cpuarch, cpu))
+			}
+			emitClangCmdLine(tdir, cpu, clargs)
+			cmd := exec.Command("clang", clargs...)
+			output, cerr := cmd.CombinedOutput()
+			if cerr != nil {
+				if triple == "" {
+					warn("clang run failed: %s", output)
+					fatal("err = %v", cerr)
+				}
+				// Note the 'supported:false' (indicating that this CPU
+				// value is not viable).
+				rchan <- result{cpu: cpu, attrs: strings.Join(clargs, " "), supported: false, def: false}
+			} else {
+				// Sift through the output for attr set.
+				acpu, attrs := parseClangOutFile(lloutfile)
+
+				// Send results on to the consumer; the run made with no
+				// explicit CPU is the one that identifies the default.
+				rchan <- result{cpu: acpu, attrs: attrs, supported: true, def: cpu == ""}
+			}
+		}(cpu)
+	}
+
+	// Read raw results. Arrival order is arbitrary, so when a CPU is seen
+	// twice keep one entry but remember if either run was the default.
+	visited := make(map[string]int)
+	results := []result{}
+	for range ecpus {
+		r := <-rchan
+		verb(1, "result: %v", r)
+		if !r.supported {
+			continue
+		}
+		if i, ok := visited[r.cpu]; ok {
+			results[i].def = results[i].def || r.def
+			continue
+		}
+		visited[r.cpu] = len(results)
+		results = append(results, r)
+	}
+
+	// Sort, then write to output
+	fmt.Fprintf(bw, "// triple: %s\n", triple)
+	fmt.Fprintf(bw, "static const CpuAttrs attrs%d[] = {\n", idx)
+	bw.WriteString("  // first entry is default cpu\n")
+	sort.Sort(ByCpu(results))
+	for i := 0; i < len(results); i++ {
+		r := results[i]
+		fmt.Fprintf(bw, "  { \"%s\", \"%s\" },\n", r.cpu, r.attrs)
+	}
+	bw.WriteString("  { \"\", \"\" } // sentinel\n")
+	bw.WriteString("};\n\n")
+}
+
+// Runs llc to determine default triple value.
+
+func collectDefaultTriple() string {
+	// Run llc to collect default triple
+	llcargs := []string{"--version"}
+	cmd := exec.Command("llc", llcargs...)
+	output, err := cmd.CombinedOutput()
+	verb(3, "llc output is: %s\n", string(output))
+	if err != nil {
+		fatal("llc --version failed")
+	}
+
+	rx := regexp.MustCompile(`^\s*Default target:\s+(\S+)\s*$`)
+	scanner := bufio.NewScanner(strings.NewReader(string(output)))
+	scanner.Split(bufio.ScanLines)
+	for scanner.Scan() {
+		verb(3, "llc line is: %s", scanner.Text())
+		asl := rx.FindSubmatch(scanner.Bytes())
+		if len(asl) == 2 {
+			return string(asl[1])
+		}
+	}
+
+	fatal("parsing of llc --version output failed")
+	return ""
+}
+
+// genCPUs returns the CPU names to probe for triple: the -cpu list if set, else llc's list.
+func genCPUs(triple string) []string {
+	// If -cpu XXX then just use that.
+	if *cpuflag != "" {
+		return strings.Split(*cpuflag, ",")
+	}
+
+	// Alternatively, look at the output of "llc -mcpu=help" (running
+	// clang with "-march=IllegalBadVal" also has similar effects, but
+	// doesn't allow you to set the triple currently).
+
+	tgtopt := []string{"-mcpu=help", fmt.Sprintf("-mtriple=%s", triple)}
+	cmd := exec.Command("llc", tgtopt...)
+	output, cerr := cmd.CombinedOutput()
+	if cerr != nil {
+		warn("llc run failed: %s", output)
+		fatal("err = %v", cerr)
+	}
+	verb(3, "llc output is: %s\n", string(output))
+
+	// Parse the output (enumerateAttributes adds the default-CPU entry itself)
+	resultcpus := []string{}
+	rw := regexp.MustCompile(`^\s*$`)
+	r1 := regexp.MustCompile(`^Available (\S+) for this target:\s*$`)
+	r2 := regexp.MustCompile(`^\s*(\S+)\s+\-\s\S.*$`)
+	rtx := regexp.MustCompile(`^Use \+feature.*$`)
+	rty := regexp.MustCompile(`^For example.*$`)
+	scanner := bufio.NewScanner(strings.NewReader(string(output)))
+	scanner.Split(bufio.ScanLines)
+	which := ""
+	for scanner.Scan() {
+		verb(3, "llc line is: %s", scanner.Text())
+		lineb := scanner.Bytes()
+		if rw.Find(lineb) != nil || rtx.Find(lineb) != nil ||
+			rty.Find(lineb) != nil {
+			continue
+		}
+		asl := r1.FindSubmatch(lineb)
+		if len(asl) == 2 {
+			which = string(asl[1])
+			continue
+		}
+		bsl := r2.FindSubmatch(lineb)
+		if len(bsl) == 2 {
+			if which == "CPUs" {
+				cpu := string(bsl[1])
+				resultcpus = append(resultcpus, cpu)
+			}
+			continue
+		}
+		warn("unmatched lined in llc output: %s", string(lineb))
+	}
+	if scanner.Err() != nil {
+		fatal("error scanning llc output: %v", scanner.Err())
+	}
+
+	return resultcpus
+}
+
+const pream1 = `// DO NOT EDIT: this file auto-generated by the following command:
+//
+`
+
+const pream2 = `
+typedef struct {
+  const char *cpu;
+  const char *attrs;
+} CpuAttrs;
+
+typedef struct {
+  const char *triple;
+  const CpuAttrs *cpuattrs;
+} TripleCpus;
+
+`
+
+func prolog(bw *bufio.Writer) {
+	bw.WriteString(pream1)
+	bw.WriteString("//   ")
+	for _, arg := range os.Args {
+		bw.WriteString(" ")
+		bw.WriteString(arg)
+	}
+	bw.WriteString("\n//\n")
+	bw.WriteString("// in combination with clang:\n//\n")
+	cmd := exec.Command("clang", "--version")
+	output, cerr := cmd.CombinedOutput()
+	if cerr != nil {
+		warn("clang run failed: %s", output)
+		fatal("err = %v", cerr)
+	}
+	sl := strings.Split(string(output), "\n")
+	bw.WriteString("//  ")
+	bw.WriteString(sl[0])
+	bw.WriteString("\n//\n")
+	bw.WriteString(pream2)
+}
+
+func epilog(bw *bufio.Writer, triples []string) {
+	bw.WriteString("const TripleCpus triples[] = {\n")
+	for k, t := range triples {
+		bw.WriteString(fmt.Sprintf("  { \"%s\", &attrs%d[0] },\n", t, k))
+	}
+	bw.WriteString("  { \"\", nullptr } // sentinel\n")
+	bw.WriteString("};\n")
+}
+
+// perform writes the generated header (prolog, one table per triple, epilog) to -o or stdout.
+func perform() {
+	// Create tempdir
+	dir, err := ioutil.TempDir("", "CaptureFcnAttrsTempDir")
+	if err != nil {
+		fatal("ioutil.TempDir failed, err=%v", err)
+	}
+	if *noclflag {
+		defer func() { fmt.Printf("preserving temp dir %s\n", dir) }()
+	} else {
+		defer os.RemoveAll(dir)
+	}
+
+	// Emit tempfile
+	tf := filepath.Join(dir, "file.c")
+	verb(1, "temp file is %s", tf)
+	if err := ioutil.WriteFile(tf, []byte(prog), 0666); err != nil {
+		fatal("ioutil.WriteFile failed, err=%v", err)
+	}
+
+	// Open output file
+	var outfile = os.Stdout
+	if len(*outfileflag) > 0 {
+		verb(1, "opening %s", *outfileflag)
+		outfile, err = os.OpenFile(*outfileflag,
+			os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	defaultTriple = collectDefaultTriple()
+	triples := []string{defaultTriple} // used when -triples is not given
+	if *triplesflag != "" {
+		triples = strings.Split(*triplesflag, ",")
+	}
+	bw := bufio.NewWriter(outfile)
+	prolog(bw)
+	for k, trip := range triples {
+
+		// CPU selection (either from option or via clang/llc)
+		cpus := genCPUs(trip)
+
+		// Enumerate attributes for the specified CPUs
+		enumerateAttributes(trip, dir, cpus, bw, tf, k)
+	}
+	epilog(bw, triples)
+	if err := bw.Flush(); err != nil {
+		fatal("error writing output: %v", err)
+	}
+	if len(*outfileflag) > 0 {
+		if err := outfile.Close(); err != nil {
+			fatal("error closing output file %s: %v", *outfileflag, err)
+		}
+	}
+}
+
+func main() {
+	log.SetFlags(0)
+	log.SetPrefix("capture-fcn-attributes: ")
+	flag.Parse()
+	verb(1, "in main")
+	if flag.NArg() != 0 {
+		usage("please run without arguments")
+	}
+	perform()
+	verb(1, "leaving main")
+	os.Exit(exitst)
+}