gollvm: implement getg function with inline assembly for arm64

The getg function is called frequently in the runtime and is highly
performance-critical, so we turn each call to getg into a load
operation, effectively inlining it. To avoid the thread-pointer caching
problem that arises when a thread switch happens between two calls to
this function, we previously inlined only the first call via the
GoSafeGetg pass. However, this implementation is not feasible on Linux
arm64, because it relies on the assumption that the LLVM backend's CSE
optimization of the thread pointer operates only within a single basic
block, whereas on Linux arm64 this optimization is applied across the
entire function.
Expanding the GoSafeGetg pass to cover the entire function is not a
good solution either, because a large number of getg calls would still
be left un-inlined.
This CL instead reproduces, via inline assembly, the implementation of
getg found in the C file. This not only guarantees correctness, but
also ensures that every getg call in Go files is inlined.
The disadvantage is that if no thread switch occurs between two getg
calls, the second call could in theory be optimized away, but this
implementation does not do so: the full instruction sequence of getg is
executed on every call.

This CL also adds a unit test case for this implementation.

Updates golang/go#37295

Change-Id: If9e47b2afeb420a1d0316f2a82602b18bed82477
Reviewed-on: https://go-review.googlesource.com/c/gollvm/+/228737
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/bridge/go-llvm-materialize.cpp b/bridge/go-llvm-materialize.cpp
index da3c09d..63fec77 100644
--- a/bridge/go-llvm-materialize.cpp
+++ b/bridge/go-llvm-materialize.cpp
@@ -30,6 +30,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/IR/InlineAsm.h"
 
 static llvm::cl::opt<bool> DisableInlineGetg("disable-inline-getg",
                                              llvm::cl::desc("Disable inlining getg"),
@@ -1378,6 +1379,56 @@
   }
 }
 
+// makeGetgArm64 uses inline asm to implement the function of
+// runtime.getg used in Go files on linux arm64.
+static llvm::Value *makeGetgArm64(Btype *resType,
+                                  BlockLIRBuilder *builder,
+                                  Llvm_backend *be)
+{
+  std::string asmStr;
+  std::string constr;
+  if (be->module().getPICLevel() > llvm::PICLevel::Level::NotPIC ||
+      be->module().getPIELevel() > llvm::PIELevel::Level::Default ) {
+    // Dynamic link.
+    asmStr += "adrp x0, :tlsdesc:runtime.g\n";
+    asmStr += "ldr  $0, [x0, :tlsdesc_lo12:runtime.g]\n";
+    asmStr += "add  x0, x0, :tlsdesc_lo12:runtime.g\n";
+    asmStr += ".tlsdesccall runtime.g\n";
+    asmStr += "blr  $0\n";
+    asmStr += "mrs  $0, TPIDR_EL0\n";
+    asmStr += "ldr  $0, [$0, x0]\n";
+    // We need to clobber x0 because we have to use it to pass parameters.
+    // We also only need to clobber x0, because the TLS descriptor helper
+    // function only modifies x0
+    constr += "=r,~{x0}";
+    llvm::FunctionType *fnType =
+        llvm::FunctionType::get(resType->type(), llvm::ArrayRef<llvm::Type*>{}, false);
+    llvm::Value *callee = llvm::InlineAsm::get(fnType, llvm::StringRef(asmStr),
+                                               llvm::StringRef(constr), true);
+    std::string callname(be->namegen("asmcall"));
+    return builder->CreateCall(fnType, callee, {}, callname);
+  } else {
+    // Static link.
+    asmStr += "adrp $0, :gottprel:runtime.g\n";
+    asmStr += "ldr  $0, [$0, #:gottprel_lo12:runtime.g]\n";
+    asmStr += "mrs  $1, tpidr_el0\n";
+    asmStr += "ldr  $0, [$1, $0]\n";
+    // In order not to clobber registers, we declare a temporary variable
+    // as the second output and return the first output.
+    constr += "=r,=r";
+    llvm::Type *tempRegType = llvm::IntegerType::get(builder->getContext(), 64);
+    llvm::Type *fnResType = llvm::StructType::create(
+        builder->getContext(), {resType->type(), tempRegType});
+    llvm::FunctionType *fnType =
+        llvm::FunctionType::get(fnResType, llvm::ArrayRef<llvm::Type*>{}, false);
+    llvm::Value *callee = llvm::InlineAsm::get(fnType, llvm::StringRef(asmStr),
+                                               llvm::StringRef(constr), true);
+    std::string callname(be->namegen("asmcall"));
+    llvm::Instruction *calI = builder->CreateCall(fnType, callee, {}, callname);
+    return builder->CreateExtractValue(calI, {0});
+  }
+}
+
 // Inline runtime.getg, generate a load of g.
 // This is not done as a builtin because, unlike other builtins,
 // we need the FE to tell us the result type.
@@ -1394,7 +1445,10 @@
     g = llvm::cast<llvm::GlobalValue>(bv->value());
     g->setThreadLocal(true);
   }
-  return builder->CreateLoad(g);
+  if (be->triple().getArch() == llvm::Triple::aarch64)
+    return makeGetgArm64(resType, builder, be);
+  else
+    return builder->CreateLoad(g);
 }
 
 Bexpression *Llvm_backend::materializeCall(Bexpression *callExpr)
diff --git a/bridge/go-llvm.cpp b/bridge/go-llvm.cpp
index 7dbe0cd..92c0f54 100644
--- a/bridge/go-llvm.cpp
+++ b/bridge/go-llvm.cpp
@@ -44,10 +44,12 @@
                            llvm::Module *module,
                            Llvm_linemap *linemap,
                            unsigned addrspace,
+                           llvm::Triple triple,
                            llvm::CallingConv::ID cconv)
     : TypeManager(context, cconv, addrspace)
     , context_(context)
     , module_(module)
+    , triple_(triple)
     , datalayout_(module ? &module->getDataLayout() : nullptr)
     , nbuilder_(this)
     , linemap_(linemap)
@@ -80,8 +82,20 @@
   // Similarly for the LLVM module (unit testing)
   if (!module_) {
     ownModule_.reset(new llvm::Module("gomodule", context));
-    ownModule_->setTargetTriple("x86_64-unknown-linux-gnu");
-    ownModule_->setDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
+    switch (cconv) {
+      case llvm::CallingConv::X86_64_SysV:
+        ownModule_->setTargetTriple("x86_64-unknown-linux-gnu");
+        ownModule_->setDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
+        triple_ = llvm::Triple("x86_64-unknown-linux-gnu");
+        break;
+      case llvm::CallingConv::ARM_AAPCS:
+        ownModule_->setTargetTriple("aarch64-unknown-linux-gnu");
+        ownModule_->setDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
+        triple_ = llvm::Triple("aarch64-unknown-linux-gnu");
+        break;
+      default:
+        std::cerr <<"Unsupported calling convention\n";
+    }
     module_ = ownModule_.get();
   }
 
@@ -4087,5 +4101,5 @@
 // Return a new backend generator.
 
 Backend *go_get_backend(llvm::LLVMContext &context, llvm::CallingConv::ID cconv) {
-  return new Llvm_backend(context, nullptr, nullptr, 0, cconv);
+  return new Llvm_backend(context, nullptr, nullptr, 0, llvm::Triple(), cconv);
 }
diff --git a/bridge/go-llvm.h b/bridge/go-llvm.h
index ff77d3b..653c58a 100644
--- a/bridge/go-llvm.h
+++ b/bridge/go-llvm.h
@@ -65,6 +65,7 @@
 struct GenCallState;
 
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/ADT/Triple.h"
 
 //
 // LLVM-specific implementation of the Backend class; the code in
@@ -79,6 +80,7 @@
                llvm::Module *module,
                Llvm_linemap *linemap,
                unsigned addrspace,
+               llvm::Triple triple,
                /* Temporarily set the parameter as optional to workaround the unit tests. */
                llvm::CallingConv::ID cconv=llvm::CallingConv::X86_64_SysV);
   ~Llvm_backend();
@@ -328,9 +330,10 @@
 
   Llvm_linemap *linemap() const { return linemap_; }
 
-  // Module and datalayout
+  // Module, datalayout and triple
   llvm::Module &module() { return *module_; }
   const llvm::DataLayout &datalayout() { return *datalayout_; }
+  const llvm::Triple &triple() const { return triple_; }
 
   // Type manager functionality
   TypeManager *typeManager() const;
@@ -731,6 +734,9 @@
   // Data layout info from the module.
   const llvm::DataLayout *datalayout_;
 
+  // The target triple.
+  llvm::Triple triple_;
+
   // Builder for constructing Bexpressions and Bstatements.
   BnodeBuilder nbuilder_;
 
diff --git a/driver/CompileGo.cpp b/driver/CompileGo.cpp
index 59b2f3d..b071aa0 100644
--- a/driver/CompileGo.cpp
+++ b/driver/CompileGo.cpp
@@ -31,7 +31,6 @@
 } }
 
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/Config/llvm-config.h"
@@ -616,7 +615,7 @@
 
   // Now construct Llvm_backend helper.
   unsigned addrspace = enable_gc_ ? 1 : 0;
-  bridge_.reset(new Llvm_backend(context_, module_.get(), linemap_.get(), addrspace, cconv_));
+  bridge_.reset(new Llvm_backend(context_, module_.get(), linemap_.get(), addrspace, triple_, cconv_));
 
   // Honor inline, tracelevel cmd line options
   llvm::Optional<unsigned> tl =
@@ -949,7 +948,10 @@
       createTargetTransformInfoWrapperPass(target_->getTargetIRAnalysis()));
   createPasses(modulePasses, functionPasses);
 
-  modulePasses.add(createGoSafeGetgPass());
+  // Disable inlining getg in some cases on x86_64.
+  if (triple_.getArch() == llvm::Triple::x86_64) {
+      modulePasses.add(createGoSafeGetgPass());
+  }
 
   // Add statepoint insertion pass to the end of optimization pipeline,
   // right before lowering to machine IR.
diff --git a/unittests/BackendCore/BackendCABIOracleTests.cpp b/unittests/BackendCore/BackendCABIOracleTests.cpp
index dc09b34..98561e8 100644
--- a/unittests/BackendCore/BackendCABIOracleTests.cpp
+++ b/unittests/BackendCore/BackendCABIOracleTests.cpp
@@ -33,7 +33,7 @@
   LLVMContext C;
   auto cc = GetParam();
   std::unique_ptr<Llvm_backend> bep(
-      new Llvm_backend(C, nullptr, nullptr, 0, cc));
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::Triple(), cc));
   Llvm_backend *be = bep.get();
 
   Btype *bi8t = be->integer_type(false, 8);
@@ -72,7 +72,7 @@
 TEST(BackendCABIOracleTests, ExtendedAmd64) {
   LLVMContext C;
   std::unique_ptr<Llvm_backend> bep(
-      new Llvm_backend(C, nullptr, nullptr, 0, llvm::CallingConv::X86_64_SysV));
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::Triple(), llvm::CallingConv::X86_64_SysV));
   Llvm_backend *be = bep.get();
 
   Btype *bi8t = be->integer_type(false, 8);
@@ -256,7 +256,7 @@
 TEST(BackendCABIOracleTests, ExtendedArm64) {
   LLVMContext C;
   std::unique_ptr<Llvm_backend> bep(
-      new Llvm_backend(C, nullptr, nullptr, 0, llvm::CallingConv::ARM_AAPCS));
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::Triple(), llvm::CallingConv::ARM_AAPCS));
   Llvm_backend *be = bep.get();
 
   Btype *bi8t = be->integer_type(false, 8);
diff --git a/unittests/BackendCore/BackendCallTests.cpp b/unittests/BackendCore/BackendCallTests.cpp
index c8f4f21..e20e24c 100644
--- a/unittests/BackendCore/BackendCallTests.cpp
+++ b/unittests/BackendCore/BackendCallTests.cpp
@@ -229,4 +229,80 @@
   EXPECT_TRUE(isOK && "Function does not have expected contents");
 }
 
+// TODO: We should have written a test for static link, but because in static
+// link mode, the return type of getg is a temporary type, such as %"type
+// 0xaaaafc36b690", this value is not fixed, so we can not determine the
+// expected result. However, the processing method of static link and dynamic
+// link is the same, but the instruction is slightly different.
+TEST(BackendCallTests, TestMakeGetgDynamicArm64) {
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS, "foo");
+  Llvm_backend *be = h.be();
+  be->module().setPICLevel(llvm::PICLevel::BigPIC);
+  Bfunction *func = h.func();
+  Location loc;
+
+  // Declare a function "func runtime.getg() *int64", in fact, the prototype of
+  // runtime.getg is "func runtime.getg() *g", but it is difficult to construct
+  // the structure of g, so we use *int64 to simulate.
+  Btype *bi64t = be->integer_type(false, 64);
+  Btype *bpi64t = be->pointer_type(bi64t);
+  Btype *befty1 = mkFuncTyp(be, L_RES, bpi64t, L_END);
+  unsigned fflags =
+      (Backend::function_is_visible | Backend::function_is_declaration);
+  Bfunction *befcn1 =
+      be->function(befty1, "runtime.getg", "runtime.getg", fflags, loc);
+
+  // Declare a function bar with no args and no return.
+  Btype *befty2 = mkFuncTyp(be, L_END);
+  Bfunction *befcn2 = be->function(befty2, "bar", "bar", fflags, loc);
+
+  // x := getg()
+  Bexpression *fn = be->function_code_expression(befcn1, loc);
+  std::vector<Bexpression *> args;
+  Bexpression *call = be->call_expression(func, fn, args, nullptr, loc);
+  Bvariable *x = h.mkLocal("x", bpi64t, call);
+  Bexpression *vex = be->var_expression(x, loc);
+
+  // Create call to bar()
+  fn = be->function_code_expression(befcn2, loc);
+  call = be->call_expression(func, fn, args, nullptr, loc);
+  h.mkExprStmt(call);
+
+  // y := getg()
+  fn = be->function_code_expression(befcn1, loc);
+  call = be->call_expression(func, fn, args, nullptr, loc);
+  Bvariable *y = h.mkLocal("y", bpi64t, call);
+  Bexpression *vey = be->var_expression(y, loc);
+
+  // z := *x + *y, this makes no sense, just to make x and y be used.
+  Bexpression *xpy = be->binary_expression(
+      OPERATOR_PLUS, be->indirect_expression(bi64t, vex, false, loc),
+      be->indirect_expression(bi64t, vey, false, loc), loc);
+  Bvariable *z = h.mkLocal("z", bi64t, xpy);
+  // return z
+  h.mkReturn(be->var_expression(z, loc));
+
+  DECLARE_EXPECTED_OUTPUT(exp, R"RAW_RESULT(
+    %asmcall.0 = call addrspace(0) i64* asm sideeffect "adrp x0, :tlsdesc:runtime.g\0Aldr  $0, [x0, :tlsdesc_lo12:runtime.g]\0Aadd  x0, x0, :tlsdesc_lo12:runtime.g\0A.tlsdesccall runtime.g\0Ablr  $0\0Amrs  $0, TPIDR_EL0\0Aldr  $0, [$0, x0]\0A", "=r,~{x0}"()
+    store i64* %asmcall.0, i64** %x, align 8
+    call addrspace(0) void @bar(i8* nest undef)
+    %asmcall.1 = call addrspace(0) i64* asm sideeffect "adrp x0, :tlsdesc:runtime.g\0Aldr  $0, [x0, :tlsdesc_lo12:runtime.g]\0Aadd  x0, x0, :tlsdesc_lo12:runtime.g\0A.tlsdesccall runtime.g\0Ablr  $0\0Amrs  $0, TPIDR_EL0\0Aldr  $0, [$0, x0]\0A", "=r,~{x0}"()
+    store i64* %asmcall.1, i64** %y, align 8
+    %x.ld.0 = load i64*, i64** %x, align 8
+    %.ld.0 = load i64, i64* %x.ld.0, align 8
+    %y.ld.0 = load i64*, i64** %y, align 8
+    %.ld.1 = load i64, i64* %y.ld.0, align 8
+    %add.0 = add i64 %.ld.0, %.ld.1
+    store i64 %add.0, i64* %z, align 8
+    %z.ld.0 = load i64, i64* %z, align 8
+    ret i64 %z.ld.0
+  )RAW_RESULT");
+
+  bool isOK = h.expectBlock(exp);
+  EXPECT_TRUE(isOK && "Block does not have expected contents");
+
+  bool broken = h.finish(StripDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+}
+
 } // namespace
diff --git a/unittests/BackendCore/BackendTreeIntegrity.cpp b/unittests/BackendCore/BackendTreeIntegrity.cpp
index 0c31828..6c56512 100644
--- a/unittests/BackendCore/BackendTreeIntegrity.cpp
+++ b/unittests/BackendCore/BackendTreeIntegrity.cpp
@@ -78,7 +78,7 @@
   LLVMContext C;
   auto cc = GetParam();
   std::unique_ptr<Llvm_backend> be(
-      new Llvm_backend(C, nullptr, nullptr, 0, cc));
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::Triple(), cc));
   be->disableIntegrityChecks();
 
   Location loc;
@@ -116,7 +116,7 @@
   LLVMContext C;
   auto cc = GetParam();
   std::unique_ptr<Llvm_backend> be(
-      new Llvm_backend(C, nullptr, nullptr, 0, cc));
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::Triple(), cc));
   be->disableIntegrityChecks();
   Location loc;
   Bfunction *func = mkFunci32o64(be.get(), "foo");
diff --git a/unittests/BackendCore/TestUtils.cpp b/unittests/BackendCore/TestUtils.cpp
index ccfadd6..e84e51a 100644
--- a/unittests/BackendCore/TestUtils.cpp
+++ b/unittests/BackendCore/TestUtils.cpp
@@ -386,7 +386,7 @@
 
 FcnTestHarness::FcnTestHarness(llvm::CallingConv::ID cconv, const char *fcnName)
     : context_()
-    , be_(new Llvm_backend(context_, nullptr, nullptr, 0, cconv))
+    , be_(new Llvm_backend(context_, nullptr, nullptr, 0, llvm::Triple(), cconv))
     , func_(nullptr)
     , entryBlock_(nullptr)
     , curBlock_(nullptr)