gollvm: add linux arm64 support

This CL adds linux arm64 support for Gollvm. With this patch we can build and
run Go programs with Gollvm on linux arm64.

The main changes include:
1, ARM_AAPCS64 ABI implementation (some features are not implemented, such
   as HVA, Unions and Bit-fields).
2, port all of the existing unit test cases on amd64 to arm64.

Note that since ld.gold does not support split stacks on linux arm64, the
cmake parameter -DGOLLVM_USE_SPLIT_STACK=OFF must be specified when
building. In addition, a few go tool and standard package tests still
fail; these failures will be fixed in follow-up patches.

Change-Id: I020573ee437f182e26c03b444b080d497f4245c3
Reviewed-on: https://go-review.googlesource.com/c/gollvm/+/213998
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/README.md b/README.md
index 8a98ebf..b9881fa 100644
--- a/README.md
+++ b/README.md
@@ -264,7 +264,7 @@
 
 ## Which architectures and operating systems are supported for gollvm? <a name="supported"></a>
 
-Gollvm is currently supported only for x86_64 Linux.
+Gollvm is currently supported only for x86_64 and aarch64 Linux.
 
 ## How does the gollvm runtime differ from the main Go runtime?  <a name="runtimediffs"></a>
 
diff --git a/bridge/go-llvm-bfunction.cpp b/bridge/go-llvm-bfunction.cpp
index 5afdcaa..2dc331b 100644
--- a/bridge/go-llvm-bfunction.cpp
+++ b/bridge/go-llvm-bfunction.cpp
@@ -132,7 +132,8 @@
       case ParmIndirect: {
         paramValues_.push_back(arguments_[argIdx]);
         assert(paramInfo.numArgSlots() == 1);
-        arguments_[argIdx]->addAttr(llvm::Attribute::ByVal);
+        if (paramInfo.attr() == AttrByVal)
+          arguments_[argIdx]->addAttr(llvm::Attribute::ByVal);
         argIdx += 1;
         break;
       }
diff --git a/bridge/go-llvm-cabi-oracle.cpp b/bridge/go-llvm-cabi-oracle.cpp
index b5d870b..45c2406 100644
--- a/bridge/go-llvm-cabi-oracle.cpp
+++ b/bridge/go-llvm-cabi-oracle.cpp
@@ -22,7 +22,16 @@
 // need to be passed in an integer or SSE register (or if it is
 // some combination of entirely empty structs/arrays).
 
-enum TypDisp { FlavSSE, FlavInt, FlavEmpty };
+enum TypDisp { FlavSSE, FlavSIMDFP, FlavInt, FlavEmpty };
+
+// Arm64 ABI AAPCS64 defines HFA as follows:
+// An Homogeneous Floating-point Aggregate (HFA) is an Homogeneous Aggregate with
+// a Fundamental Data Type that is a Floating-Point type and at most four uniquely
+// addressable members.
+struct HFAInfo {
+  unsigned number;  // count of HFA elements; 0 means "not an HFA"
+  llvm::Type *type; // common element type (float or double); nullptr if not HFA
+};
 
 // Here "meet" is in the dataflow anlysis sense (meet operator on lattice
 // values).
@@ -59,6 +68,8 @@
 // elements relative to the start of the object; "abiDirectType" holds
 // the type used to pass the elements if we're passing the entire
 // object directly (in registers) as opposed in on the stack.
+//
+// The Arm64 AAPCS64 implementation reuses this struct.
 
 struct EightByteRegion {
   EightByteRegion() : abiDirectType(nullptr), attr(AttrNone) { }
@@ -111,18 +122,22 @@
   EightByteInfo(Btype *bt, TypeManager *tm);
 
   std::vector<EightByteRegion> &regions() { return ebrs_; }
+  HFAInfo &getHFA() { return hfa_; }
   void getRegisterRequirements(unsigned *numInt, unsigned *numSSE);
 
  private:
   std::vector<EightByteRegion> ebrs_;
+  HFAInfo hfa_;
   TypeManager *typeManager_;
 
   typedef std::pair<Btype *, unsigned> typAndOffset;
   void addLeafTypes(Btype *bt, unsigned off, std::vector<typAndOffset> *leaves);
   void explode(Btype *bt);
+  void setHFA();
   void explodeStruct(Btype *bst);
   void explodeArray(BArrayType *bat);
   void incorporateScalar(Btype *bt);
+  void determineABITypesForARM_AAPCS();
   void determineABITypesForX86_64_SysV();
   TypeManager *tm() const { return typeManager_; }
 };
@@ -136,6 +151,13 @@
   case llvm::CallingConv::X86_64_SysV:
     determineABITypesForX86_64_SysV();
     break;
+  case llvm::CallingConv::ARM_AAPCS:
+    setHFA();
+    if (getHFA().number == 0 && tmgr->typeSize(bt) <= 16) {
+      // For HFA and indirect cases, we don't need do this.
+      determineABITypesForARM_AAPCS();
+    }
+    break;
   default:
     llvm::errs() << "unsupported llvm::CallingConv::ID " << cconv << "\n";
     break;
@@ -199,7 +221,7 @@
   leaves->push_back(std::make_pair(bt, offset));
 }
 
-// Given a struct type, explode it into 0, 1, or two EightByteRegion
+// Given a struct type, explode it into zero, one or multiple EightByteRegion
 // descriptors. Examples of the contents of EightByteInfo structs
 // for various Go types follow. The first type (empty struct) results
 // in a single EightByteRegion struct with empty vectors. The second
@@ -245,7 +267,7 @@
   }
 }
 
-// Given an array type, explode it into 0, 1, or two EightByteInfo
+// Given an array type, explode it into zero, one or multiple EightByteInfo
 // descriptors. Examples appear below; the first array type results
 // in a single EightByteInfo with empty type/offset vectors, then the second
 // array type results in a single EightByteInfo, and the thrd array
@@ -323,6 +345,79 @@
       *numInt += 1;
 }
 
+// Check if the parameter is a Homogeneous Floating-point Aggregate (HFA),
+// and set hfa_ accordingly ({0, nullptr} when it is not an HFA).
+void EightByteInfo::setHFA() {
+  if (ebrs_.empty() || ebrs_[0].types.empty()) { // no leaf types: not an HFA
+    hfa_ = HFAInfo{0, nullptr};
+    return;
+  }
+  unsigned num = 0;
+  llvm::Type *typ = ebrs_[0].types[0];
+  if (typ != tm()->llvmDoubleType() && typ != tm()->llvmFloatType()) {
+    hfa_ = HFAInfo{0, nullptr};
+    return;
+  }
+  for (auto &ebr : ebrs_) {
+    for (auto &t : ebr.types) {
+      if (t != typ || num > 3) { // mixed element types, or a fifth element
+        hfa_ = HFAInfo{0, nullptr};
+        return;
+      }
+      ++num;
+    }
+  }
+  hfa_ = HFAInfo{num, typ};
+}
+
+// Select the appropriate abi type for each eight-byte region within
+// an EightByteInfo. HFAs and arguments larger than 16 bytes are handled
+// elsewhere, so the arguments processed here can only be integer types,
+// pointer types or a mix of integer and non-integer types; each region is
+// mapped onto the pointer type or the appropriately sized integer type.
+//
+// Problems arise in the code below when dealing with structures with
+// constructs that inject additional padding. For example, consider
+// the following struct passed by value:
+//
+//      struct {
+//        f1 int8
+//        f2 [0]uint64
+//        f3 int8
+//      }
+//
+// Without taking into account the over-alignment of field f3, we would
+// wind up with two regions, each with type int8. This in itself is not so
+// bad, but creating a struct from these two types (via ::computeABIStructType)
+// would give us { int8, int8 }, in which the second field doesn't have
+// the correct alignment. Work around this by checking for such situations
+// and promoting the type of the first EBR to 64 bits.
+//
+void EightByteInfo::determineABITypesForARM_AAPCS() {
+  assert(ebrs_.size() <= 2);
+  for (auto &ebr : ebrs_) {
+    if (ebr.abiDirectType != nullptr)
+      continue;
+    // Preserve pointerness for the use of GC.
+    // TODO: this assumes a pointer is 8 bytes, so we never pack a pointer
+    // and other stuff together.
+    if (ebr.types[0]->isPointerTy()) {
+      ebr.abiDirectType = tm()->llvmPtrType();
+      continue;
+    }
+    unsigned nel = ebr.offsets.size();
+    unsigned bytes = ebr.offsets[nel - 1] - ebr.offsets[0] +
+                     tm()->llvmTypeSize(ebr.types[nel - 1]);
+    assert(bytes && bytes <= 8);
+    ebr.abiDirectType = tm()->llvmArbitraryIntegerType(bytes);
+  }
+
+  // See the example above for more on why this is needed.
+  if (ebrs_.size() == 2 && ebrs_[0].abiDirectType->isIntegerTy()) {
+    ebrs_[0].abiDirectType = tm()->llvmArbitraryIntegerType(8);
+  }
+}
+
 // Select the appropriate abi type for each eight-byte region within
 // an EightByteInfo. Pure floating point types are mapped onto float,
 // double, or <2 x float> (a vector type), integer types (or something
@@ -427,7 +522,8 @@
            (attr() == AttrByVal ? " AttrByVal" :
             (attr() == AttrNest ? " AttrNest" :
              (attr() == AttrZext ? " AttrZext" :
-              (attr() == AttrSext ? " AttrSext" : " <unknown>")))));
+              (attr() == AttrSext ? " AttrSext" :
+               (attr() == AttrDoCopy ? " AttrDoCopy" : " <unknown>"))))));
   os << " { ";
   unsigned idx = 0;
   for (auto &abit : abiTypes_) {
@@ -454,6 +550,10 @@
       availIntRegs_ = 6;
       availSSERegs_ = 8;
       break;
+    case llvm::CallingConv::ARM_AAPCS:
+      availIntRegs_ = 8;
+      availSIMDFPRegs_ = 8;
+      break;
     default:
       llvm::errs() << "unsupported llvm::CallingConv::ID " << cconv << "\n";
       break;
@@ -469,20 +569,33 @@
       availSSERegs_ -= 1;
     argCount_ += 1;
   }
+  // For ARM_AAPCS HFA, a single argument may occupy several registers.
+  void addDirectSIMDFPArg(unsigned sr = 1) {
+    // Take the registers only if enough remain; the count never underflows.
+    if (sr <= availSIMDFPRegs_)
+      availSIMDFPRegs_ -= sr;
+    argCount_ += 1;
+  }
   void addIndirectArg() { argCount_ += 1; }
   void addIndirectReturn() {
     if (availIntRegs_)
       availIntRegs_ -= 1;
     argCount_ += 1;
   }
+  // AAPCS64 passes the indirect result address in x8; no int arg register is used.
+  void addIndirectReturnForARM_AAPCS() { argCount_ += 1; }
   void addChainArg() { argCount_ += 1; }
   unsigned argCount() const { return argCount_; }
   unsigned availIntRegs() const { return availIntRegs_; }
   unsigned availSSERegs() const { return availSSERegs_; }
+  unsigned availSIMDFPRegs() const { return availSIMDFPRegs_; }
+  void clearAvailIntRegs() { availIntRegs_ = 0; }
+  void clearAvailSIMDFPRegs() { availSIMDFPRegs_ = 0; }
 
 private:
   unsigned availIntRegs_;
   unsigned availSSERegs_;
+  unsigned availSIMDFPRegs_;
   unsigned argCount_;
 };
 
@@ -523,7 +636,8 @@
   assert(typeManager_ != nullptr);
   ccID_ = typeManager_->callingConv();
   // Supported architectures at present.
-  assert(ccID_ == llvm::CallingConv::X86_64_SysV);
+  assert(ccID_ == llvm::CallingConv::X86_64_SysV ||
+         ccID_ == llvm::CallingConv::ARM_AAPCS);
 
   if (cc_ != nullptr) {
     return;
@@ -532,6 +646,9 @@
   case llvm::CallingConv::X86_64_SysV:
     cc_ = std::unique_ptr<CABIOracleArgumentAnalyzer>(new CABIOracleX86_64_SysV(typeManager_));
     break;
+  case llvm::CallingConv::ARM_AAPCS:
+    cc_ = std::unique_ptr<CABIOracleArgumentAnalyzer>(new CABIOracleARM_AAPCS(typeManager_));
+    break;
   default:
     llvm::errs() << "unsupported llvm::CallingConv::ID " << ccID_ << "\n";
     break;
@@ -829,3 +946,212 @@
 }
 
 //......................................................................
+
+CABIOracleARM_AAPCS::CABIOracleARM_AAPCS(TypeManager *typeManager)
+    : CABIOracleArgumentAnalyzer(typeManager) {}
+
+// Given the number of registers that we think a param is going to consume, and
+// a state object storing the registers used so far, canPassDirectly() makes a
+// decision as to whether a given param can be passed directly in registers vs
+// in memory.
+//
+// Note the first clause, "if (regsInt + regsSIMDFP == 1) return true". This may
+// seem counter-intuitive (why no check against the state object?), but this way
+// of doing things is the convention used by other front ends (e.g. clang). What
+// is happening here is that for larger aggregate/array params (things that
+// don't fit into a single register), we'll make the pass-through-memory
+// semantics explicit in the function signature and generate the explicit code to
+// copy things into memory. For params that do fit into a single register,
+// however, we just leave them all as by-value parameters and then assume that
+// the back end will do the right thing (e.g. pass the first few in registers
+// and then the remaining ones in memory).
+//
+// Doing things this way has performance advantages in that the middle-end
+// (all of the machine-independent LLVM optimization passes) won't have
+// to deal with the additional chunks of stack memory and code to copy
+// things onto and off of the stack (not to mention the aliasing concerns
+// when a local variable's address is taken and then passed in a function
+// call).
+
+bool CABIOracleARM_AAPCS::canPassDirectly(unsigned regsInt,
+                                          unsigned regsSIMDFP,
+                                          ABIState &state)
+{
+  // A single-register param always stays by-value (see comment above).
+  const bool singleReg = (regsInt + regsSIMDFP == 1);
+  const bool intFits = regsInt <= state.availIntRegs();
+  const bool fpFits = regsSIMDFP <= state.availSIMDFPRegs();
+  return singleReg || (intFits && fpFits);
+}
+
+CABIParamInfo CABIOracleARM_AAPCS::analyzeABIParam(Btype *paramType, ABIState &state)
+{
+  llvm::Type *ptyp = paramType->type();
+
+  // The only situations in which we should be seeing AuxT types here is
+  // in cases where we're analyzing the signatures of builtin functions,
+  // meaning that there should be no structures or arrays.
+  assert(paramType->flavor() != Btype::AuxT || ptyp->isVoidTy() ||
+         !(ptyp->isStructTy() || ptyp->isArrayTy() || ptyp->isVectorTy() ||
+           ptyp->isEmptyTy() || ptyp->isIntegerTy(8) || ptyp->isIntegerTy(16)));
+
+  if (ptyp == tm_->llvmVoidType()) {
+    // Empty struct or array
+    llvm::Type *voidType = tm_->llvmVoidType();
+    return CABIParamInfo(voidType, ParmIgnore, AttrNone, -1);
+  }
+
+  // If ptyp is llvmVoidType, we may not be able to get its size,
+  // so we can't combine the following if statement with the above one.
+  int64_t sz = tm_->typeSize(paramType);
+  if (sz == 0) {
+    // Empty struct or array
+    llvm::Type *voidType = tm_->llvmVoidType();
+    return CABIParamInfo(voidType, ParmIgnore, AttrNone, -1);
+  }
+
+  int sigOff = state.argCount();
+
+  // Go has only two floating point types: float32 and float64, so the size of
+  // an HFA does not exceed 32 bytes.
+  if (sz > 32) {
+    // Value will be passed in memory on stack.
+    // Stack is always in address space 0.
+    llvm::Type *ptrTyp = llvm::PointerType::get(ptyp, 0);
+    state.addIndirectArg();
+    return CABIParamInfo(ptrTyp, ParmIndirect, AttrDoCopy, sigOff);
+  }
+
+  EightByteInfo ebi(paramType, tm_);
+  auto &hfa = ebi.getHFA();
+  if (hfa.number != 0) {
+    // Is HFA.
+    llvm::Type * abiType = hfa.type;
+    if (hfa.number > 1) {
+      // If it contains multiple elements, make the param as an Array
+      // type. This ensures that an HFA is passed as a whole.
+      abiType = llvm::ArrayType::get(hfa.type, hfa.number);
+    }
+    if (canPassDirectly(0, hfa.number, state)) {
+      state.addDirectSIMDFPArg(hfa.number);
+    } else {
+      state.clearAvailSIMDFPRegs();
+      state.addIndirectArg();
+    }
+    // Whether or not an HFA can be passed in registers, we use
+    // ParmDirect. This is because HFA is passed by value on stack
+    // in indirect cases, and we happen to be able to reuse the
+    // processing logic of the direct cases.
+    return CABIParamInfo(abiType, ParmDirect, AttrNone, sigOff);
+  }
+  if (sz > 16) {
+    // Not an HFA, so the value will be passed in memory on the stack.
+    // Stack is always in address space 0.
+    llvm::Type *ptrTyp = llvm::PointerType::get(ptyp, 0);
+    state.addIndirectArg();
+    return CABIParamInfo(ptrTyp, ParmIndirect, AttrDoCopy, sigOff);
+  }
+
+  // Direct case.
+  auto &regions = ebi.regions();
+  // Make direct/indirect decision
+  CABIParamAttr attr = AttrNone;
+  if (canPassDirectly(regions.size(), 0, state)) {
+    std::vector<llvm::Type *> abiTypes;
+    for (auto &ebr : regions) {
+      abiTypes.push_back(ebr.abiDirectType);
+      if (ebr.attr != AttrNone) {
+        assert(attr == AttrNone || attr == ebr.attr);
+        attr = ebr.attr;
+      }
+      state.addDirectIntArg();
+    }
+    return CABIParamInfo(abiTypes, ParmDirect, attr, sigOff);
+  } else {
+    state.clearAvailIntRegs();
+    state.addIndirectArg();
+    llvm::Type *abiType = regions[0].abiDirectType;
+    if (regions.size() > 1) {
+      // Convert the argument to an array type so that the backend decides for
+      // the argument as a whole whether it can be passed in registers.
+      abiType = llvm::ArrayType::get(tm_->llvmArbitraryIntegerType(8), regions.size());
+    }
+    // Pass by value on stack, so use ParmDirect.
+    return CABIParamInfo(abiType, ParmDirect, AttrNone, sigOff);
+  }
+}
+
+CABIParamInfo CABIOracleARM_AAPCS::analyzeABIReturn(Btype *resultType,
+                                                    ABIState &state) {
+  llvm::Type *rtyp = resultType->type();
+
+  if (rtyp == tm_->llvmVoidType()) {
+    // This corresponds to a function with no returns or
+    // returning an empty composite.
+    llvm::Type *voidType = tm_->llvmVoidType();
+    return CABIParamInfo(voidType, ParmIgnore, AttrNone, -1);
+  }
+
+  // If rtyp is llvmVoidType, we may not be able to get its size,
+  // so we can't combine the following if statement with the above one.
+  int64_t sz = tm_->typeSize(resultType);
+  if (sz == 0) {
+    // This corresponds to a function with no returns or
+    // returning an empty composite.
+    llvm::Type *voidType = tm_->llvmVoidType();
+    return CABIParamInfo(voidType, ParmIgnore, AttrNone, -1);
+  }
+
+  // Go has only two floating point types: float32 and float64, so the size of
+  // an HFA does not exceed 32 bytes.
+  if (sz > 32) {
+    // Return value will be passed in memory, via a hidden
+    // struct return param.
+    // It is on stack, therefore address space 0.
+    llvm::Type *ptrTyp = llvm::PointerType::get(rtyp, 0);
+    // Indirect return value is passed by register R8, so doesn't occupy any int
+    // register.
+    state.addIndirectReturnForARM_AAPCS();
+    return CABIParamInfo(ptrTyp, ParmIndirect, AttrStructReturn, 0);
+  }
+
+  EightByteInfo ebi(resultType, tm_);
+  auto &hfa = ebi.getHFA();
+  if (hfa.number != 0) {
+    // Is HFA.
+    // If only one element, don't bother to make a llvm struct type.
+    if (hfa.number == 1) {
+      return CABIParamInfo(hfa.type, ParmDirect, AttrNone, -1);
+    }
+    std::vector<llvm::Type *> fields;
+    for (unsigned i = 0; i < hfa.number; ++i) {
+      fields.push_back(hfa.type);
+    }
+    llvm::Type *abiTyp = tm_->makeLLVMStructType(fields);
+    return CABIParamInfo(abiTyp, ParmDirect, AttrNone, -1);
+  }
+
+  // The return value is not an HFA and its size exceeds 16 bytes, so it
+  // is passed in memory, via a hidden struct return param.
+  if (sz > 16) {
+    llvm::Type *ptrTyp = llvm::PointerType::get(rtyp, 0);
+    state.addIndirectReturnForARM_AAPCS();
+    return CABIParamInfo(ptrTyp, ParmIndirect, AttrStructReturn, 0);
+  }
+
+  // Direct case
+  auto &regions = ebi.regions();
+  if (regions.size() == 1) {
+    // Single value
+    return CABIParamInfo(regions[0].abiDirectType, ParmDirect, regions[0].attr,
+                         -1);
+  }
+
+  // Two-element struct
+  assert(regions.size() == 2);
+  llvm::Type *abiTyp = tm_->makeLLVMTwoElementStructType(
+      regions[0].abiDirectType, regions[1].abiDirectType);
+  return CABIParamInfo(abiTyp, ParmDirect, AttrNone, -1);
+}
+
+//......................................................................
diff --git a/bridge/go-llvm-cabi-oracle.h b/bridge/go-llvm-cabi-oracle.h
index f146522..c186c38 100644
--- a/bridge/go-llvm-cabi-oracle.h
+++ b/bridge/go-llvm-cabi-oracle.h
@@ -12,8 +12,8 @@
 //
 // There are many possible complications, permutations, and oddities when
 // it comes to runtime calling conventions; the code here currently supports
-// only x86_64 SysV, which gets rid of many of the corner cases that can
-// be found in the corresponding code in Clang.
+// only x86_64 SysV and ARM AAPCS64, which gets rid of many of the corner
+// cases that can be found in the corresponding code in Clang.
 //
 //===----------------------------------------------------------------------===//
 
@@ -49,16 +49,19 @@
 
 };
 
-// Attributes on parameters. These correspond directly to the LLVM attrs
-// of the same name.
+// Attributes on parameters. Most of them correspond directly to the
+// LLVM attrs of the same name.
 
 enum CABIParamAttr : uint8_t {
-  AttrNone=0,
+  AttrNone = 0,
   AttrStructReturn,
   AttrByVal,
   AttrNest,
   AttrZext,
   AttrSext,
+  // For indirect parameter, do a copy of the parameter on stack and
+  // pass the address of the copy to callee.
+  AttrDoCopy,
 };
 
 // Container class for storing info on how a specific parameter is
@@ -133,8 +136,11 @@
   // This constant specifies the maximum possible size of vectors abiTypes_
   // in the direct parameter passing case.
   // For X86_64_SysV, the size of paramInfo.abiTypes() can't be larger than 2,
-  // because parameters that are larger than 16 bytes are passed indirectly.
-  static const unsigned int ABI_TYPES_MAX_SIZE = 2;
+  // because parameters that are larger than 16 bytes are passed indirectly. For
+  // ARM_AAPCS, an HFA can have up to 4 elements, so the size can be as large as
+  // 4. Currently we simply set this value to the maximum across the supported
+  // platforms.
+  static const unsigned int ABI_TYPES_MAX_SIZE = 4;
 
  private:
   std::vector<llvm::Type *> abiTypes_;
@@ -228,4 +234,18 @@
   CABIParamDisp classifyArgType(Btype *btype);
 };
 
+// This class implements the ARM AAPCS64 calling convention.
+class CABIOracleARM_AAPCS : public CABIOracleArgumentAnalyzer {
+ public:
+  // Create an analyzer that classifies parameter and result types
+  // for AAPCS64; the type manager is handed on to the base-class
+  // constructor.
+  CABIOracleARM_AAPCS(TypeManager *typeManager);
+  CABIParamInfo analyzeABIParam(Btype *pType, ABIState &state);
+  CABIParamInfo analyzeABIReturn(Btype *resultType, ABIState &state);
+
+ private:
+  bool canPassDirectly(unsigned regsInt, unsigned regsSIMDFP, ABIState &state);
+};
+
 #endif // LLVMGOFRONTEND_GO_LLVM_CABI_ORACLE_H
diff --git a/bridge/go-llvm-materialize.cpp b/bridge/go-llvm-materialize.cpp
index 9147ab6..5ce47bf 100644
--- a/bridge/go-llvm-materialize.cpp
+++ b/bridge/go-llvm-materialize.cpp
@@ -1194,6 +1194,24 @@
         llvm::Type *pt = llvm::PointerType::get(vt->getPointerElementType(), 0);
         val = builder.CreateAddrSpaceCast(val, pt, castname);
       }
+
+      // For some architectures, such as arm64, the indirect parameter needs to
+      // be copied to the space allocated by the caller on the stack, and pass
+      // the address of the copied version to the callee.
+      if (paramInfo.attr() == AttrDoCopy) {
+        BlockLIRBuilder bbuilder(state.callerFcn->function(), this);
+        TypeManager *tm = state.oracle.tm();
+        Btype *bty = fnarg->btype();
+        uint64_t sz = tm->typeSize(bty);
+        uint64_t algn = tm->typeAlignment(bty);
+        std::string tname(namegen("doCopy.addr"));
+        llvm::Value *tmpV = state.callerFcn->createTemporary(bty, tname);
+        bbuilder.CreateMemCpy(tmpV, algn, val, algn, sz);
+        std::vector<llvm::Instruction *> instructions = bbuilder.instructions();
+        for (auto i : instructions)
+          state.instructions.appendInstruction(i);
+        val = tmpV;
+      }
       state.llargs.push_back(val);
       continue;
     }
diff --git a/cmake/modules/ConfigSetup.cmake b/cmake/modules/ConfigSetup.cmake
index f0a0e41..0c220e2 100644
--- a/cmake/modules/ConfigSetup.cmake
+++ b/cmake/modules/ConfigSetup.cmake
@@ -141,5 +141,5 @@
 endif()
 set(USE_LIBFFI 1)
 
-# _Unwind_GetIPInfo is defined on Linux/AMD46.
+# _Unwind_GetIPInfo is defined on Linux/AMD64.
 set(HAVE_GETIPINFO 1)
diff --git a/driver/CompileGo.cpp b/driver/CompileGo.cpp
index 6875878..2dd64c0 100644
--- a/driver/CompileGo.cpp
+++ b/driver/CompileGo.cpp
@@ -634,10 +634,15 @@
                                   true);
   bridge_->setNoFpElim(!omitFp);
 
+  bool supportSplitStack = true;
+#ifndef USING_SPLIT_STACK
+  supportSplitStack = false;
+#endif
+
   bool useSplitStack =
       driver_.reconcileOptionPair(gollvm::options::OPT_fsplit_stack,
                                   gollvm::options::OPT_fno_split_stack,
-                                  true);
+                                  supportSplitStack);
   bridge_->setUseSplitStack(useSplitStack);
 
   // Honor -fdebug-prefix=... option.
@@ -810,6 +815,9 @@
     case Triple::x86_64:
       cconv_ = CallingConv::X86_64_SysV;
       break;
+    case Triple::aarch64:
+      cconv_ = CallingConv::ARM_AAPCS;
+      break;
     default:
       errs() << "currently Gollvm is not supported on architecture "
              << triple_.getArchName().str()<< "\n";
diff --git a/driver/GccUtils.cpp b/driver/GccUtils.cpp
index 0815ca3..78b88f3 100644
--- a/driver/GccUtils.cpp
+++ b/driver/GccUtils.cpp
@@ -180,7 +180,9 @@
       // more triples to be identified and added
       s.tripleAliases = {
         triple_.str(),
-        "aarch64-linux-gnu", "aarch64-unknown-linux-gnu"
+        "aarch64-linux-gnu", "aarch64-unknown-linux-gnu",
+        "aarch64-pc-linux-gnu", "aarch64-redhat-linux",
+        "aarch64-suse-linux"
       };
       // multilib is not supported on major aarch64/arm64 linux distributions
       // subject to change when more scenarios to be taken into account
diff --git a/unittests/BackendCore/BackendArrayStruct.cpp b/unittests/BackendCore/BackendArrayStruct.cpp
index 1b5cc6b..f38882b 100644
--- a/unittests/BackendCore/BackendArrayStruct.cpp
+++ b/unittests/BackendCore/BackendArrayStruct.cpp
@@ -22,7 +22,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendArrayStructTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendArrayStructTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendCABIOracleTests.cpp b/unittests/BackendCore/BackendCABIOracleTests.cpp
index fc6ef51..f43f8a8 100644
--- a/unittests/BackendCore/BackendCABIOracleTests.cpp
+++ b/unittests/BackendCore/BackendCABIOracleTests.cpp
@@ -23,7 +23,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendCABIOracleTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendCABIOracleTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
@@ -253,6 +253,191 @@
   }
 }
 
+TEST(BackendCABIOracleTests, ExtendedArm64) {
+  LLVMContext C;
+  std::unique_ptr<Llvm_backend> bep(
+      new Llvm_backend(C, nullptr, nullptr, 0, llvm::CallingConv::ARM_AAPCS));
+  Llvm_backend *be = bep.get();
+
+  Btype *bi8t = be->integer_type(false, 8);
+  Btype *bu8t = be->integer_type(true, 8);
+  Btype *bu64t = be->integer_type(true, 64);
+  Btype *bu32t = be->integer_type(true, 32);
+  Btype *bi16t = be->integer_type(false, 16);
+  Btype *bf32t = be->float_type(32);
+  Btype *bf64t = be->float_type(64);
+  Btype *bpu64t = be->pointer_type(bu64t);
+  Btype *bpf64t = be->pointer_type(bf64t);
+  Btype *st0 = mkBackendStruct(be, nullptr);
+  Btype *st1 = mkBackendStruct(be, bi8t, "a", bu8t, "b", bf32t, "c", nullptr);
+  Btype *st2 = mkBackendStruct(be, bf64t, "f1", bf64t, "f2", nullptr);
+  Btype *st3 = mkBackendStruct(be, st2, "f1", bi8t, "f2", nullptr);
+  Btype *st4 = mkBackendStruct(be, bf32t, "f1", bf32t, "f2", nullptr);
+  Btype *st5 = mkBackendStruct(be, bf32t, "f1", nullptr);
+  Btype *st6 = mkBackendStruct(be, bf32t, "f1", bi8t, "a", bu8t, "b", bu64t,
+                               "c", nullptr);
+  Btype *st7 = mkBackendStruct(be, bf32t, "f1", bu32t, "f2", nullptr);
+  Btype *st8 = mkBackendStruct(be, bi8t, "f1", bi16t, "f2", st7, "f3", nullptr);
+  Btype *stii = mkBackendStruct(be, bu64t, "a", bu64t, "b", nullptr);
+  Btype *stip = mkBackendStruct(be, bu64t, "a", bpu64t, "b", nullptr);
+  Btype *stpi = mkBackendStruct(be, bpu64t, "a", bu64t, "b", nullptr);
+  Btype *stpp = mkBackendStruct(be, bpu64t, "a", bpu64t, "b", nullptr);
+  Btype *at0 = be->array_type(bu32t, mkInt64Const(be, int64_t(0)));
+  Btype *at1 = be->array_type(bu32t, mkInt64Const(be, int64_t(1)));
+  Btype *at2 = be->array_type(bu32t, mkInt64Const(be, int64_t(3)));
+  Btype *at3 = be->array_type(bu8t, mkInt64Const(be, int64_t(16)));
+
+  struct FcnItem {
+    FcnItem(const std::vector<Btype *> &r, const std::vector<Btype *> &p,
+            const char *d, const char *t)
+        : results(r), parms(p), expDump(d), expTyp(t) {}
+    std::vector<Btype *> results;
+    std::vector<Btype *> parms;
+    const char *expDump;
+    const char *expTyp;
+  };
+
+  Btype *nt = nullptr;
+  std::vector<FcnItem> items = {
+
+      // 1: no results and no params; only the AttrNest i8* param appears.
+      FcnItem({}, {},
+              "Return: Ignore { void } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0",
+              "void (i8*)"),
+
+      // 2: a single int8 result is returned directly with AttrSext.
+      FcnItem({bi8t}, {},
+              "Return: Direct AttrSext { i8 } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0",
+              "i8 (i8*)"),
+
+      // 3: a single int8 param is passed directly with AttrSext.
+      FcnItem({}, {bi8t},
+              "Return: Ignore { void } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct AttrSext { i8 } sigOffset: 1",
+              "void (i8*, i8)"),
+
+      // 4: the one-element HFA st5 is passed as a plain float.
+      FcnItem({}, {st5, bpf64t},
+              "Return: Ignore { void } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { float } sigOffset: 1 "
+              "Param 3: Direct { double* } sigOffset: 2",
+              "void (i8*, float, double*)"),
+
+      // 5: int8+float64 results become { i64, i64 }; empty struct param ignored.
+      FcnItem({bi8t, bf64t}, {bi8t, bu8t, st0},
+              "Return: Direct { { i64, i64 } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct AttrSext { i8 } sigOffset: 1 "
+              "Param 3: Direct AttrZext { i8 } sigOffset: 2 "
+              "Param 4: Ignore { void } sigOffset: -1",
+              "{ i64, i64 } (i8*, i8, i8)"),
+
+      // 6: HFAs st2/st4 are passed as FP arrays; st1 is packed into one i64.
+      FcnItem({st2}, {st2, st0, st4, st1},
+              "Return: Direct { { double, double } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { [2 x double] } sigOffset: 1 "
+              "Param 3: Ignore { void } sigOffset: -1 "
+              "Param 4: Direct { [2 x float] } sigOffset: 2 "
+              "Param 5: Direct { i64 } sigOffset:  3",
+              "{ double, double } (i8*, [2 x double], [2 x float], i64)"),
+
+      // 7: non-HFA st3 is returned via sret and passed Indirect AttrDoCopy.
+      FcnItem({st3}, {st3, st0, bu8t},
+              "Return: Indirect AttrStructReturn { { { double, double }, i8 "
+              "}* } sigOffset: 0 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 1 "
+              "Param 2: Indirect AttrDoCopy { { { double, double }, i8 }* } "
+              "sigOffset: 2 "
+              "Param 3: Ignore { void } sigOffset: -1 "
+              "Param 4: Direct AttrZext { i8 } sigOffset: 3 ",
+              "void ({ { double, double }, i8 }*, i8*, "
+              "{ { double, double }, i8 }*, i8)"),
+
+      // 8: st6 is split into two i64 regions, both as result and as param.
+      FcnItem({st6}, {st6, st6},
+              "Return: Direct { { i64, i64 } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { i64, i64 } sigOffset: 1 "
+              "Param 3: Direct { i64, i64 } sigOffset: 3",
+              "{ i64, i64 } (i8*, i64, i64, i64, i64)"),
+
+      // 9: st8 (with nested struct st7) is split into an i64 and an i32.
+      FcnItem({st8}, {st8},
+              "Return: Direct { { i64, i32 } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { i64, i32 } sigOffset: 1",
+              "{ i64, i32 } (i8*, i64, i32)"),
+
+      // 10: zero-length array result is ignored; [1]uint32 is passed as i32.
+      FcnItem({at0}, {at1},
+              "Return: Ignore { void } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { i32 } sigOffset: 1",
+              "void (i8*, i32)"),
+
+      // 11: [3]uint32 returns as { i64, i32 }; [16]uint8 passes as two i64.
+      FcnItem({at2}, {at3},
+              "Return: Direct { { i64, i32 } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { i64, i64 } sigOffset: 1",
+              "{ i64, i32 } (i8*, i64, i64)"),
+
+      // 12
+      // Make sure pointerness is preserved.
+      FcnItem({stip}, {stii, stpp, stpi},
+              "Return: Direct { { i64, i8* } } sigOffset: -1 "
+              "Param 1: Direct AttrNest { i8* } sigOffset: 0 "
+              "Param 2: Direct { i64, i64 } sigOffset: 1 "
+              "Param 3: Direct { i8*, i8* } sigOffset: 3 "
+              "Param 4: Direct { i8*, i64 } sigOffset: 5",
+              "{ i64, i8* } (i8*, i64, i64, i8*, i8*, i8*, i64)"),
+  };
+
+  unsigned count = 1;
+  for (auto &item : items) {
+    std::vector<Backend::Btyped_identifier> results;
+    std::vector<Backend::Btyped_identifier> params;
+    for (auto &r : item.results)
+      results.push_back(mkid(r));
+    for (auto &p : item.parms)
+      params.push_back(mkid(p));
+    Btype *rt = nullptr;
+    if (results.size() > 1)
+      rt = be->struct_type(results);
+    Btype *t = be->function_type(mkid(nt), params, results, rt, Location());
+    BFunctionType *bft = t->castToBFunctionType();
+    CABIOracle cab(bft, be->typeManager());
+
+    {
+      std::string reason;
+      bool equal = difftokens(item.expDump, cab.toString(), reason);
+      EXPECT_EQ("pass", equal ? "pass" : reason);
+      if (!equal) {
+        std::cerr << "count: " << count << "\n";
+        std::cerr << "exp:\n" << item.expDump << "\n";
+        std::cerr << "act:\n" << cab.toString() << "\n";
+      }
+    }
+    {
+      std::string reason;
+      std::string result(repr(cab.getFunctionTypeForABI()));
+      bool equal = difftokens(item.expTyp, result, reason);
+      EXPECT_EQ("pass", equal ? "pass" : reason);
+      if (!equal) {
+        std::cerr << "count: " << count << "\n";
+        std::cerr << "exp:\n" << item.expTyp << "\n";
+        std::cerr << "act:\n" << result << "\n";
+      }
+    }
+    count++;
+  }
+}
+
 TEST(BackendCABIOracleTests, RecursiveCall1Amd64) {
   FcnTestHarness h(llvm::CallingConv::X86_64_SysV);
   Llvm_backend *be = h.be();
@@ -378,6 +563,128 @@
   EXPECT_FALSE(broken && "Module failed to verify.");
 }
 
+TEST(BackendCABIOracleTests, RecursiveCall1Arm64) {
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS);
+  Llvm_backend *be = h.be();
+
+  // type s1 struct {
+  //   f1, f2 float32
+  //   i1, i2, i3 int16
+  // }
+  // type s2 struct {
+  //   k float64
+  //   f1, f2 float32
+  // }
+  // type s3 struct {
+  //   f1 s1
+  //   f2 s2
+  // }
+  // type s4 struct {
+  // }
+  // func foo(x s1, y s2, z s4, sm1 uint8, sm2 int8, w s3) s2 {
+  //   if (sm1 == 0) {
+  //     return y
+  //   }
+  //   return foo(x, y, z, sm1-1, sm2, w)
+  // }
+  // The IR for the recursive-call return statement is what gets checked below.
+
+  // Create struct types
+  Btype *bf32t = be->float_type(32);
+  Btype *bf64t = be->float_type(64);
+  Btype *bi16t = be->integer_type(false, 16);
+  Btype *bi8t = be->integer_type(false, 8);
+  Btype *bu8t = be->integer_type(true, 8);
+  Btype *s1 = mkBackendStruct(be, bf32t, "f1", bf32t, "f2", bi16t, "i1", bi16t,
+                              "i2", bi16t, "i3", nullptr);
+  Btype *s2 =
+      mkBackendStruct(be, bf64t, "k", bf32t, "f1", bf32t, "f2", nullptr);
+  Btype *s3 = mkBackendStruct(be, s1, "f1", s2, "f2", nullptr);
+  Btype *s4 = mkBackendStruct(be, nullptr);
+
+  // Create function type
+  BFunctionType *befty1 =
+      mkFuncTyp(be, L_PARM, s1, L_PARM, s2, L_PARM, s4, L_PARM, bu8t, L_PARM,
+                bi8t, L_PARM, s3, L_RES, s2, L_END);
+  Bfunction *func = h.mkFunction("foo", befty1);
+
+  // sm1 == 0
+  Bvariable *p3 = func->getNthParamVar(3);
+  Location loc;
+  Bexpression *vex = be->var_expression(p3, loc);
+  Bexpression *c0 = be->convert_expression(bu8t, mkInt32Const(be, 0), loc);
+  Bexpression *eq = be->binary_expression(OPERATOR_EQEQ, vex, c0, loc);
+
+  // call: foo(x, y, z, sm1-1, sm2, w)
+  Bexpression *fn = be->function_code_expression(func, loc);
+  std::vector<Bexpression *> args;
+  Bvariable *p0 = func->getNthParamVar(0);
+  args.push_back(be->var_expression(p0, loc));
+
+  Bvariable *p1 = func->getNthParamVar(1);
+  args.push_back(be->var_expression(p1, loc));
+
+  Bvariable *p2 = func->getNthParamVar(2);
+  args.push_back(be->var_expression(p2, loc));
+
+  Bvariable *p3x = func->getNthParamVar(3);
+  Bexpression *vex3 = be->var_expression(p3x, loc);
+  Bexpression *c1 = be->convert_expression(bu8t, mkInt32Const(be, 1), loc);
+  Bexpression *minus = be->binary_expression(OPERATOR_MINUS, vex3, c1, loc);
+  args.push_back(minus);
+
+  Bvariable *p4 = func->getNthParamVar(4);
+  args.push_back(be->var_expression(p4, loc));
+
+  Bvariable *p5 = func->getNthParamVar(5);
+  args.push_back(be->var_expression(p5, loc));
+  Bexpression *call = be->call_expression(func, fn, args, nullptr, h.loc());
+
+  // return y
+  std::vector<Bexpression *> rvals1;
+  rvals1.push_back(be->var_expression(p1, loc));
+  Bstatement *rst1 = h.mkReturn(rvals1, FcnTestHarness::NoAppend);
+
+  // return call
+  std::vector<Bexpression *> rvals2;
+  rvals2.push_back(call);
+  Bstatement *rst2 = h.mkReturn(rvals2, FcnTestHarness::NoAppend);
+
+  const char *exp = R"RAW_RESULT(
+    %p3.ld.0 = load i8, i8* %p3.addr
+    %sub.0 = sub i8 %p3.ld.0, 1
+    %p4.ld.0 = load i8, i8* %p4.addr
+    %cast.1 = bitcast { float, float, i16, i16, i16 }* %p0.addr to { i64, i48 }*
+    %field0.0 = getelementptr inbounds { i64, i48 }, { i64, i48 }* %cast.1, i32 0, i32 0
+    %ld.1 = load i64, i64* %field0.0
+    %field1.0 = getelementptr inbounds { i64, i48 }, { i64, i48 }* %cast.1, i32 0, i32 1
+    %ld.2 = load i48, i48* %field1.0
+    %cast.2 = bitcast { double, float, float }* %p1.addr to { i64, i64 }*
+    %field0.1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %cast.2, i32 0, i32 0
+    %ld.3 = load i64, i64* %field0.1
+    %field1.1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %cast.2, i32 0, i32 1
+    %ld.4 = load i64, i64* %field1.1
+    %cast.3 = bitcast { { float, float, i16, i16, i16 }, { double, float, float } }* %doCopy.addr.0 to i8*
+    %cast.4 = bitcast { { float, float, i16, i16, i16 }, { double, float, float } }* %p5 to i8*
+    call addrspace(0) void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %cast.3, i8* align 8 %cast.4, i64 32, i1 false)
+    %call.0 = call addrspace(0) { i64, i64 } @foo(i8* nest undef, i64 %ld.1, i48 %ld.2, i64 %ld.3, i64 %ld.4, i8 zeroext %sub.0, i8 signext %p4.ld.0, { { float, float, i16, i16, i16 }, { double, float, float } }* %doCopy.addr.0)
+    %cast.5 = bitcast { double, float, float }* %sret.actual.0 to { i64, i64 }*
+    store { i64, i64 } %call.0, { i64, i64 }* %cast.5
+    %cast.6 = bitcast { double, float, float }* %sret.actual.0 to { i64, i64 }*
+    %ld.5 = load { i64, i64 }, { i64, i64 }* %cast.6
+    ret { i64, i64 } %ld.5
+  )RAW_RESULT";
+
+  bool isOK = h.expectStmt(rst2, exp);
+  EXPECT_TRUE(isOK && "Statement does not have expected contents");
+
+  // if (sm1 == 0) { return y } else { return foo(...) }
+  h.mkIf(eq, rst1, rst2);
+
+  bool broken = h.finish(PreserveDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+}
+
 TEST(BackendCABIOracleTests, PassAndReturnArraysAmd64) {
   FcnTestHarness h(llvm::CallingConv::X86_64_SysV);
   Llvm_backend *be = h.be();
@@ -425,6 +732,50 @@
   EXPECT_FALSE(broken && "Module failed to verify.");
 }
 
+TEST(BackendCABIOracleTests, PassAndReturnArraysArm64) {
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS);
+  Llvm_backend *be = h.be();
+
+  Btype *bf32t = be->float_type(32);
+  Btype *bf64t = be->float_type(64);
+  Btype *at2f = be->array_type(bf32t, mkInt64Const(be, int64_t(2)));
+  Btype *at3d = be->array_type(bf64t, mkInt64Const(be, int64_t(3)));
+
+  // func foo(fp [2]float32) [3]float64
+  BFunctionType *befty1 = mkFuncTyp(be, L_PARM, at2f, L_RES, at3d, L_END);
+  Bfunction *func = h.mkFunction("foo", befty1);
+
+  // Recursive call passing the function's own parameter: foo(fp)
+  Location loc;
+  Bvariable *p0 = func->getNthParamVar(0);
+  Bexpression *vex = be->var_expression(p0, loc);
+  Bexpression *fn = be->function_code_expression(func, loc);
+  std::vector<Bexpression *> args;
+  args.push_back(vex);
+  Bexpression *call = be->call_expression(func, fn, args, nullptr, h.loc());
+
+  // return foo(fp)
+  std::vector<Bexpression *> rvals;
+  rvals.push_back(call);
+  h.mkReturn(rvals);
+
+  const char *exp = R"RAW_RESULT(
+    %ld.0 = load [2 x float], [2 x float]* %p0.addr
+    %call.0 = call addrspace(0) { double, double, double } @foo(i8* nest undef, [2 x float] %ld.0)
+    %cast.1 = bitcast [3 x double]* %sret.actual.0 to { double, double, double }*
+    store { double, double, double } %call.0, { double, double, double }* %cast.1
+    %cast.2 = bitcast [3 x double]* %sret.actual.0 to { double, double, double }*
+    %ld.1 = load { double, double, double }, { double, double, double }* %cast.2
+    ret { double, double, double } %ld.1
+  )RAW_RESULT";
+
+  bool isOK = h.expectBlock(exp);
+  EXPECT_TRUE(isOK && "Block does not have expected contents");
+
+  bool broken = h.finish(PreserveDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+}
+
 TEST_P(BackendCABIOracleTests, EmptyStructParamsAndReturns) {
   auto cc = GetParam();
   FcnTestHarness h(cc);
@@ -574,4 +925,71 @@
   EXPECT_FALSE(broken && "Module failed to verify.");
 }
 
+TEST(BackendCABIOracleTests, PassAndReturnComplexArm64) {
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS);
+  Llvm_backend *be = h.be();
+
+  Btype *bc64t = be->complex_type(64);
+  Btype *bc128t = be->complex_type(128);
+
+  // func foo(x complex64, y complex128) complex64
+  BFunctionType *befty1 =
+      mkFuncTyp(be, L_PARM, bc64t, L_PARM, bc128t, L_RES, bc64t, L_END);
+  Bfunction *func = h.mkFunction("foo", befty1);
+
+  // Local z initialized by calling foo with its own parameters: z = foo(x, y)
+  Location loc;
+  Bvariable *x = func->getNthParamVar(0);
+  Bvariable *y = func->getNthParamVar(1);
+  Bexpression *xvex = be->var_expression(x, loc);
+  Bexpression *yvex = be->var_expression(y, loc);
+  Bexpression *fn1 = be->function_code_expression(func, loc);
+  std::vector<Bexpression *> args1 = {xvex, yvex};
+  Bexpression *call1 = be->call_expression(func, fn1, args1, nullptr, h.loc());
+  h.mkLocal("z", bc64t, call1);
+
+  // Call with constant args
+  // foo(1+2i, 3+4i)
+  mpc_t mpc_val1, mpc_val2;
+  mpc_init2(mpc_val1, 256);
+  mpc_set_d_d(mpc_val1, 1.0, 2.0, GMP_RNDN);
+  mpc_init2(mpc_val2, 256);
+  mpc_set_d_d(mpc_val2, 3.0, 4.0, GMP_RNDN);
+  Bexpression *ccon1 = be->complex_constant_expression(bc64t, mpc_val1);
+  Bexpression *ccon2 = be->complex_constant_expression(bc128t, mpc_val2);
+  mpc_clear(mpc_val1);
+  mpc_clear(mpc_val2);
+  Bexpression *fn2 = be->function_code_expression(func, loc);
+  std::vector<Bexpression *> args2 = {ccon1, ccon2};
+  Bexpression *call2 = be->call_expression(func, fn2, args2, nullptr, h.loc());
+
+  // return foo(1+2i, 3+4i)
+  std::vector<Bexpression *> rvals = {call2};
+  h.mkReturn(rvals);
+
+  const char *exp = R"RAW_RESULT(
+    %cast.0 = bitcast { float, float }* %p0.addr to [2 x float]*
+    %ld.0 = load [2 x float], [2 x float]* %cast.0
+    %cast.1 = bitcast { double, double }* %p1.addr to [2 x double]*
+    %ld.1 = load [2 x double], [2 x double]* %cast.1
+    %call.0 = call addrspace(0) { float, float } @foo(i8* nest undef, [2 x float] %ld.0, [2 x double] %ld.1)
+    store { float, float } %call.0, { float, float }* %sret.actual.0
+    %cast.3 = bitcast { float, float }* %z to i8*
+    %cast.4 = bitcast { float, float }* %sret.actual.0 to i8*
+    call addrspace(0) void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %cast.3, i8* align 4 %cast.4, i64 8, i1 false)
+    %ld.2 = load [2 x float], [2 x float]* bitcast ({ float, float }* @const.0 to [2 x float]*)
+    %ld.3 = load [2 x double], [2 x double]* bitcast ({ double, double }* @const.1 to [2 x double]*)
+    %call.1 = call addrspace(0) { float, float } @foo(i8* nest undef, [2 x float] %ld.2, [2 x double] %ld.3)
+    store { float, float } %call.1, { float, float }* %sret.actual.1
+    %ld.4 = load { float, float }, { float, float }* %sret.actual.1
+    ret { float, float } %ld.4
+  )RAW_RESULT";
+
+  bool isOK = h.expectBlock(exp);
+  EXPECT_TRUE(isOK && "Block does not have expected contents");
+
+  bool broken = h.finish(PreserveDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+}
+
 } // namespace
diff --git a/unittests/BackendCore/BackendCallTests.cpp b/unittests/BackendCore/BackendCallTests.cpp
index 9ed5d93..61fdb73 100644
--- a/unittests/BackendCore/BackendCallTests.cpp
+++ b/unittests/BackendCore/BackendCallTests.cpp
@@ -22,7 +22,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendCallTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendCallTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendCoreTests.cpp b/unittests/BackendCore/BackendCoreTests.cpp
index 7773a26..ba7d97a 100644
--- a/unittests/BackendCore/BackendCoreTests.cpp
+++ b/unittests/BackendCore/BackendCoreTests.cpp
@@ -20,7 +20,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendCoreTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendCoreTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendDebugEmit.cpp b/unittests/BackendCore/BackendDebugEmit.cpp
index 03dfdd7..b54cb05 100644
--- a/unittests/BackendCore/BackendDebugEmit.cpp
+++ b/unittests/BackendCore/BackendDebugEmit.cpp
@@ -25,7 +25,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendDebugEmit,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendDebugEmit::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
@@ -89,6 +89,34 @@
   EXPECT_TRUE(isOK && "Function does not have expected contents");
 }
 
+TEST(BackendDebugEmit, TestSimpleDecl2Arm64) {
+  // Test that parameters of an empty function are handled correctly.
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS);
+  Llvm_backend *be = h.be();
+  Btype *bi64t = be->integer_type(false, 64);
+  Btype *bst = mkBackendStruct(be, bi64t, "f1", bi64t, "f2", bi64t, "f3",
+                               nullptr); // large (3 x i64) struct, passed by reference
+  BFunctionType *befty = mkFuncTyp(be, L_PARM, bst, L_END);
+  Bfunction *func = h.mkFunction("foo", befty);
+
+  // function with no code
+
+  bool broken = h.finish(PreserveDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+
+  const char *exp = R"RAW_RESULT(
+    define void @foo(i8* nest %nest.0, { i64, i64, i64 }* %p0) #0 {
+    entry:
+      call void @llvm.dbg.declare(metadata { i64, i64, i64 }* %p0, metadata !5,
+                                  metadata !DIExpression()), !dbg !18
+      ret void
+    }
+  )RAW_RESULT";
+
+  bool isOK = h.expectValue(func->function(), exp);
+  EXPECT_TRUE(isOK && "Function does not have expected contents");
+}
+
 // This test is designed to make sure that debug meta-data generation
 // handles corner clases like vars with zero size (empty struct).
 
diff --git a/unittests/BackendCore/BackendExprTests.cpp b/unittests/BackendCore/BackendExprTests.cpp
index b34ac85..3343734 100644
--- a/unittests/BackendCore/BackendExprTests.cpp
+++ b/unittests/BackendCore/BackendExprTests.cpp
@@ -22,7 +22,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendExprTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendExprTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
@@ -1500,6 +1500,82 @@
   EXPECT_TRUE(isOK && "Block does not have expected contents");
 }
 
+TEST(BackendExprTests, TestConditionalExpression3Arm64) {
+  FcnTestHarness h(llvm::CallingConv::ARM_AAPCS);
+  Llvm_backend *be = h.be();
+  Btype *bi32t = be->integer_type(false, 32);
+  Btype *abt = be->array_type(bi32t, mkInt64Const(be, int64_t(16)));
+  // type s2t struct {
+  //   f1 [16]int32
+  //   f2 int32
+  // }
+  Btype *s2t = mkBackendStruct(be, abt, "f1", bi32t, "f2", nullptr);
+
+  // func foo(p0v s2t, p1v int32) (a s2t)
+  BFunctionType *befty1 =
+      mkFuncTyp(be, L_RES, s2t, L_PARM, s2t, L_PARM, bi32t, L_END);
+  Bfunction *func = h.mkFunction("foo", befty1);
+  Location loc;
+
+  // Local var with conditional expression as init
+  Bvariable *p0v = func->getNthParamVar(0);
+  Bvariable *p1v = func->getNthParamVar(1);
+  Bexpression *vep1 = be->var_expression(p1v, loc);
+  // p1v < 7
+  Bexpression *cmp = be->binary_expression(OPERATOR_LT, vep1,
+                                           mkInt32Const(be, int32_t(7)), loc);
+  Bexpression *vep0 = be->var_expression(p0v, loc);
+  Bexpression *bzero = be->zero_expression(s2t);
+
+  // if (p1v < 7) {
+  //   a = p0v
+  // } else {
+  //   a = s2t{}  (the zero value of s2t)
+  // }
+  Bexpression *cond =
+      be->conditional_expression(func, s2t, cmp, vep0, bzero, loc);
+  h.mkLocal("a", s2t, cond);
+
+  const char *exp = R"RAW_RESULT(
+    define void @foo({ [16 x i32], i32 }* sret %sret.formal.0, i8* nest %nest.0, { [16 x i32], i32 }* %p0, i32 %p1) #0 {
+    entry:
+      %p1.addr = alloca i32
+      %a = alloca { [16 x i32], i32 }
+      %tmpv.0 = alloca { [16 x i32], i32 }
+      store i32 %p1, i32* %p1.addr
+      %p1.ld.0 = load i32, i32* %p1.addr
+      %icmp.0 = icmp slt i32 %p1.ld.0, 7
+      %zext.0 = zext i1 %icmp.0 to i8
+      %trunc.0 = trunc i8 %zext.0 to i1
+      br i1 %trunc.0, label %then.0, label %else.0
+
+    then.0:                                           ; preds = %entry
+      %cast.0 = bitcast { [16 x i32], i32 }* %tmpv.0 to i8*
+      %cast.1 = bitcast { [16 x i32], i32 }* %p0 to i8*
+      call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %cast.0, i8* align 4 %cast.1, i64 68, i1 false)
+      br label %fallthrough.0
+
+    fallthrough.0:                                    ; preds = %else.0, %then.0
+      %cast.4 = bitcast { [16 x i32], i32 }* %a to i8*
+      %cast.5 = bitcast { [16 x i32], i32 }* %tmpv.0 to i8*
+      call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %cast.4, i8* align 4 %cast.5, i64 68, i1 false)
+      ret void
+
+    else.0:                                           ; preds = %entry
+      %cast.2 = bitcast { [16 x i32], i32 }* %tmpv.0 to i8*
+      %cast.3 = bitcast { [16 x i32], i32 }* @const.0 to i8*
+      call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %cast.2, i8* align 4 %cast.3, i64 68, i1 false)
+      br label %fallthrough.0
+    }
+  )RAW_RESULT";
+
+  bool broken = h.finish(StripDebugInfo);
+  EXPECT_FALSE(broken && "Module failed to verify.");
+
+  bool isOK = h.expectValue(func->function(), exp);
+  EXPECT_TRUE(isOK && "Block does not have expected contents");
+}
+
 TEST_P(BackendExprTests, TestCompoundExpression) {
   auto cc = GetParam();
   FcnTestHarness h(cc, "foo");
diff --git a/unittests/BackendCore/BackendFcnTests.cpp b/unittests/BackendCore/BackendFcnTests.cpp
index 8a7e9bb..dd5bc52 100644
--- a/unittests/BackendCore/BackendFcnTests.cpp
+++ b/unittests/BackendCore/BackendFcnTests.cpp
@@ -20,7 +20,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendFcnTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendFcnTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendNodeTests.cpp b/unittests/BackendCore/BackendNodeTests.cpp
index 83416ea..dbed932 100644
--- a/unittests/BackendCore/BackendNodeTests.cpp
+++ b/unittests/BackendCore/BackendNodeTests.cpp
@@ -23,7 +23,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendNodeTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendNodeTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendPointerExprTests.cpp b/unittests/BackendCore/BackendPointerExprTests.cpp
index e8a24fe..2cc63fd 100644
--- a/unittests/BackendCore/BackendPointerExprTests.cpp
+++ b/unittests/BackendCore/BackendPointerExprTests.cpp
@@ -22,7 +22,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendPointerExprTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendPointerExprTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendStmtTests.cpp b/unittests/BackendCore/BackendStmtTests.cpp
index 0b07dcb..f00301b 100644
--- a/unittests/BackendCore/BackendStmtTests.cpp
+++ b/unittests/BackendCore/BackendStmtTests.cpp
@@ -20,7 +20,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendStmtTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendStmtTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendTreeIntegrity.cpp b/unittests/BackendCore/BackendTreeIntegrity.cpp
index 06b9e62..0c31828 100644
--- a/unittests/BackendCore/BackendTreeIntegrity.cpp
+++ b/unittests/BackendCore/BackendTreeIntegrity.cpp
@@ -20,7 +20,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendTreeIntegrity,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendTreeIntegrity::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/BackendVarTests.cpp b/unittests/BackendCore/BackendVarTests.cpp
index 149cbc5..ef2fbd4 100644
--- a/unittests/BackendCore/BackendVarTests.cpp
+++ b/unittests/BackendCore/BackendVarTests.cpp
@@ -24,7 +24,7 @@
 
 INSTANTIATE_TEST_CASE_P(
     UnitTest, BackendVarTests,
-    testing::Values(llvm::CallingConv::X86_64_SysV),
+    goBackendUnitTests::CConvs,
     [](const testing::TestParamInfo<BackendVarTests::ParamType> &info) {
       std::string name = goBackendUnitTests::ccName(info.param);
       return name;
diff --git a/unittests/BackendCore/TestUtils.h b/unittests/BackendCore/TestUtils.h
index a75202e..313af8e 100644
--- a/unittests/BackendCore/TestUtils.h
+++ b/unittests/BackendCore/TestUtils.h
@@ -27,11 +27,16 @@
 #include "DiffUtils.h"
 
 #include <stdarg.h>
+#include "gtest/gtest.h"
 
 #define RAW_RESULT(x) #x
 
 namespace goBackendUnitTests {
 
+// All supported calling conventions; 'static' gives each TU its own copy.
+static const auto CConvs = testing::Values(llvm::CallingConv::X86_64_SysV,
+                                           llvm::CallingConv::ARM_AAPCS);
+
 // Convert llvm::CallingConv::ID to its coresponding string name.
 std::string ccName(llvm::CallingConv::ID);
 
diff --git a/unittests/DriverUtils/DriverUtilsTests.cpp b/unittests/DriverUtils/DriverUtilsTests.cpp
index 643e8a8..d6e2fd2 100644
--- a/unittests/DriverUtils/DriverUtilsTests.cpp
+++ b/unittests/DriverUtils/DriverUtilsTests.cpp
@@ -279,7 +279,7 @@
   GCCInstallationDetector detector_;
 };
 
-TEST(DriverUtilsTests, GCCInstallationDetectorBasic) {
+TEST(DriverUtilsTests, GCCInstallationDetectorBasicAmd64) {
 
   // Here we have two installations, version 6 and version 7.
   const char *install = R"RAW_RESULT(
@@ -317,7 +317,32 @@
   EXPECT_TRUE(isOK2);
 }
 
-TEST(DriverUtilsTests, GCCInstallationDetectorSysRoot) {
+TEST(DriverUtilsTests, GCCInstallationDetectorBasicARM64) {
+
+  // Here we have two installations, version 6 and version 7.
+  const char *install = R"RAW_RESULT(
+      /mumble
+      /usr/lib/gcc/aarch64-linux-gnu/blah
+      /usr/lib/gcc/aarch64-linux-gnu/6/crtbegin.o
+      /usr/lib/gcc/aarch64-linux-gnu/7/crtbegin.o
+    )RAW_RESULT";
+
+  // Case 1: no sysroot, looking for the 64-bit compiler.
+  // GCC doesn't support multilib on arm64, so there is no
+  // multilib case to test here.
+  DetectorHarness harness1(install, "aarch64-linux-gnu", "");
+  const char *exp64 = R"RAW_RESULT(
+      version: 7
+      foundTriple: aarch64-linux-gnu
+      libPath: /usr/lib/gcc/aarch64-linux-gnu/7
+      parentLibPath: /usr/lib/gcc/aarch64-linux-gnu/7/../..
+      installPath: /usr/lib/gcc/aarch64-linux-gnu/7
+    )RAW_RESULT";
+  bool isOK1 = expectToString(harness1.detector(), exp64);
+  EXPECT_TRUE(isOK1);
+}
+
+TEST(DriverUtilsTests, GCCInstallationDetectorSysRootAmd64) {
 
   const char *install = R"RAW_RESULT(
       /mumble
@@ -339,7 +364,29 @@
   EXPECT_TRUE(isOK1);
 }
 
-TEST(DriverUtilsTests, GCCInstallationDetectorTripleAliases) {
+TEST(DriverUtilsTests, GCCInstallationDetectorSysRootARM64) {
+
+  const char *install = R"RAW_RESULT(
+      /mumble
+      /usr/lib/gcc/aarch64-linux-gnu/7/crtbegin.o
+      /mysysroot/usr/lib/gcc/aarch64-linux-gnu/6.2.3/crtbegin.o
+    )RAW_RESULT";
+
+  // GCC 7 is installed on the host, but the sysroot holds GCC 6.2.3;
+  // since a sysroot is given, the sysroot install is the one we want.
+  DetectorHarness harness1(install, "aarch64-linux-gnu", "/mysysroot");
+  const char *exp64 = R"RAW_RESULT(
+      version: 6.2.3
+      foundTriple: aarch64-linux-gnu
+      libPath: /mysysroot/usr/lib/gcc/aarch64-linux-gnu/6.2.3
+      parentLibPath: /mysysroot/usr/lib/gcc/aarch64-linux-gnu/6.2.3/../..
+      installPath: /mysysroot/usr/lib/gcc/aarch64-linux-gnu/6.2.3
+    )RAW_RESULT";
+  bool isOK1 = expectToString(harness1.detector(), exp64);
+  EXPECT_TRUE(isOK1);
+}
+
+TEST(DriverUtilsTests, GCCInstallationDetectorTripleAliasesAmd64) {
 
   // Regrettably, there is a fair amount of variation in terms
   // of target triples and how GCC is installed. This test checks
@@ -378,8 +425,46 @@
   EXPECT_TRUE(isOK2);
 }
 
-TEST(DriverUtilsTests, GCCInstallationDetectorBiarchAliases)
-{
+TEST(DriverUtilsTests, GCCInstallationDetectorTripleAliasesARM64) {
+
+  // Regrettably, there is a fair amount of variation in terms
+  // of target triples and how GCC is installed. This test checks
+  // to make sure we can accommodate such differences.
+
+  const char *install = R"RAW_RESULT(
+      /mumble
+      /usr/lib/gcc/aarch64-linux-gnu/5/crtbegin.o
+      /usr/lib/gcc/aarch64-linux-gnu/7/crtbegin.o
+    )RAW_RESULT";
+
+  // Case 1: install is aarch64-linux-gnu, but we are looking for
+  // aarch64-unknown-linux-gnu; the version 7 install should be found.
+  DetectorHarness harness1(install, "aarch64-unknown-linux-gnu", "");
+  const char *exp1 = R"RAW_RESULT(
+      version: 7
+      foundTriple: aarch64-linux-gnu
+      libPath: /usr/lib/gcc/aarch64-linux-gnu/7
+      parentLibPath: /usr/lib/gcc/aarch64-linux-gnu/7/../..
+      installPath: /usr/lib/gcc/aarch64-linux-gnu/7
+    )RAW_RESULT";
+  bool isOK1 = expectToString(harness1.detector(), exp1);
+  EXPECT_TRUE(isOK1);
+
+  // Case 2: install is aarch64-linux-gnu, but we are looking for
+  // aarch64-redhat-linux-gnu; the version 7 install should be found.
+  DetectorHarness harness2(install, "aarch64-redhat-linux-gnu", "");
+  const char *exp2 = R"RAW_RESULT(
+      version: 7
+      foundTriple: aarch64-linux-gnu
+      libPath: /usr/lib/gcc/aarch64-linux-gnu/7
+      parentLibPath: /usr/lib/gcc/aarch64-linux-gnu/7/../..
+      installPath: /usr/lib/gcc/aarch64-linux-gnu/7
+    )RAW_RESULT";
+  bool isOK2 = expectToString(harness2.detector(), exp2);
+  EXPECT_TRUE(isOK2);
+}
+
+TEST(DriverUtilsTests, GCCInstallationDetectorBiarchAliasesAmd64) {
   const char *install = R"RAW_RESULT(
       /mumble
       /usr/lib/gcc/x86_64-redhat-linux/6/lib32/crtbegin.o