//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is an optimization pass for variadic functions. If called from codegen,
// it can serve as the implementation of variadic functions for a given target.
//
// The strategy is to turn the ... part of a variadic function into a va_list
// and fix up the call sites. The majority of the pass is target independent.
// The exceptions are the va_list type itself and the rules for where to store
// variables in memory such that va_arg can iterate over them given a va_list.
//
// The majority of the plumbing is splitting the variadic function into a
// single basic block that packs the variadic arguments into a va_list and
// a second function that does the work of the original. That packing is
// exactly what is done by va_start. Further, the transform from ... to va_list
// replaced va_start with an operation to copy a va_list from the new argument,
// which is exactly a va_copy. This is useful for reducing target-dependence.
//
// A va_list instance is a forward iterator, where the primary operation va_arg
// is dereference-then-increment. This interface forces significant convergent
// evolution between target specific implementations. The variation in runtime
// data layout is limited to that representable by the iterator, parameterised
// by the type passed to the va_arg instruction.
//
// Therefore the majority of the target specific subtlety is packing arguments
// into a stack allocated buffer such that a va_list can be initialised with it
// and the va_arg expansion for the target will find the arguments at runtime.
//
// The aggregate effect is to unblock other transforms, most critically the
// general purpose inliner. Known calls to variadic functions become zero cost.
//
// Consistency with clang is primarily tested by emitting va_arg using clang
// then expanding the variadic functions using this pass, followed by trying
// to constant fold the functions to no-ops.
//
// Target specific behaviour is tested in IR - mainly checking that values are
// put into positions in call frames that make sense for that particular target.
//
// There is one "clever" invariant in use. va_start intrinsics that are not
// within a varidic functions are an error in the IR verifier. When this
// transform moves blocks from a variadic function into a fixed arity one, it
// moves va_start intrinsics along with everything else. That means that the
// va_start intrinsics that need to be rewritten to use the trailing argument
// are exactly those that are in non-variadic functions so no further state
// is needed to distinguish those that need to be rewritten.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#define DEBUG_TYPE "expand-variadics"

using namespace llvm;

namespace {

cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
    DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
    cl::init(ExpandVariadicsMode::Unspecified),
    cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
                          "Use the implementation defaults"),
               clEnumValN(ExpandVariadicsMode::Disable, "disable",
                          "Disable the pass entirely"),
               clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
                          "Optimise without changing ABI"),
               clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
                          "Change variadic calling convention")));

bool commandLineOverride() {
  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
}

// Instances of this class encapsulate the target-dependant behaviour as a
// function of triple. Implementing a new ABI is adding a case to the switch
// in create(llvm::Triple) at the end of this file.
// This class may end up instantiated in TargetMachine instances, keeping it
// here for now until enough targets are implemented for the API to evolve.
class VariadicABIInfo {
protected:
  VariadicABIInfo() = default;

public:
  static std::unique_ptr<VariadicABIInfo> create(const Triple &T);

  // Allow overriding whether the pass runs on a per-target basis
  virtual bool enableForTarget() = 0;

  // Whether a valist instance is passed by value or by address
  // I.e. does it need to be alloca'ed and stored into, or can
  // it be passed directly in a SSA register
  virtual bool vaListPassedInSSARegister() = 0;

  // The type of a va_list iterator object
  virtual Type *vaListType(LLVMContext &Ctx) = 0;

  // The type of a va_list as a function argument as lowered by C
  virtual Type *vaListParameterType(Module &M) = 0;

  // Initialize an allocated va_list object to point to an already
  // initialized contiguous memory region.
  // Return the value to pass as the va_list argument
  virtual Value *initializeVaList(Module &M, LLVMContext &Ctx,
                                  IRBuilder<> &Builder, AllocaInst *VaList,
                                  Value *Buffer) = 0;

  struct VAArgSlotInfo {
    Align DataAlign; // With respect to the call frame
    bool Indirect;   // Passed via a pointer
  };
  virtual VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) = 0;

  // Targets implemented so far all have the same trivial lowering for these
  bool vaEndIsNop() { return true; }
  bool vaCopyIsMemcpy() { return true; }

  virtual ~VariadicABIInfo() = default;
};

class ExpandVariadics : public ModulePass {

  // The pass construction sets the default to optimize when called from middle
  // end and lowering when called from the backend. The command line variable
  // overrides that. This is useful for testing and debugging. It also allows
  // building an applications with variadic functions wholly removed if one
  // has sufficient control over the dependencies, e.g. a statically linked
  // clang that has no variadic function calls remaining in the binary.

public:
  static char ID;
  const ExpandVariadicsMode Mode;
  std::unique_ptr<VariadicABIInfo> ABI;

  ExpandVariadics(ExpandVariadicsMode Mode)
      : ModulePass(ID),
        Mode(commandLineOverride() ? ExpandVariadicsModeOption : Mode) {}

  StringRef getPassName() const override { return "Expand variadic functions"; }

  bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; }

  bool runOnModule(Module &M) override;

  bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);

  Function *replaceAllUsesWithNewDeclaration(Module &M,
                                             Function *OriginalFunction);

  Function *deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
                                        Function *OriginalFunction);

  Function *defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
                                  Function *VariadicWrapper,
                                  Function *FixedArityReplacement);

  bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
                  Function *NF);

  // The intrinsic functions va_copy and va_end are removed unconditionally.
  // They correspond to a memcpy and a no-op on all implemented targets.
  // The va_start intrinsic is removed from basic blocks that were not created
  // by this pass, some may remain if needed to maintain the external ABI.

  template <Intrinsic::ID ID, typename InstructionType>
  bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
                            PointerType *IntrinsicArgType) {
    bool Changed = false;
    const DataLayout &DL = M.getDataLayout();
    if (Function *Intrinsic =
            Intrinsic::getDeclarationIfExists(&M, ID, {IntrinsicArgType})) {
      for (User *U : make_early_inc_range(Intrinsic->users()))
        if (auto *I = dyn_cast<InstructionType>(U))
          Changed |= expandVAIntrinsicCall(Builder, DL, I);

      if (Intrinsic->use_empty())
        Intrinsic->eraseFromParent();
    }
    return Changed;
  }

  bool expandVAIntrinsicUsersWithAddrspace(Module &M, IRBuilder<> &Builder,
                                           unsigned Addrspace) {
    auto &Ctx = M.getContext();
    PointerType *IntrinsicArgType = PointerType::get(Ctx, Addrspace);
    bool Changed = false;

    // expand vastart before vacopy as vastart may introduce a vacopy
    Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(
        M, Builder, IntrinsicArgType);
    Changed |= expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(
        M, Builder, IntrinsicArgType);
    Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(
        M, Builder, IntrinsicArgType);
    return Changed;
  }

  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
                             VAStartInst *Inst);

  bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
                             VAEndInst *Inst);

  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
                             VACopyInst *Inst);

  FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
    // The type of "FTy" with the ... removed and a va_list appended
    SmallVector<Type *> ArgTypes(FTy->params());
    ArgTypes.push_back(ABI->vaListParameterType(M));
    return FunctionType::get(FTy->getReturnType(), ArgTypes,
                             /*IsVarArgs=*/false);
  }

  static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
                                   AllocaInst *Alloced) {
    std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL);
    uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0;
    return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
  }

  bool expansionApplicableToFunction(Module &M, Function *F) {
    if (F->isIntrinsic() || !F->isVarArg() ||
        F->hasFnAttribute(Attribute::Naked))
      return false;

    if (F->getCallingConv() != CallingConv::C)
      return false;

    if (rewriteABI())
      return true;

    if (!F->hasExactDefinition())
      return false;

    return true;
  }

  bool expansionApplicableToFunctionCall(CallBase *CB) {
    if (CallInst *CI = dyn_cast<CallInst>(CB)) {
      if (CI->isMustTailCall()) {
        // Cannot expand musttail calls
        return false;
      }

      if (CI->getCallingConv() != CallingConv::C)
        return false;

      return true;
    }

    if (isa<InvokeInst>(CB)) {
      // Invoke not implemented in initial implementation of pass
      return false;
    }

    // Other unimplemented derivative of CallBase
    return false;
  }

  class ExpandedCallFrame {
    // Helper for constructing an alloca instance containing the arguments bound
    // to the variadic ... parameter, rearranged to allow indexing through a
    // va_list iterator
    enum { N = 4 };
    SmallVector<Type *, N> FieldTypes;
    enum Tag { Store, Memcpy, Padding };
    SmallVector<std::tuple<Value *, uint64_t, Tag>, N> Source;

    template <Tag tag> void append(Type *FieldType, Value *V, uint64_t Bytes) {
      FieldTypes.push_back(FieldType);
      Source.push_back({V, Bytes, tag});
    }

  public:
    void store(LLVMContext &Ctx, Type *T, Value *V) { append<Store>(T, V, 0); }

    void memcpy(LLVMContext &Ctx, Type *T, Value *V, uint64_t Bytes) {
      append<Memcpy>(T, V, Bytes);
    }

    void padding(LLVMContext &Ctx, uint64_t By) {
      append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr, 0);
    }

    size_t size() const { return FieldTypes.size(); }
    bool empty() const { return FieldTypes.empty(); }

    StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
      const bool IsPacked = true;
      return StructType::create(Ctx, FieldTypes,
                                (Twine(Name) + ".vararg").str(), IsPacked);
    }

    void initializeStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
                                AllocaInst *Alloced) {

      StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());

      for (size_t I = 0; I < size(); I++) {

        auto [V, bytes, tag] = Source[I];

        if (tag == Padding) {
          assert(V == nullptr);
          continue;
        }

        auto Dst = Builder.CreateStructGEP(VarargsTy, Alloced, I);

        assert(V != nullptr);

        if (tag == Store)
          Builder.CreateStore(V, Dst);

        if (tag == Memcpy)
          Builder.CreateMemCpy(Dst, {}, V, {}, bytes);
      }
    }
  };
};

bool ExpandVariadics::runOnModule(Module &M) {
  bool Changed = false;
  if (Mode == ExpandVariadicsMode::Disable)
    return Changed;

  Triple TT(M.getTargetTriple());
  ABI = VariadicABIInfo::create(TT);
  if (!ABI)
    return Changed;

  if (!ABI->enableForTarget())
    return Changed;

  auto &Ctx = M.getContext();
  const DataLayout &DL = M.getDataLayout();
  IRBuilder<> Builder(Ctx);

  // Lowering needs to run on all functions exactly once.
  // Optimize could run on functions containing va_start exactly once.
  for (Function &F : make_early_inc_range(M))
    Changed |= runOnFunction(M, Builder, &F);

  // After runOnFunction, all known calls to known variadic functions have been
  // replaced. va_start intrinsics are presently (and invalidly!) only present
  // in functions that used to be variadic and have now been replaced to take a
  // va_list instead. If lowering as opposed to optimising, calls to unknown
  // variadic functions have also been replaced.

  {
    // 0 and AllocaAddrSpace are sufficient for the targets implemented so far
    unsigned Addrspace = 0;
    Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);

    Addrspace = DL.getAllocaAddrSpace();
    if (Addrspace != 0)
      Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);
  }

  if (Mode != ExpandVariadicsMode::Lowering)
    return Changed;

  for (Function &F : make_early_inc_range(M)) {
    if (F.isDeclaration())
      continue;

    // Now need to track down indirect calls. Can't find those
    // by walking uses of variadic functions, need to crawl the instruction
    // stream. Fortunately this is only necessary for the ABI rewrite case.
    for (BasicBlock &BB : F) {
      for (Instruction &I : make_early_inc_range(BB)) {
        if (CallBase *CB = dyn_cast<CallBase>(&I)) {
          if (CB->isIndirectCall()) {
            FunctionType *FTy = CB->getFunctionType();
            if (FTy->isVarArg())
              Changed |= expandCall(M, Builder, CB, FTy, 0);
          }
        }
      }
    }
  }

  return Changed;
}

bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
                                    Function *OriginalFunction) {
  bool Changed = false;

  if (!expansionApplicableToFunction(M, OriginalFunction))
    return Changed;

  [[maybe_unused]] const bool OriginalFunctionIsDeclaration =
      OriginalFunction->isDeclaration();
  assert(rewriteABI() || !OriginalFunctionIsDeclaration);

  // Declare a new function and redirect every use to that new function
  Function *VariadicWrapper =
      replaceAllUsesWithNewDeclaration(M, OriginalFunction);
  assert(VariadicWrapper->isDeclaration());
  assert(OriginalFunction->use_empty());

  // Create a new function taking va_list containing the implementation of the
  // original
  Function *FixedArityReplacement =
      deriveFixedArityReplacement(M, Builder, OriginalFunction);
  assert(OriginalFunction->isDeclaration());
  assert(FixedArityReplacement->isDeclaration() ==
         OriginalFunctionIsDeclaration);
  assert(VariadicWrapper->isDeclaration());

  // Create a single block forwarding wrapper that turns a ... into a va_list
  [[maybe_unused]] Function *VariadicWrapperDefine =
      defineVariadicWrapper(M, Builder, VariadicWrapper, FixedArityReplacement);
  assert(VariadicWrapperDefine == VariadicWrapper);
  assert(!VariadicWrapper->isDeclaration());

  // We now have:
  // 1. the original function, now as a declaration with no uses
  // 2. a variadic function that unconditionally calls a fixed arity replacement
  // 3. a fixed arity function equivalent to the original function

  // Replace known calls to the variadic with calls to the va_list equivalent
  for (User *U : make_early_inc_range(VariadicWrapper->users())) {
    if (CallBase *CB = dyn_cast<CallBase>(U)) {
      Value *CalledOperand = CB->getCalledOperand();
      if (VariadicWrapper == CalledOperand)
        Changed |=
            expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(),
                       FixedArityReplacement);
    }
  }

  // The original function will be erased.
  // One of the two new functions will become a replacement for the original.
  // When preserving the ABI, the other is an internal implementation detail.
  // When rewriting the ABI, RAUW then the variadic one.
  Function *const ExternallyAccessible =
      rewriteABI() ? FixedArityReplacement : VariadicWrapper;
  Function *const InternalOnly =
      rewriteABI() ? VariadicWrapper : FixedArityReplacement;

  // The external function is the replacement for the original
  ExternallyAccessible->setLinkage(OriginalFunction->getLinkage());
  ExternallyAccessible->setVisibility(OriginalFunction->getVisibility());
  ExternallyAccessible->setComdat(OriginalFunction->getComdat());
  ExternallyAccessible->takeName(OriginalFunction);

  // Annotate the internal one as internal
  InternalOnly->setVisibility(GlobalValue::DefaultVisibility);
  InternalOnly->setLinkage(GlobalValue::InternalLinkage);

  // The original is unused and obsolete
  OriginalFunction->eraseFromParent();

  InternalOnly->removeDeadConstantUsers();

  if (rewriteABI()) {
    // All known calls to the function have been removed by expandCall
    // Resolve everything else by replaceAllUsesWith
    VariadicWrapper->replaceAllUsesWith(FixedArityReplacement);
    VariadicWrapper->eraseFromParent();
  }

  return Changed;
}

Function *
ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M,
                                                  Function *OriginalFunction) {
  auto &Ctx = M.getContext();
  Function &F = *OriginalFunction;
  FunctionType *FTy = F.getFunctionType();
  Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace());

  NF->setName(F.getName() + ".varargs");

  F.getParent()->getFunctionList().insert(F.getIterator(), NF);

  AttrBuilder ParamAttrs(Ctx);
  AttributeList Attrs = NF->getAttributes();
  Attrs = Attrs.addParamAttributes(Ctx, FTy->getNumParams(), ParamAttrs);
  NF->setAttributes(Attrs);

  OriginalFunction->replaceAllUsesWith(NF);
  return NF;
}

Function *
ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
                                             Function *OriginalFunction) {
  Function &F = *OriginalFunction;
  // The purpose here is split the variadic function F into two functions
  // One is a variadic function that bundles the passed argument into a va_list
  // and passes it to the second function. The second function does whatever
  // the original F does, except that it takes a va_list instead of the ...

  assert(expansionApplicableToFunction(M, &F));

  auto &Ctx = M.getContext();

  // Returned value isDeclaration() is equal to F.isDeclaration()
  // but that property is not invariant throughout this function
  const bool FunctionIsDefinition = !F.isDeclaration();

  FunctionType *FTy = F.getFunctionType();
  SmallVector<Type *> ArgTypes(FTy->params());
  ArgTypes.push_back(ABI->vaListParameterType(M));

  FunctionType *NFTy = inlinableVariadicFunctionType(M, FTy);
  Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());

  // Note - same attribute handling as DeadArgumentElimination
  NF->copyAttributesFrom(&F);
  NF->setComdat(F.getComdat());
  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
  NF->setName(F.getName() + ".valist");

  AttrBuilder ParamAttrs(Ctx);

  AttributeList Attrs = NF->getAttributes();
  Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
  NF->setAttributes(Attrs);

  // Splice the implementation into the new function with minimal changes
  if (FunctionIsDefinition) {
    NF->splice(NF->begin(), &F);

    auto NewArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(NewArg);
      NewArg->setName(Arg.getName()); // takeName without killing the old one
      ++NewArg;
    }
    NewArg->setName("varargs");
  }

  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
  F.getAllMetadata(MDs);
  for (auto [KindID, Node] : MDs)
    NF->addMetadata(KindID, *Node);
  F.clearMetadata();

  return NF;
}

Function *
ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
                                       Function *VariadicWrapper,
                                       Function *FixedArityReplacement) {
  auto &Ctx = Builder.getContext();
  const DataLayout &DL = M.getDataLayout();
  assert(VariadicWrapper->isDeclaration());
  Function &F = *VariadicWrapper;

  assert(F.isDeclaration());
  Type *VaListTy = ABI->vaListType(Ctx);

  auto *BB = BasicBlock::Create(Ctx, "entry", &F);
  Builder.SetInsertPoint(BB);

  AllocaInst *VaListInstance =
      Builder.CreateAlloca(VaListTy, nullptr, "va_start");

  Builder.CreateLifetimeStart(VaListInstance,
                              sizeOfAlloca(Ctx, DL, VaListInstance));

  Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
                          {VaListInstance});

  SmallVector<Value *> Args(llvm::make_pointer_range(F.args()));

  Type *ParameterType = ABI->vaListParameterType(M);
  if (ABI->vaListPassedInSSARegister())
    Args.push_back(Builder.CreateLoad(ParameterType, VaListInstance));
  else
    Args.push_back(Builder.CreateAddrSpaceCast(VaListInstance, ParameterType));

  CallInst *Result = Builder.CreateCall(FixedArityReplacement, Args);

  Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)},
                          {VaListInstance});
  Builder.CreateLifetimeEnd(VaListInstance,
                            sizeOfAlloca(Ctx, DL, VaListInstance));

  if (Result->getType()->isVoidTy())
    Builder.CreateRetVoid();
  else
    Builder.CreateRet(Result);

  return VariadicWrapper;
}

bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
                                 FunctionType *VarargFunctionType,
                                 Function *NF) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();

  if (!expansionApplicableToFunctionCall(CB)) {
    if (rewriteABI())
      report_fatal_error("Cannot lower callbase instruction");
    return Changed;
  }

  // This is tricky. The call instruction's function type might not match
  // the type of the caller. When optimising, can leave it unchanged.
  // Webassembly detects that inconsistency and repairs it.
  FunctionType *FuncType = CB->getFunctionType();
  if (FuncType != VarargFunctionType) {
    if (!rewriteABI())
      return Changed;
    FuncType = VarargFunctionType;
  }

  auto &Ctx = CB->getContext();

  Align MaxFieldAlign(1);

  // The strategy is to allocate a call frame containing the variadic
  // arguments laid out such that a target specific va_list can be initialized
  // with it, such that target specific va_arg instructions will correctly
  // iterate over it. This means getting the alignment right and sometimes
  // embedding a pointer to the value instead of embedding the value itself.

  Function *CBF = CB->getParent()->getParent();

  ExpandedCallFrame Frame;

  uint64_t CurrentOffset = 0;

  for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
    Value *ArgVal = CB->getArgOperand(I);
    const bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
    const bool IsByRef = CB->paramHasAttr(I, Attribute::ByRef);

    // The type of the value being passed, decoded from byval/byref metadata if
    // required
    Type *const UnderlyingType = IsByVal   ? CB->getParamByValType(I)
                                 : IsByRef ? CB->getParamByRefType(I)
                                           : ArgVal->getType();
    const uint64_t UnderlyingSize =
        DL.getTypeAllocSize(UnderlyingType).getFixedValue();

    // The type to be written into the call frame
    Type *FrameFieldType = UnderlyingType;

    // The value to copy from when initialising the frame alloca
    Value *SourceValue = ArgVal;

    VariadicABIInfo::VAArgSlotInfo SlotInfo = ABI->slotInfo(DL, UnderlyingType);

    if (SlotInfo.Indirect) {
      // The va_arg lowering loads through a pointer. Set up an alloca to aim
      // that pointer at.
      Builder.SetInsertPointPastAllocas(CBF);
      Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
      Value *CallerCopy =
          Builder.CreateAlloca(UnderlyingType, nullptr, "IndirectAlloca");

      Builder.SetInsertPoint(CB);
      if (IsByVal)
        Builder.CreateMemCpy(CallerCopy, {}, ArgVal, {}, UnderlyingSize);
      else
        Builder.CreateStore(ArgVal, CallerCopy);

      // Indirection now handled, pass the alloca ptr by value
      FrameFieldType = DL.getAllocaPtrType(Ctx);
      SourceValue = CallerCopy;
    }

    // Alignment of the value within the frame
    // This probably needs to be controllable as a function of type
    Align DataAlign = SlotInfo.DataAlign;

    MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);

    uint64_t DataAlignV = DataAlign.value();
    if (uint64_t Rem = CurrentOffset % DataAlignV) {
      // Inject explicit padding to deal with alignment requirements
      uint64_t Padding = DataAlignV - Rem;
      Frame.padding(Ctx, Padding);
      CurrentOffset += Padding;
    }

    if (SlotInfo.Indirect) {
      Frame.store(Ctx, FrameFieldType, SourceValue);
    } else {
      if (IsByVal)
        Frame.memcpy(Ctx, FrameFieldType, SourceValue, UnderlyingSize);
      else
        Frame.store(Ctx, FrameFieldType, SourceValue);
    }

    CurrentOffset += DL.getTypeAllocSize(FrameFieldType).getFixedValue();
  }

  if (Frame.empty()) {
    // Not passing any arguments, hopefully va_arg won't try to read any
    // Creating a single byte frame containing nothing to point the va_list
    // instance as that is less special-casey in the compiler and probably
    // easier to interpret in a debugger.
    Frame.padding(Ctx, 1);
  }

  StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());

  // The struct instance needs to be at least MaxFieldAlign for the alignment of
  // the fields to be correct at runtime. Use the native stack alignment instead
  // if that's greater as that tends to give better codegen.
  // This is an awkward way to guess whether there is a known stack alignment
  // without hitting an assert in DL.getStackAlignment, 1024 is an arbitrary
  // number likely to be greater than the natural stack alignment.
  Align AllocaAlign = MaxFieldAlign;
  if (MaybeAlign StackAlign = DL.getStackAlignment();
      StackAlign && *StackAlign > AllocaAlign)
    AllocaAlign = *StackAlign;

  // Put the alloca to hold the variadic args in the entry basic block.
  Builder.SetInsertPointPastAllocas(CBF);

  // SetCurrentDebugLocation when the builder SetInsertPoint method does not
  Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());

  // The awkward construction here is to set the alignment on the instance
  AllocaInst *Alloced = Builder.Insert(
      new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign),
      "vararg_buffer");
  Changed = true;
  assert(Alloced->getAllocatedType() == VarargsTy);

  // Initialize the fields in the struct
  Builder.SetInsertPoint(CB);
  Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
  Frame.initializeStructAlloca(DL, Builder, Alloced);

  const unsigned NumArgs = FuncType->getNumParams();
  SmallVector<Value *> Args(CB->arg_begin(), CB->arg_begin() + NumArgs);

  // Initialize a va_list pointing to that struct and pass it as the last
  // argument
  AllocaInst *VaList = nullptr;
  {
    if (!ABI->vaListPassedInSSARegister()) {
      Type *VaListTy = ABI->vaListType(Ctx);
      Builder.SetInsertPointPastAllocas(CBF);
      Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
      VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument");
      Builder.SetInsertPoint(CB);
      Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
    }
    Builder.SetInsertPoint(CB);
    Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced));
  }

  // Attributes excluding any on the vararg arguments
  AttributeList PAL = CB->getAttributes();
  if (!PAL.isEmpty()) {
    SmallVector<AttributeSet, 8> ArgAttrs;
    for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++)
      ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
    PAL =
        AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs);
  }

  SmallVector<OperandBundleDef, 1> OpBundles;
  CB->getOperandBundlesAsDefs(OpBundles);

  CallBase *NewCB = nullptr;

  if (CallInst *CI = dyn_cast<CallInst>(CB)) {
    Value *Dst = NF ? NF : CI->getCalledOperand();
    FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType);

    NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI->getIterator());

    CallInst::TailCallKind TCK = CI->getTailCallKind();
    assert(TCK != CallInst::TCK_MustTail);

    // Can't tail call a function that is being passed a pointer to an alloca
    if (TCK == CallInst::TCK_Tail)
      TCK = CallInst::TCK_None;
    CI->setTailCallKind(TCK);

  } else {
    llvm_unreachable("Unreachable when !expansionApplicableToFunctionCall()");
  }

  if (VaList)
    Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));

  Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));

  NewCB->setAttributes(PAL);
  NewCB->takeName(CB);
  NewCB->setCallingConv(CB->getCallingConv());
  NewCB->setDebugLoc(DebugLoc());

  // DeadArgElim and ArgPromotion copy exactly this metadata
  NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});

  CB->replaceAllUsesWith(NewCB);
  CB->eraseFromParent();
  return Changed;
}

bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
                                            const DataLayout &DL,
                                            VAStartInst *Inst) {
  // Only removing va_start instructions that are not in variadic functions.
  // Those would be rejected by the IR verifier before this pass.
  // After splicing basic blocks from a variadic function into a fixed arity
  // one the va_start that used to refer to the ... parameter still exist.
  // There are also variadic functions that this pass did not change and
  // va_start instances in the created single block wrapper functions.
  // Replace exactly the instances in non-variadic functions as those are
  // the ones to be fixed up to use the va_list passed as the final argument.

  Function *ContainingFunction = Inst->getFunction();
  if (ContainingFunction->isVarArg()) {
    return false;
  }

  // The last argument is a vaListParameterType, either a va_list
  // or a pointer to one depending on the target.
  bool PassedByValue = ABI->vaListPassedInSSARegister();
  Argument *PassedVaList =
      ContainingFunction->getArg(ContainingFunction->arg_size() - 1);

  // va_start takes a pointer to a va_list, e.g. one on the stack
  Value *VaStartArg = Inst->getArgList();

  Builder.SetInsertPoint(Inst);

  if (PassedByValue) {
    // The general thing to do is create an alloca, store the va_list argument
    // to it, then create a va_copy. When vaCopyIsMemcpy(), this optimises to a
    // store to the VaStartArg.
    assert(ABI->vaCopyIsMemcpy());
    Builder.CreateStore(PassedVaList, VaStartArg);
  } else {

    // Otherwise emit a vacopy to pick up target-specific handling if any
    auto &Ctx = Builder.getContext();

    Builder.CreateIntrinsic(Intrinsic::vacopy, {DL.getAllocaPtrType(Ctx)},
                            {VaStartArg, PassedVaList});
  }

  Inst->eraseFromParent();
  return true;
}

bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
                                            VAEndInst *Inst) {
  assert(ABI->vaEndIsNop());
  Inst->eraseFromParent();
  return true;
}

bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
                                            const DataLayout &DL,
                                            VACopyInst *Inst) {
  assert(ABI->vaCopyIsMemcpy());
  Builder.SetInsertPoint(Inst);

  auto &Ctx = Builder.getContext();
  Type *VaListTy = ABI->vaListType(Ctx);
  uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();

  Builder.CreateMemCpy(Inst->getDest(), {}, Inst->getSrc(), {},
                       Builder.getInt32(Size));

  Inst->eraseFromParent();
  return true;
}

struct Amdgpu final : public VariadicABIInfo {

  bool enableForTarget() override { return true; }

  bool vaListPassedInSSARegister() override { return true; }

  Type *vaListType(LLVMContext &Ctx) override {
    return PointerType::getUnqual(Ctx);
  }

  Type *vaListParameterType(Module &M) override {
    return PointerType::getUnqual(M.getContext());
  }

  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
                          AllocaInst * /*va_list*/, Value *Buffer) override {
    // Given Buffer, which is an AllocInst of vararg_buffer
    // need to return something usable as parameter type
    return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M));
  }

  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
    return {Align(4), false};
  }
};

struct NVPTX final : public VariadicABIInfo {

  bool enableForTarget() override { return true; }

  bool vaListPassedInSSARegister() override { return true; }

  Type *vaListType(LLVMContext &Ctx) override {
    return PointerType::getUnqual(Ctx);
  }

  Type *vaListParameterType(Module &M) override {
    return PointerType::getUnqual(M.getContext());
  }

  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
                          AllocaInst *, Value *Buffer) override {
    return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M));
  }

  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
    // NVPTX expects natural alignment in all cases. The variadic call ABI will
    // handle promoting types to their appropriate size and alignment.
    Align A = DL.getABITypeAlign(Parameter);
    return {A, false};
  }
};

struct Wasm final : public VariadicABIInfo {

  bool enableForTarget() override {
    // Currently wasm is only used for testing.
    return commandLineOverride();
  }

  bool vaListPassedInSSARegister() override { return true; }

  Type *vaListType(LLVMContext &Ctx) override {
    return PointerType::getUnqual(Ctx);
  }

  Type *vaListParameterType(Module &M) override {
    return PointerType::getUnqual(M.getContext());
  }

  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
                          AllocaInst * /*va_list*/, Value *Buffer) override {
    return Buffer;
  }

  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
    LLVMContext &Ctx = Parameter->getContext();
    const unsigned MinAlign = 4;
    Align A = DL.getABITypeAlign(Parameter);
    if (A < MinAlign)
      A = Align(MinAlign);

    if (auto *S = dyn_cast<StructType>(Parameter)) {
      if (S->getNumElements() > 1) {
        return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true};
      }
    }

    return {A, false};
  }
};

std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) {
  switch (T.getArch()) {
  case Triple::r600:
  case Triple::amdgcn: {
    return std::make_unique<Amdgpu>();
  }

  case Triple::wasm32: {
    return std::make_unique<Wasm>();
  }

  case Triple::nvptx:
  case Triple::nvptx64: {
    return std::make_unique<NVPTX>();
  }

  default:
    return {};
  }
}

} // namespace

char ExpandVariadics::ID = 0;

INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false,
                false)

ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode M) {
  return new ExpandVariadics(M);
}

PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
  return ExpandVariadics(Mode).runOnModule(M) ? PreservedAnalyses::none()
                                              : PreservedAnalyses::all();
}

ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode M) : Mode(M) {}
