//===- AMDGPUSplitModule.cpp ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file Implements a module splitting algorithm designed to support the /// FullLTO --lto-partitions option for parallel codegen. /// /// The role of this module splitting pass is the same as /// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions /// across a set of N partitions to allow for parallel codegen. /// /// The similarities mostly end here, as this pass achieves load-balancing in a /// more elaborate fashion which is targeted towards AMDGPU modules. It can take /// advantage of the structure of AMDGPU modules (which are mostly /// self-contained) to allow for more efficient splitting without affecting /// codegen negatively, or causing innaccurate resource usage analysis. /// /// High-level pass overview: /// - SplitGraph & associated classes /// - Graph representation of the module and of the dependencies that /// matter for splitting. /// - RecursiveSearchSplitting /// - Core splitting algorithm. /// - SplitProposal /// - Represents a suggested solution for splitting the input module. These /// solutions can be scored to determine the best one when multiple /// solutions are available. /// - Driver/pass "run" function glues everything together. #include "AMDGPUSplitModule.h" #include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/Path.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include #include #include #include #include #ifndef NDEBUG #include "llvm/Support/LockFileManager.h" #endif #define DEBUG_TYPE "amdgpu-split-module" namespace llvm { namespace { static cl::opt MaxDepth( "amdgpu-module-splitting-max-depth", cl::desc( "maximum search depth. 0 forces a greedy approach. " "warning: the algorithm is up to O(2^N), where N is the max depth."), cl::init(8)); static cl::opt LargeFnFactor( "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden, cl::desc( "when max depth is reached and we can no longer branch out, this " "value determines if a function is worth merging into an already " "existing partition to reduce code duplication. This is a factor " "of the ideal partition size, e.g. 2.0 means we consider the " "function for merging if its cost (including its callees) is 2x the " "size of an ideal partition.")); static cl::opt LargeFnOverlapForMerge( "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden, cl::desc("when a function is considered for merging into a partition that " "already contains some of its callees, do the merge if at least " "n% of the code it can reach is already present inside the " "partition; e.g. 0.7 means only merge >70%")); static cl::opt NoExternalizeGlobals( "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, cl::desc("disables externalization of global variable with local linkage; " "may cause globals to be duplicated which increases binary size")); static cl::opt NoExternalizeOnAddrTaken( "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden, cl::desc( "disables externalization of functions whose addresses are taken")); static cl::opt ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg", cl::Hidden, cl::desc("output file to write out the dotgraph " "representation of the input module")); static cl::opt PartitionSummariesOutput( "amdgpu-module-splitting-print-partition-summaries", cl::Hidden, cl::desc("output file to write out a summary of " "the partitions created for each module")); #ifndef NDEBUG static cl::opt UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden, cl::desc("use a lock file so only one process in the system " "can run this pass at once. useful to avoid mangled " "debug output in multithreaded environments.")); static cl::opt DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search", cl::Hidden, cl::desc("print all proposals received and whether " "they were rejected or accepted")); #endif struct SplitModuleTimer : NamedRegionTimer { SplitModuleTimer(StringRef Name, StringRef Desc) : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting", TimePassesIsEnabled) {} }; //===----------------------------------------------------------------------===// // Utils //===----------------------------------------------------------------------===// using CostType = InstructionCost::CostType; using FunctionsCostMap = DenseMap; using GetTTIFn = function_ref; static constexpr unsigned InvalidPID = -1; /// \param Num numerator /// \param Dem denominator /// \returns a printable object to print (Num/Dem) using "%0.2f". static auto formatRatioOf(CostType Num, CostType Dem) { CostType DemOr1 = Dem ? Dem : 1; return format("%0.2f", (static_cast(Num) / DemOr1) * 100); } /// Checks whether a given function is non-copyable. /// /// Non-copyable functions cannot be cloned into multiple partitions, and only /// one copy of the function can be present across all partitions. /// /// Kernel functions and external functions fall into this category. If we were /// to clone them, we would end up with multiple symbol definitions and a very /// unhappy linker. static bool isNonCopyable(const Function &F) { return F.hasExternalLinkage() || !F.isDefinitionExact() || AMDGPU::isEntryFunctionCC(F.getCallingConv()); } /// If \p GV has local linkage, make it external + hidden. static void externalize(GlobalValue &GV) { if (GV.hasLocalLinkage()) { GV.setLinkage(GlobalValue::ExternalLinkage); GV.setVisibility(GlobalValue::HiddenVisibility); } // Unnamed entities must be named consistently between modules. setName will // give a distinct name to each such entity. if (!GV.hasName()) GV.setName("__llvmsplit_unnamed"); } /// Cost analysis function. Calculates the cost of each function in \p M /// /// \param GetTTI Abstract getter for TargetTransformInfo. /// \param M Module to analyze. /// \param CostMap[out] Resulting Function -> Cost map. /// \return The module's total cost. static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M, FunctionsCostMap &CostMap) { SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis"); LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n"); CostType ModuleCost = 0; [[maybe_unused]] CostType KernelCost = 0; for (auto &Fn : M) { if (Fn.isDeclaration()) continue; CostType FnCost = 0; const auto &TTI = GetTTI(Fn); for (const auto &BB : Fn) { for (const auto &I : BB) { auto Cost = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); assert(Cost != InstructionCost::getMax()); // Assume expensive if we can't tell the cost of an instruction. CostType CostVal = Cost.isValid() ? Cost.getValue() : (CostType)TargetTransformInfo::TCC_Expensive; assert((FnCost + CostVal) >= FnCost && "Overflow!"); FnCost += CostVal; } } assert(FnCost != 0); CostMap[&Fn] = FnCost; assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!"); ModuleCost += FnCost; if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) KernelCost += FnCost; } if (CostMap.empty()) return 0; assert(ModuleCost); LLVM_DEBUG({ const CostType FnCost = ModuleCost - KernelCost; dbgs() << " - total module cost is " << ModuleCost << ". kernels cost " << "" << KernelCost << " (" << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "% of the module), functions cost " << FnCost << " (" << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "% of the module)\n"; }); return ModuleCost; } /// \return true if \p F can be indirectly called static bool canBeIndirectlyCalled(const Function &F) { if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv())) return false; return !F.hasLocalLinkage() || F.hasAddressTaken(/*PutOffender=*/nullptr, /*IgnoreCallbackUses=*/false, /*IgnoreAssumeLikeCalls=*/true, /*IgnoreLLVMUsed=*/true, /*IgnoreARCAttachedCall=*/false, /*IgnoreCastedDirectCall=*/true); } //===----------------------------------------------------------------------===// // Graph-based Module Representation //===----------------------------------------------------------------------===// /// AMDGPUSplitModule's view of the source Module, as a graph of all components /// that can be split into different modules. /// /// The most trivial instance of this graph is just the CallGraph of the module, /// but it is not guaranteed that the graph is strictly equal to the CG. It /// currently always is but it's designed in a way that would eventually allow /// us to create abstract nodes, or nodes for different entities such as global /// variables or any other meaningful constraint we must consider. /// /// The graph is only mutable by this class, and is generally not modified /// after \ref SplitGraph::buildGraph runs. No consumers of the graph can /// mutate it. class SplitGraph { public: class Node; enum class EdgeKind : uint8_t { /// The nodes are related through a direct call. This is a "strong" edge as /// it means the Src will directly reference the Dst. DirectCall, /// The nodes are related through an indirect call. /// This is a "weaker" edge and is only considered when traversing the graph /// starting from a kernel. We need this edge for resource usage analysis. /// /// The reason why we have this edge in the first place is due to how /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call, /// the resource usage of the kernel containing the indirect call is the /// max resource usage of all functions that can be indirectly called. IndirectCall, }; /// An edge between two nodes. Edges are directional, and tagged with a /// "kind". struct Edge { Edge(Node *Src, Node *Dst, EdgeKind Kind) : Src(Src), Dst(Dst), Kind(Kind) {} Node *Src; ///< Source Node *Dst; ///< Destination EdgeKind Kind; }; using EdgesVec = SmallVector; using edges_iterator = EdgesVec::const_iterator; using nodes_iterator = const Node *const *; SplitGraph(const Module &M, const FunctionsCostMap &CostMap, CostType ModuleCost) : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {} void buildGraph(CallGraph &CG); #ifndef NDEBUG bool verifyGraph() const; #endif bool empty() const { return Nodes.empty(); } const iterator_range nodes() const { return {Nodes.begin(), Nodes.end()}; } const Node &getNode(unsigned ID) const { return *Nodes[ID]; } unsigned getNumNodes() const { return Nodes.size(); } BitVector createNodesBitVector() const { return BitVector(Nodes.size()); } const Module &getModule() const { return M; } CostType getModuleCost() const { return ModuleCost; } CostType getCost(const Function &F) const { return CostMap.at(&F); } /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node /// IDs). CostType calculateCost(const BitVector &BV) const; private: /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and /// updates \p Cache. Node &getNode(DenseMap &Cache, const GlobalValue &GV); // Create a new edge between two nodes and add it to both nodes. const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK); const Module &M; const FunctionsCostMap &CostMap; CostType ModuleCost; // Final list of nodes with stable ordering. SmallVector Nodes; SpecificBumpPtrAllocator NodesPool; // Edges are trivially destructible objects, so as a small optimization we // use a BumpPtrAllocator which avoids destructor calls but also makes // allocation faster. static_assert( std::is_trivially_destructible_v, "Edge must be trivially destructible to use the BumpPtrAllocator"); BumpPtrAllocator EdgesPool; }; /// Nodes in the SplitGraph contain both incoming, and outgoing edges. /// Incoming edges have this node as their Dst, and Outgoing ones have this node /// as their Src. /// /// Edge objects are shared by both nodes in Src/Dst. They provide immediate /// feedback on how two nodes are related, and in which direction they are /// related, which is valuable information to make splitting decisions. /// /// Nodes are fundamentally abstract, and any consumers of the graph should /// treat them as such. While a node will be a function most of the time, we /// could also create nodes for any other reason. In the future, we could have /// single nodes for multiple functions, or nodes for GVs, etc. class SplitGraph::Node { friend class SplitGraph; public: Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost, bool IsNonCopyable) : ID(ID), GV(GV), IndividualCost(IndividualCost), IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) { if (auto *Fn = dyn_cast(&GV)) IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv()); } /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of /// nodes in the graph. This ID can be used as an index in a BitVector. unsigned getID() const { return ID; } const Function &getFunction() const { return cast(GV); } /// \returns the cost to import this component into a given module, not /// accounting for any dependencies that may need to be imported as well. CostType getIndividualCost() const { return IndividualCost; } bool isNonCopyable() const { return IsNonCopyable; } bool isEntryFunctionCC() const { return IsEntryFnCC; } /// \returns whether this is an entry point in the graph. Entry points are /// defined as follows: if you take all entry points in the graph, and iterate /// their dependencies, you are guaranteed to visit all nodes in the graph at /// least once. bool isGraphEntryPoint() const { return IsGraphEntry; } StringRef getName() const { return GV.getName(); } bool hasAnyIncomingEdges() const { return IncomingEdges.size(); } bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const { return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; }); } bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); } bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const { return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; }); } iterator_range incoming_edges() const { return IncomingEdges; } iterator_range outgoing_edges() const { return OutgoingEdges; } bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); } /// Visit all children of this node in a recursive fashion. Also visits Self. /// If \ref shouldFollowIndirectCalls returns false, then this only follows /// DirectCall edges. /// /// \param Visitor Visitor Function. void visitAllDependencies(std::function Visitor) const; /// Adds the depedencies of this node in \p BV by setting the bit /// corresponding to each node. /// /// Implemented using \ref visitAllDependencies, hence it follows the same /// rules regarding dependencies traversal. /// /// \param[out] BV The bitvector where the bits should be set. void getDependencies(BitVector &BV) const { visitAllDependencies([&](const Node &N) { BV.set(N.getID()); }); } private: void markAsGraphEntry() { IsGraphEntry = true; } unsigned ID; const GlobalValue &GV; CostType IndividualCost; bool IsNonCopyable : 1; bool IsEntryFnCC : 1; bool IsGraphEntry : 1; // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped // together) EdgesVec IncomingEdges; EdgesVec OutgoingEdges; }; void SplitGraph::Node::visitAllDependencies( std::function Visitor) const { const bool FollowIndirect = shouldFollowIndirectCalls(); // FIXME: If this can access SplitGraph in the future, use a BitVector // instead. DenseSet Seen; SmallVector WorkList({this}); while (!WorkList.empty()) { const Node *CurN = WorkList.pop_back_val(); if (auto [It, Inserted] = Seen.insert(CurN); !Inserted) continue; Visitor(*CurN); for (const Edge *E : CurN->outgoing_edges()) { if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall) continue; WorkList.push_back(E->Dst); } } } /// Checks if \p I has MD_callees and if it does, parse it and put the function /// in \p Callees. /// /// \returns true if there was metadata and it was parsed correctly. false if /// there was no MD or if it contained unknown entries and parsing failed. /// If this returns false, \p Callees will contain incomplete information /// and must not be used. static bool handleCalleesMD(const Instruction &I, SetVector &Callees) { auto *MD = I.getMetadata(LLVMContext::MD_callees); if (!MD) return false; for (const auto &Op : MD->operands()) { Function *Callee = mdconst::extract_or_null(Op); if (!Callee) return false; Callees.insert(Callee); } return true; } void SplitGraph::buildGraph(CallGraph &CG) { SplitModuleTimer SMT("buildGraph", "graph construction"); LLVM_DEBUG( dbgs() << "[build graph] constructing graph representation of the input\n"); // FIXME(?): Is the callgraph really worth using if we have to iterate the // function again whenever it fails to give us enough information? // We build the graph by just iterating all functions in the module and // working on their direct callees. At the end, all nodes should be linked // together as expected. DenseMap Cache; SmallVector FnsWithIndirectCalls, IndirectlyCallableFns; for (const Function &Fn : M) { if (Fn.isDeclaration()) continue; // Look at direct callees and create the necessary edges in the graph. SetVector DirectCallees; bool CallsExternal = false; for (auto &CGEntry : *CG[&Fn]) { auto *CGNode = CGEntry.second; if (auto *Callee = CGNode->getFunction()) { if (!Callee->isDeclaration()) DirectCallees.insert(Callee); } else if (CGNode == CG.getCallsExternalNode()) CallsExternal = true; } // Keep track of this function if it contains an indirect call and/or if it // can be indirectly called. if (CallsExternal) { LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for "; Fn.printAsOperand(dbgs()); dbgs() << " - analyzing function\n"); SetVector KnownCallees; bool HasUnknownIndirectCall = false; for (const auto &Inst : instructions(Fn)) { // look at all calls without a direct callee. const auto *CB = dyn_cast(&Inst); if (!CB || CB->getCalledFunction()) continue; // inline assembly can be ignored, unless InlineAsmIsIndirectCall is // true. if (CB->isInlineAsm()) { LLVM_DEBUG(dbgs() << " found inline assembly\n"); continue; } if (handleCalleesMD(Inst, KnownCallees)) continue; // If we failed to parse any !callees MD, or some was missing, // the entire KnownCallees list is now unreliable. KnownCallees.clear(); // Everything else is handled conservatively. If we fall into the // conservative case don't bother analyzing further. HasUnknownIndirectCall = true; break; } if (HasUnknownIndirectCall) { LLVM_DEBUG(dbgs() << " indirect call found\n"); FnsWithIndirectCalls.push_back(&Fn); } else if (!KnownCallees.empty()) DirectCallees.insert_range(KnownCallees); } Node &N = getNode(Cache, Fn); for (const auto *Callee : DirectCallees) createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); if (canBeIndirectlyCalled(Fn)) IndirectlyCallableFns.push_back(&Fn); } // Post-process functions with indirect calls. for (const Function *Fn : FnsWithIndirectCalls) { for (const Function *Candidate : IndirectlyCallableFns) { Node &Src = getNode(Cache, *Fn); Node &Dst = getNode(Cache, *Candidate); createEdge(Src, Dst, EdgeKind::IndirectCall); } } // Now, find all entry points. SmallVector CandidateEntryPoints; BitVector NodesReachableByKernels = createNodesBitVector(); for (Node *N : Nodes) { // Functions with an Entry CC are always graph entry points too. if (N->isEntryFunctionCC()) { N->markAsGraphEntry(); N->getDependencies(NodesReachableByKernels); } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall)) CandidateEntryPoints.push_back(N); } for (Node *N : CandidateEntryPoints) { // This can be another entry point if it's not reachable by a kernel // TODO: We could sort all of the possible new entries in a stable order // (e.g. by cost), then consume them one by one until // NodesReachableByKernels is all 1s. It'd allow us to avoid // considering some nodes as non-entries in some specific cases. if (!NodesReachableByKernels.test(N->getID())) N->markAsGraphEntry(); } #ifndef NDEBUG assert(verifyGraph()); #endif } #ifndef NDEBUG bool SplitGraph::verifyGraph() const { unsigned ExpectedID = 0; // Exceptionally using a set here in case IDs are messed up. DenseSet SeenNodes; DenseSet SeenFunctionNodes; for (const Node *N : Nodes) { if (N->getID() != (ExpectedID++)) { errs() << "Node IDs are incorrect!\n"; return false; } if (!SeenNodes.insert(N).second) { errs() << "Node seen more than once!\n"; return false; } if (&getNode(N->getID()) != N) { errs() << "getNode doesn't return the right node\n"; return false; } for (const Edge *E : N->IncomingEdges) { if (!E->Src || !E->Dst || (E->Dst != N) || (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) { errs() << "ill-formed incoming edges\n"; return false; } } for (const Edge *E : N->OutgoingEdges) { if (!E->Src || !E->Dst || (E->Src != N) || (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) { errs() << "ill-formed outgoing edges\n"; return false; } } const Function &Fn = N->getFunction(); if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) { if (N->hasAnyIncomingEdges()) { errs() << "Kernels cannot have incoming edges\n"; return false; } } if (Fn.isDeclaration()) { errs() << "declarations shouldn't have nodes!\n"; return false; } auto [It, Inserted] = SeenFunctionNodes.insert(&Fn); if (!Inserted) { errs() << "one function has multiple nodes!\n"; return false; } } if (ExpectedID != Nodes.size()) { errs() << "Node IDs out of sync!\n"; return false; } if (createNodesBitVector().size() != getNumNodes()) { errs() << "nodes bit vector doesn't have the right size!\n"; return false; } // Check we respect the promise of Node::isKernel BitVector BV = createNodesBitVector(); for (const Node *N : nodes()) { if (N->isGraphEntryPoint()) N->getDependencies(BV); } // Ensure each function in the module has an associated node. for (const auto &Fn : M) { if (!Fn.isDeclaration()) { if (!SeenFunctionNodes.contains(&Fn)) { errs() << "Fn has no associated node in the graph!\n"; return false; } } } if (!BV.all()) { errs() << "not all nodes are reachable through the graph's entry points!\n"; return false; } return true; } #endif CostType SplitGraph::calculateCost(const BitVector &BV) const { CostType Cost = 0; for (unsigned NodeID : BV.set_bits()) Cost += getNode(NodeID).getIndividualCost(); return Cost; } SplitGraph::Node & SplitGraph::getNode(DenseMap &Cache, const GlobalValue &GV) { auto &N = Cache[&GV]; if (N) return *N; CostType Cost = 0; bool NonCopyable = false; if (const Function *Fn = dyn_cast(&GV)) { NonCopyable = isNonCopyable(*Fn); Cost = CostMap.at(Fn); } N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable); Nodes.push_back(N); assert(&getNode(N->getID()) == N); return *N; } const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst, EdgeKind EK) { const Edge *E = new (EdgesPool.Allocate(1)) Edge(&Src, &Dst, EK); Src.OutgoingEdges.push_back(E); Dst.IncomingEdges.push_back(E); return *E; } //===----------------------------------------------------------------------===// // Split Proposals //===----------------------------------------------------------------------===// /// Represents a module splitting proposal. /// /// Proposals are made of N BitVectors, one for each partition, where each bit /// set indicates that the node is present and should be copied inside that /// partition. /// /// Proposals have several metrics attached so they can be compared/sorted, /// which the driver to try multiple strategies resultings in multiple proposals /// and choose the best one out of them. class SplitProposal { public: SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) { Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()}); } void setName(StringRef NewName) { Name = NewName; } StringRef getName() const { return Name; } const BitVector &operator[](unsigned PID) const { return Partitions[PID].second; } void add(unsigned PID, const BitVector &BV) { Partitions[PID].second |= BV; updateScore(PID); } void print(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const { print(dbgs()); } // Find the cheapest partition (lowest cost). In case of ties, always returns // the highest partition number. unsigned findCheapestPartition() const; /// Calculate the CodeSize and Bottleneck scores. void calculateScores(); #ifndef NDEBUG void verifyCompleteness() const; #endif /// Only available after \ref calculateScores is called. /// /// A positive number indicating the % of code duplication that this proposal /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by /// duplicating some functions across partitions. /// /// Value is always rounded up to 3 decimal places. /// /// A perfect score would be 0.0, and anything approaching 1.0 is very bad. double getCodeSizeScore() const { return CodeSizeScore; } /// Only available after \ref calculateScores is called. /// /// A number between [0, 1] which indicates how big of a bottleneck is /// expected from the largest partition. /// /// A score of 1.0 means the biggest partition is as big as the source module, /// so build time will be equal to or greater than the build time of the /// initial input. /// /// Value is always rounded up to 3 decimal places. /// /// This is one of the metrics used to estimate this proposal's build time. double getBottleneckScore() const { return BottleneckScore; } private: void updateScore(unsigned PID) { assert(SG); for (auto &[PCost, Nodes] : Partitions) { TotalCost -= PCost; PCost = SG->calculateCost(Nodes); TotalCost += PCost; } } /// \see getCodeSizeScore double CodeSizeScore = 0.0; /// \see getBottleneckScore double BottleneckScore = 0.0; /// Aggregated cost of all partitions CostType TotalCost = 0; const SplitGraph *SG = nullptr; std::string Name; std::vector> Partitions; }; void SplitProposal::print(raw_ostream &OS) const { assert(SG); OS << "[proposal] " << Name << ", total cost:" << TotalCost << ", code size score:" << format("%0.3f", CodeSizeScore) << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n'; for (const auto &[PID, Part] : enumerate(Partitions)) { const auto &[Cost, NodeIDs] = Part; OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n"; } } unsigned SplitProposal::findCheapestPartition() const { assert(!Partitions.empty()); CostType CurCost = std::numeric_limits::max(); unsigned CurPID = InvalidPID; for (const auto &[Idx, Part] : enumerate(Partitions)) { if (Part.first <= CurCost) { CurPID = Idx; CurCost = Part.first; } } assert(CurPID != InvalidPID); return CurPID; } void SplitProposal::calculateScores() { if (Partitions.empty()) return; assert(SG); CostType LargestPCost = 0; for (auto &[PCost, Nodes] : Partitions) { if (PCost > LargestPCost) LargestPCost = PCost; } CostType ModuleCost = SG->getModuleCost(); CodeSizeScore = double(TotalCost) / ModuleCost; assert(CodeSizeScore >= 0.0); BottleneckScore = double(LargestPCost) / ModuleCost; CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0; BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0; } #ifndef NDEBUG void SplitProposal::verifyCompleteness() const { if (Partitions.empty()) return; BitVector Result = Partitions[0].second; for (const auto &P : drop_begin(Partitions)) Result |= P.second; assert(Result.all() && "some nodes are missing from this proposal!"); } #endif //===-- RecursiveSearchStrategy -------------------------------------------===// /// Partitioning algorithm. /// /// This is a recursive search algorithm that can explore multiple possiblities. /// /// When a cluster of nodes can go into more than one partition, and we haven't /// reached maximum search depth, we recurse and explore both options and their /// consequences. Both branches will yield a proposal, and the driver will grade /// both and choose the best one. /// /// If max depth is reached, we will use some heuristics to make a choice. Most /// of the time we will just use the least-pressured (cheapest) partition, but /// if a cluster is particularly big and there is a good amount of overlap with /// an existing partition, we will choose that partition instead. class RecursiveSearchSplitting { public: using SubmitProposalFn = function_ref; RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal); void run(); private: struct WorkListEntry { WorkListEntry(const BitVector &BV) : Cluster(BV) {} unsigned NumNonEntryNodes = 0; CostType TotalCost = 0; CostType CostExcludingGraphEntryPoints = 0; BitVector Cluster; }; /// Collects all graph entry points's clusters and sort them so the most /// expensive clusters are viewed first. This will merge clusters together if /// they share a non-copyable dependency. void setupWorkList(); /// Recursive function that assigns the worklist item at \p Idx into a /// partition of \p SP. /// /// \p Depth is the current search depth. When this value is equal to /// \ref MaxDepth, we can no longer recurse. /// /// This function only recurses if there is more than one possible assignment, /// otherwise it is iterative to avoid creating a call stack that is as big as /// \ref WorkList. void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP); /// \return A pair: first element is the PID of the partition that has the /// most similarities with \p Entry, or \ref InvalidPID if no partition was /// found with at least one element in common. The second element is the /// aggregated cost of all dependencies in common between \p Entry and that /// partition. std::pair findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP); const SplitGraph &SG; unsigned NumParts; SubmitProposalFn SubmitProposal; // A Cluster is considered large when its cost, excluding entry points, // exceeds this value. CostType LargeClusterThreshold = 0; unsigned NumProposalsSubmitted = 0; SmallVector WorkList; }; RecursiveSearchSplitting::RecursiveSearchSplitting( const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal) : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) { // arbitrary max value as a safeguard. Anything above 10 will already be // slow, this is just a max value to prevent extreme resource exhaustion or // unbounded run time. if (MaxDepth > 16) report_fatal_error("[amdgpu-split-module] search depth of " + Twine(MaxDepth) + " is too high!"); LargeClusterThreshold = (LargeFnFactor != 0.0) ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor)) : std::numeric_limits::max(); LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at " << LargeClusterThreshold << "\n"); } void RecursiveSearchSplitting::run() { { SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist"); setupWorkList(); } { SplitModuleTimer SMT("recursive_search_pick", "partitioning"); SplitProposal SP(SG, NumParts); pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP); } } void RecursiveSearchSplitting::setupWorkList() { // e.g. if A and B are two worklist item, and they both call a non copyable // dependency C, this does: // A=C // B=C // => NodeEC will create a single group (A, B, C) and we create a new // WorkList entry for that group. EquivalenceClasses NodeEC; for (const SplitGraph::Node *N : SG.nodes()) { if (!N->isGraphEntryPoint()) continue; NodeEC.insert(N->getID()); N->visitAllDependencies([&](const SplitGraph::Node &Dep) { if (&Dep != N && Dep.isNonCopyable()) NodeEC.unionSets(N->getID(), Dep.getID()); }); } for (const auto &Node : NodeEC) { if (!Node->isLeader()) continue; BitVector Cluster = SG.createNodesBitVector(); for (unsigned M : NodeEC.members(*Node)) { const SplitGraph::Node &N = SG.getNode(M); if (N.isGraphEntryPoint()) N.getDependencies(Cluster); } WorkList.emplace_back(std::move(Cluster)); } // Calculate costs and other useful information. for (WorkListEntry &Entry : WorkList) { for (unsigned NodeID : Entry.Cluster.set_bits()) { const SplitGraph::Node &N = SG.getNode(NodeID); const CostType Cost = N.getIndividualCost(); Entry.TotalCost += Cost; if (!N.isGraphEntryPoint()) { Entry.CostExcludingGraphEntryPoints += Cost; ++Entry.NumNonEntryNodes; } } } stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) { if (A.TotalCost != B.TotalCost) return A.TotalCost > B.TotalCost; if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints) return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints; if (A.NumNonEntryNodes != B.NumNonEntryNodes) return A.NumNonEntryNodes > B.NumNonEntryNodes; return A.Cluster.count() > B.Cluster.count(); }); LLVM_DEBUG({ dbgs() << "[recursive search] worklist:\n"; for (const auto &[Idx, Entry] : enumerate(WorkList)) { dbgs() << " - [" << Idx << "]: "; for (unsigned NodeID : Entry.Cluster.set_bits()) dbgs() << NodeID << " "; dbgs() << "(total_cost:" << Entry.TotalCost << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints << ")\n"; } }); } void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP) { while (Idx < WorkList.size()) { // Step 1: Determine candidate PIDs. // const WorkListEntry &Entry = WorkList[Idx]; const BitVector &Cluster = Entry.Cluster; // Default option is to do load-balancing, AKA assign to least pressured // partition. const unsigned CheapestPID = SP.findCheapestPartition(); assert(CheapestPID != InvalidPID); // Explore assigning to the kernel that contains the most dependencies in // common. const auto [MostSimilarPID, SimilarDepsCost] = findMostSimilarPartition(Entry, SP); // We can chose to explore only one path if we only have one valid path, or // if we reached maximum search depth and can no longer branch out. unsigned SinglePIDToTry = InvalidPID; if (MostSimilarPID == InvalidPID) // no similar PID found SinglePIDToTry = CheapestPID; else if (MostSimilarPID == CheapestPID) // both landed on the same PID SinglePIDToTry = CheapestPID; else if (Depth >= MaxDepth) { // We have to choose one path. Use a heuristic to guess which one will be // more appropriate. if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) { // Check if the amount of code in common makes it worth it. assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints); const double Ratio = static_cast(SimilarDepsCost) / Entry.CostExcludingGraphEntryPoints; assert(Ratio >= 0.0 && Ratio <= 1.0); if (Ratio > LargeFnOverlapForMerge) { // For debug, just print "L", so we'll see "L3=P3" for instance, which // will mean we reached max depth and chose P3 based on this // heuristic. LLVM_DEBUG(dbgs() << 'L'); SinglePIDToTry = MostSimilarPID; } } else SinglePIDToTry = CheapestPID; } // Step 2: Explore candidates. // When we only explore one possible path, and thus branch depth doesn't // increase, do not recurse, iterate instead. if (SinglePIDToTry != InvalidPID) { LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' '); // Only one path to explore, don't clone SP, don't increase depth. SP.add(SinglePIDToTry, Cluster); ++Idx; continue; } assert(MostSimilarPID != InvalidPID); // We explore multiple paths: recurse at increased depth, then stop this // function. LLVM_DEBUG(dbgs() << '\n'); // lb = load balancing = put in cheapest partition { SplitProposal BranchSP = SP; LLVM_DEBUG(dbgs().indent(Depth) << " [lb] " << Idx << "=P" << CheapestPID << "? "); BranchSP.add(CheapestPID, Cluster); pickPartition(Depth + 1, Idx + 1, BranchSP); } // ms = most similar = put in partition with the most in common { SplitProposal BranchSP = SP; LLVM_DEBUG(dbgs().indent(Depth) << " [ms] " << Idx << "=P" << MostSimilarPID << "? "); BranchSP.add(MostSimilarPID, Cluster); pickPartition(Depth + 1, Idx + 1, BranchSP); } return; } // Step 3: If we assigned all WorkList items, submit the proposal. assert(Idx == WorkList.size()); assert(NumProposalsSubmitted <= (2u << MaxDepth) && "Search got out of bounds?"); SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" + std::to_string(NumProposalsSubmitted++)); LLVM_DEBUG(dbgs() << '\n'); SubmitProposal(SP); } std::pair RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP) { if (!Entry.NumNonEntryNodes) return {InvalidPID, 0}; // We take the partition that is the most similar using Cost as a metric. // So we take the set of nodes in common, compute their aggregated cost, and // pick the partition with the highest cost in common. unsigned ChosenPID = InvalidPID; CostType ChosenCost = 0; for (unsigned PID = 0; PID < NumParts; ++PID) { BitVector BV = SP[PID]; BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?! if (BV.none()) continue; const CostType Cost = SG.calculateCost(BV); if (ChosenPID == InvalidPID || ChosenCost < Cost || (ChosenCost == Cost && PID > ChosenPID)) { ChosenPID = PID; ChosenCost = Cost; } } return {ChosenPID, ChosenCost}; } //===----------------------------------------------------------------------===// // DOTGraph Printing Support //===----------------------------------------------------------------------===// const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { return E->Dst; } using SplitGraphEdgeDstIterator = mapped_iterator; } // namespace template <> struct GraphTraits { using NodeRef = const SplitGraph::Node *; using nodes_iterator = SplitGraph::nodes_iterator; using ChildIteratorType = SplitGraphEdgeDstIterator; using EdgeRef = const SplitGraph::Edge *; using ChildEdgeIteratorType = SplitGraph::edges_iterator; static NodeRef getEntryNode(NodeRef N) { return N; } static ChildIteratorType child_begin(NodeRef Ref) { return {Ref->outgoing_edges().begin(), mapEdgeToDst}; } static ChildIteratorType child_end(NodeRef Ref) { return {Ref->outgoing_edges().end(), mapEdgeToDst}; } static nodes_iterator nodes_begin(const SplitGraph &G) { return G.nodes().begin(); } static nodes_iterator nodes_end(const SplitGraph &G) { return G.nodes().end(); } }; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} static std::string getGraphName(const SplitGraph &SG) { return SG.getModule().getName().str(); } std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) { return N->getName().str(); } static std::string getNodeDescription(const SplitGraph::Node *N, const SplitGraph &SG) { std::string Result; if (N->isEntryFunctionCC()) Result += "entry-fn-cc "; if (N->isNonCopyable()) Result += "non-copyable "; Result += "cost:" + std::to_string(N->getIndividualCost()); return Result; } static std::string getNodeAttributes(const SplitGraph::Node *N, const SplitGraph &SG) { return N->hasAnyIncomingEdges() ? "" : "color=\"red\""; } static std::string getEdgeAttributes(const SplitGraph::Node *N, SplitGraphEdgeDstIterator EI, const SplitGraph &SG) { switch ((*EI.getCurrent())->Kind) { case SplitGraph::EdgeKind::DirectCall: return ""; case SplitGraph::EdgeKind::IndirectCall: return "style=\"dashed\""; } llvm_unreachable("Unknown SplitGraph::EdgeKind enum"); } }; //===----------------------------------------------------------------------===// // Driver //===----------------------------------------------------------------------===// namespace { // If we didn't externalize GVs, then local GVs need to be conservatively // imported into every module (including their initializers), and then cleaned // up afterwards. static bool needsConservativeImport(const GlobalValue *GV) { if (const auto *Var = dyn_cast(GV)) return Var->hasLocalLinkage(); return isa(GV); } /// Prints a summary of the partition \p N, represented by module \p M, to \p /// OS. static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M, unsigned PartCost, unsigned ModuleCost) { OS << "*** Partition P" << N << " ***\n"; for (const auto &Fn : M) { if (!Fn.isDeclaration()) OS << " - [function] " << Fn.getName() << "\n"; } for (const auto &GV : M.globals()) { if (GV.hasInitializer()) OS << " - [global] " << GV.getName() << "\n"; } OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost) << "% of the source\n"; } static void evaluateProposal(SplitProposal &Best, SplitProposal New) { SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm"); LLVM_DEBUG({ New.verifyCompleteness(); if (DebugProposalSearch) New.print(dbgs()); }); const double CurBScore = Best.getBottleneckScore(); const double CurCSScore = Best.getCodeSizeScore(); const double NewBScore = New.getBottleneckScore(); const double NewCSScore = New.getCodeSizeScore(); // TODO: Improve this // We can probably lower the precision of the comparison at first // e.g. if we have // - (Current): BScore: 0.489 CSCore 1.105 // - (New): BScore: 0.475 CSCore 1.305 // Currently we'd choose the new one because the bottleneck score is // lower, but the new one duplicates more code. It may be worth it to // discard the new proposal as the impact on build time is negligible. // Compare them bool IsBest = false; if (NewBScore < CurBScore) IsBest = true; else if (NewBScore == CurBScore) IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker. if (IsBest) Best = std::move(New); LLVM_DEBUG(if (DebugProposalSearch) { if (IsBest) dbgs() << "[search] new best proposal!\n"; else dbgs() << "[search] discarding - not profitable\n"; }); } /// Trivial helper to create an identical copy of \p M. static std::unique_ptr cloneAll(const Module &M) { ValueToValueMapTy VMap; return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; }); } /// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested. static void writeDOTGraph(const SplitGraph &SG) { if (ModuleDotCfgOutput.empty()) return; std::error_code EC; raw_fd_ostream OS(ModuleDotCfgOutput, EC); if (EC) { errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput << "' - DOTGraph will not be printed\n"; } WriteGraph(OS, SG, /*ShortName=*/false, /*Title=*/SG.getModule().getName()); } static void splitAMDGPUModule( GetTTIFn GetTTI, Module &M, unsigned NumParts, function_ref MPart)> ModuleCallback) { CallGraph CG(M); // Externalize functions whose address are taken. // // This is needed because partitioning is purely based on calls, but sometimes // a kernel/function may just look at the address of another local function // and not do anything (no calls). After partitioning, that local function may // end up in a different module (so it's just a declaration in the module // where its address is taken), which emits a "undefined hidden symbol" linker // error. // // Additionally, it guides partitioning to not duplicate this function if it's // called directly at some point. // // TODO: Could we be smarter about this ? This makes all functions whose // addresses are taken non-copyable. We should probably model this type of // constraint in the graph and use it to guide splitting, instead of // externalizing like this. Maybe non-copyable should really mean "keep one // visible copy, then internalize all other copies" for some functions? if (!NoExternalizeOnAddrTaken) { for (auto &Fn : M) { // TODO: Should aliases count? Probably not but they're so rare I'm not // sure it's worth fixing. if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) { LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs()); dbgs() << " because its address is taken\n"); externalize(Fn); } } } // Externalize local GVs, which avoids duplicating their initializers, which // in turns helps keep code size in check. if (!NoExternalizeGlobals) { for (auto &GV : M.globals()) { if (GV.hasLocalLinkage()) LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n'); externalize(GV); } } // Start by calculating the cost of every function in the module, as well as // the module's overall cost. FunctionsCostMap FnCosts; const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts); // Build the SplitGraph, which represents the module's functions and models // their dependencies accurately. SplitGraph SG(M, FnCosts, ModuleCost); SG.buildGraph(CG); if (SG.empty()) { LLVM_DEBUG( dbgs() << "[!] no nodes in graph, input is empty - no splitting possible\n"); ModuleCallback(cloneAll(M)); return; } LLVM_DEBUG({ dbgs() << "[graph] nodes:\n"; for (const SplitGraph::Node *N : SG.nodes()) { dbgs() << " - [" << N->getID() << "]: " << N->getName() << " " << (N->isGraphEntryPoint() ? "(entry)" : "") << " " << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n"; } }); writeDOTGraph(SG); LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n"); std::optional Proposal; const auto EvaluateProposal = [&](SplitProposal SP) { SP.calculateScores(); if (!Proposal) Proposal = std::move(SP); else evaluateProposal(*Proposal, std::move(SP)); }; // TODO: It would be very easy to create new strategies by just adding a base // class to RecursiveSearchSplitting and abstracting it away. RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run(); LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: " << Proposal->getName() << "\n";); if (!Proposal) { LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n"); ModuleCallback(cloneAll(M)); return; } LLVM_DEBUG(Proposal->print(dbgs());); std::optional SummariesOS; if (!PartitionSummariesOutput.empty()) { std::error_code EC; SummariesOS.emplace(PartitionSummariesOutput, EC); if (EC) errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput << "' - Partition summaries will not be printed\n"; } // One module will import all GlobalValues that are not Functions // and are not subject to conservative import. bool ImportAllGVs = true; for (unsigned PID = 0; PID < NumParts; ++PID) { SplitModuleTimer SMT2("modules_creation", "creating modules for each partition"); LLVM_DEBUG(dbgs() << "[split] creating new modules\n"); DenseSet FnsInPart; for (unsigned NodeID : (*Proposal)[PID].set_bits()) FnsInPart.insert(&SG.getNode(NodeID).getFunction()); // Don't create empty modules. if (FnsInPart.empty()) { LLVM_DEBUG(dbgs() << "[split] P" << PID << " is empty, not creating module\n"); continue; } ValueToValueMapTy VMap; CostType PartCost = 0; std::unique_ptr MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. if (const auto *Fn = dyn_cast(GV)) { if (FnsInPart.contains(Fn)) { PartCost += SG.getCost(*Fn); return true; } return false; } // Everything else goes in the first non-empty module we create. return ImportAllGVs || needsConservativeImport(GV); })); ImportAllGVs = false; // FIXME: Aliases aren't seen often, and their handling isn't perfect so // bugs are possible. // Clean-up conservatively imported GVs without any users. for (auto &GV : make_early_inc_range(MPart->global_values())) { if (needsConservativeImport(&GV) && GV.use_empty()) GV.eraseFromParent(); } if (SummariesOS) printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost); LLVM_DEBUG( printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost)); ModuleCallback(std::move(MPart)); } } } // namespace PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, ModuleAnalysisManager &MAM) { SplitModuleTimer SMT( "total", "total pass runtime (incl. potentially waiting for lockfile)"); FunctionAnalysisManager &FAM = MAM.getResult(M).getManager(); const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { return FAM.getResult(F); }; bool Done = false; #ifndef NDEBUG if (UseLockFile) { SmallString<128> LockFilePath; sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath); sys::path::append(LockFilePath, "amdgpu-split-module-debug"); LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath << "'\n"); while (true) { llvm::LockFileManager Lock(LockFilePath.str()); bool Owned; if (Error Err = Lock.tryLock().moveInto(Owned)) { consumeError(std::move(Err)); LLVM_DEBUG( dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug " "output may be mangled by other processes\n"); } else if (!Owned) { switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) { case WaitForUnlockResult::Success: break; case WaitForUnlockResult::OwnerDied: continue; // try again to get the lock. case WaitForUnlockResult::Timeout: LLVM_DEBUG( dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug " "output may be mangled by other processes\n"); Lock.unsafeMaybeUnlock(); break; // give up } } splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); Done = true; break; } } #endif if (!Done) splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); // We can change linkage/visibilities in the input, consider that nothing is // preserved just to be safe. This pass runs last anyway. return PreservedAnalyses::none(); } } // namespace llvm