summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/IPO/PartialInlining.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/IPO/PartialInlining.cpp')
-rw-r--r--llvm/lib/Transforms/IPO/PartialInlining.cpp1531
1 files changed, 1531 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
new file mode 100644
index 0000000000000..e193074884af6
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -0,0 +1,1531 @@
+//===- PartialInlining.cpp - Inline parts of functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs partial inlining, typically by inlining an if statement
+// that surrounds the body of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-inlining"
+
// Pass-level statistics, visible with -stats.
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks that may remain in the caller after
// partial inlining (the "inline region"). Default is 5.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

// Debug knob: extra cost added on top of the computed outlining overhead.
static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
+
+namespace {
+
// Describes a single-region outlining candidate found by computeOutliningInfo:
// a chain of guarding entry blocks that stays in the caller, plus the early
// return block, with everything else destined for the outlined function.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
+
+struct FunctionOutliningMultiRegionInfo {
+ FunctionOutliningMultiRegionInfo()
+ : ORI() {}
+
+ // Container for outline regions
+ struct OutlineRegionInfo {
+ OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
+ BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+ BasicBlock *ReturnBlock)
+ : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
+ ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
+ SmallVector<BasicBlock *, 8> Region;
+ BasicBlock *EntryBlock;
+ BasicBlock *ExitBlock;
+ BasicBlock *ReturnBlock;
+ };
+
+ SmallVector<OutlineRegionInfo, 4> ORI;
+};
+
// The shared implementation behind both the legacy and new pass-manager
// entry points. Holds analysis accessors and drives outlining + inlining.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  // Visit every non-declaration function in M once; returns true if any
  // change was made to the module.
  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    // function, we temporarily generate IR that does not properly
    // forward varargs to the outlined function. Calling InlineFunction
    // will update calls to the outlined functions to properly forward
    // the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
  };

private:
  // Running count of partial inlines performed so far, checked against
  // MaxNumPartialInlining by IsLimitReached().
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the outlining call BB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the module-wide -max-partial-inlining budget is exhausted.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Wrap U in a CallSite. U must be a CallInst or InvokeInst; anything
  // else is a caller bug (all uses of the cloned function are calls).
  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // Return an arbitrary call site of F (its first user). F must have at
  // least one user and every user must be a call/invoke.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  // Debug location and enclosing block of one (arbitrary) call site of F,
  // used for emitting optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};
+
+struct PartialInlinerLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PartialInlinerLegacyPass() : ModulePass(ID) {
+ initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ TargetTransformInfoWrapperPass *TTIWP =
+ &getAnalysis<TargetTransformInfoWrapperPass>();
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&ACT](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+
+ auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
+ return ACT->lookupAssumptionCache(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&TTIWP](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ };
+
+ return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
+ &GetTTI, NoneType::None, PSI)
+ .run(M);
+ }
+};
+
+} // end anonymous namespace
+
// Find single-entry/single-exit regions of F that profile data says are
// cold, and package them as multi-region outlining candidates. Returns
// null when no profitable cold region exists or no instrumentation
// profile is available.
std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
                                                    OptimizationRemarkEmitter &ORE) {
  BasicBlock *EntryBlock = &F->front();

  // Build local analyses; reuse a caller-provided BFI when available.
  DominatorTree DT(*F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      std::make_unique<FunctionOutliningMultiRegionInfo>();

  // A candidate region must have a unique entry: more than one block, and
  // the dominating front block has exactly one predecessor.
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
  };

  // Returns the unique block with an edge leaving the region, or null
  // (with a missed-optimization remark) if there is more than one.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  // Profile count of BB, treating "no count" as zero.
  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // the outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif
  // A region is only worth outlining if it removes at least this much
  // inline cost from the function (default 10%, see MinRegionSizeRatio).
  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;
  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBlock(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      // The candidate region is everything dominated by the cold successor.
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // NOTE(review): the assert below is disabled, so ReturnBlock can be
      // null when ExitBlock has multiple successors outside the region —
      // verify downstream consumers tolerate a null ReturnBlock.
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}
+
// Detect the single-region (early-return) outlining shape: a chain of
// conditional entry blocks where one branch goes to a return block and the
// other dominates the rest of the function. Returns null if F does not
// match the pattern.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Order the two successors as (return block, non-return block), or
  // (null, null) if neither returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk down from the entry, collecting guarding blocks until we reach a
  // conditional branch where one side is a return block.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Validate the candidate and record which entry blocks branch straight
  // to the return block (needed later to normalize the return block).
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
+
+// Check if there is PGO data or user annoated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->hasProfileData())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
// Estimate how often the outlining call block executes relative to the
// function entry, expressed as a branch probability (clamped to <= 1).
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  // With real profile data the BFI estimate is trusted as-is.
  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
+
// Decide whether the call site CS (which calls the cloned function) is
// profitable to partially inline, emitting an optimization remark either
// way. Returns true only when the inline cost is acceptable AND the
// savings of removing the call outweigh the weighted outlining overhead.
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Testing hook: bypass the cost model, only require viability.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  assert(Call && "invalid callsite for partial inline");
  InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
                                CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
                                RemarksEnabled ? &ORE : nullptr);

  // always-inline callees should be fully inlined by the regular inliner.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
+
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to esimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip free instructions.
+ switch (I.getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ case Instruction::PHI:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (I.isLifetimeStartOrEnd())
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ InlineCost += getCallsiteCost(*CI, DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ InlineCost += getCallsiteCost(*II, DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
+ int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
+ for (auto FuncBBPair : Cloner.OutlinedFunctions) {
+ Function *OutlinedFunc = FuncBBPair.first;
+ BasicBlock* OutliningCallBB = FuncBBPair.second;
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ for (BasicBlock &BB : *OutlinedFunc)
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -=
+ 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
+
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
+}
+
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> TempBFI;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
+ CurrentCallerBFI = TempBFI.get();
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
+ function_ref<AssumptionCache *(Function &)> LookupAC)
+ : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
+ ClonedOI = std::make_unique<FunctionOutliningInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
+ for (BasicBlock *BB : OI->Entries) {
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+ }
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningMultiRegionInfo *OI,
+ OptimizationRemarkEmitter &ORE,
+ function_ref<AssumptionCache *(Function &)> LookupAC)
+ : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
+ ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ // Go through all Outline Candidate Regions and update all BasicBlock
+ // information.
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ OI->ORI) {
+ SmallVector<BasicBlock *, 8> Region;
+ for (BasicBlock *BB : RegionInfo.Region) {
+ Region.push_back(cast<BasicBlock>(VMap[BB]));
+ }
+ BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
+ BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
+ BasicBlock *NewReturnBlock = nullptr;
+ if (RegionInfo.ReturnBlock)
+ NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
+ FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
+ Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
+ ClonedOMRI->ORI.push_back(MappedRegionInfo);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+
+ // Shouldn't need to normalize PHIs if we're not outlining non-early return
+ // blocks.
+ if (!ClonedOI)
+ return;
+
+ // Special hackery is needed with PHI nodes that have inputs from more than
+ // one extracted block. For simplicity, just split the PHIs into a two-level
+ // sequence of PHIs, some of which will go in the extracted region, and some
+ // of which will go outside.
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
+ // only split block when necessary:
+ PHINode *FirstPhi = getFirstPHI(PreReturn);
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
+ auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+ Value *CommonValue = PN->getIncomingValue(0);
+ if (all_of(PN->incoming_values(),
+ [&](Value *V) { return V == CommonValue; }))
+ return CommonValue;
+ return nullptr;
+ };
+
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
+
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
+
+ RetPhi->addIncoming(&*I, PreReturn);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
+ }
+
+ // After incoming values splitting, the old phi may become trivial.
+ // Keeping the trivial phi can introduce definition inside the outline
+ // region which is live-out, causing necessary overhead (load, store
+ // arg passing etc).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
+ ++I;
+ }
+ for (auto *DP : DeadPhis)
+ DP->eraseFromParent();
+
+ for (auto E : ClonedOI->ReturnBlockPreds) {
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+ }
+}
+
+bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
+
+ auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
+ int Cost = 0;
+ for (BasicBlock* BB : Region)
+ Cost += computeBBInlineCost(BB);
+ return Cost;
+ };
+
+ assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
+
+ if (ClonedOMRI->ORI.empty())
+ return false;
+
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+
+ SetVector<Value *> Inputs, Outputs, Sinks;
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ ClonedOMRI->ORI) {
+ int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
+
+ CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI,
+ LookupAC(*RegionInfo.EntryBlock->getParent()),
+ /* AllowVarargs */ false);
+
+ CE.findInputsOutputs(Inputs, Outputs, Sinks);
+
+#ifndef NDEBUG
+ if (TracePartialInlining) {
+ dbgs() << "inputs: " << Inputs.size() << "\n";
+ dbgs() << "outputs: " << Outputs.size() << "\n";
+ for (Value *value : Inputs)
+ dbgs() << "value used in func: " << *value << "\n";
+ for (Value *output : Outputs)
+ dbgs() << "instr used in func: " << *output << "\n";
+ }
+#endif
+ // Do not extract regions that have live exit variables.
+ if (Outputs.size() > 0 && !ForceLiveExit)
+ continue;
+
+ Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
+
+ if (OutlinedFunc) {
+ CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
+ BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
+ NumColdRegionsOutlined++;
+ OutlinedRegionCost += CurrentOutlinedRegionCost;
+
+ if (MarkOutlinedColdCC) {
+ OutlinedFunc->setCallingConv(CallingConv::Cold);
+ OCS.setCallingConv(CallingConv::Cold);
+ }
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &RegionInfo.Region.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", RegionInfo.Region.front());
+ });
+ }
+
+ return !OutlinedFunctions.empty();
+}
+
+Function *
+PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
+ // Returns true if the block is to be partial inlined into the caller
+ // (i.e. not to be extracted to the out of line function)
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
+ (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+ ClonedOI->Entries.end());
+ };
+
+ assert(ClonedOI && "Expecting OutlineInfo for single region outline");
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Gather up the blocks that we're going to extract.
+ std::vector<BasicBlock *> ToExtract;
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
+ OutlinedRegionCost +=
+ PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
+ ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function which may make the outlining
+ // overhead (the difference of the outlined function cost
+ // and OutliningRegionCost) look larger.
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+
+ // Extract the body of the if.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+ Function *OutlinedFunc =
+ CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
+ /* AllowVarargs */ true)
+ .extractCodeRegion(CEAC);
+
+ if (OutlinedFunc) {
+ BasicBlock *OutliningCallBB =
+ PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+ .getInstruction()
+ ->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &ToExtract.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", ToExtract.front());
+ });
+
+ return OutlinedFunc;
+}
+
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove each function that was speculatively created if there is no
+ // reference.
+ for (auto FuncBBPair : OutlinedFunctions) {
+ Function *Func = FuncBBPair.first;
+ Func->eraseFromParent();
+ }
+ }
+}
+
// Try to partially inline \p F into its callers. Returns {changed, outlined
// function}; the second element is only non-null for single-region outlining,
// where the newly outlined function may itself be a partial-inline candidate.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions may have uses we cannot rewrite to the clone.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // A cold entry means the call sites are cold too; not worth the effort.
  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  // Nothing to inline into without any call sites.
  if (F->users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        // Multi-region outlining never returns the outlined function: the
        // regions were cold, so there is no point re-queueing them.
        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  // i) can't find any cold regions to outline, or
  // ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}
+
// Inline the (partially outlined) clone into as many call sites as the cost
// model allows. Returns true if at least one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  // Scale the runtime overhead by how often the outlined call executes.
  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The FunctionCloner constructor redirected all users to the clone.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users: successful inlining removes the call site, which
  // mutates the clone's use list while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // NOTE(review): IsLimitReached() appears to enforce a global cap on the
    // number of partial inlines — confirm against its definition.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    // The original function keeps whatever profile count was not consumed
    // by the inlined call sites.
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}
+
+bool PartialInlinerImpl::run(Module &M) {
+ if (DisablePartialInlining)
+ return false;
+
+ std::vector<Function *> Worklist;
+ Worklist.reserve(M.size());
+ for (Function &F : M)
+ if (!F.use_empty() && !F.isDeclaration())
+ Worklist.push_back(&F);
+
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ Function *CurrFunc = Worklist.back();
+ Worklist.pop_back();
+
+ if (CurrFunc->use_empty())
+ continue;
+
+ bool Recursive = false;
+ for (User *U : CurrFunc->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent()->getParent() == CurrFunc) {
+ Recursive = true;
+ break;
+ }
+ if (Recursive)
+ continue;
+
+ std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
+ if (Result.second)
+ Worklist.push_back(Result.second);
+ Changed |= Result.first;
+ }
+
+ return Changed;
+}
+
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass and declare the analyses it depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

// Factory used by legacy pass-manager clients to create this pass.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
+
+PreservedAnalyses PartialInlinerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&FAM](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
+ return FAM.getCachedResult<AssumptionAnalysis>(F);
+ };
+
+ std::function<BlockFrequencyInfo &(Function &)> GetBFI =
+ [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
+ {GetBFI}, PSI)
+ .run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}