Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
20 files changed, 1029 insertions, 260 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 30a1f81ad0e1..6730824e860a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, if (isNoModRef(MRI)) continue; + // A pseudo probe call shouldn't change any function attribute since it + // doesn't translate to a real instruction. It comes with a memory access + // tag to prevent itself being removed by optimizations and not block + // other instructions being optimized. + if (isa<PseudoProbeInst>(I)) + continue; + if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) { // The call could access any memory. If that includes writes, note it. if (isModSet(MRI)) @@ -1445,8 +1452,7 @@ static bool functionWillReturn(const Function &F) { // If there are no loops, then the function is willreturn if all calls in // it are willreturn. return all_of(instructions(F), [](const Instruction &I) { - const auto *CB = dyn_cast<CallBase>(&I); - return !CB || CB->hasFnAttr(Attribute::WillReturn); + return I.willReturn(); }); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 37fc27e91100..158fa0771c3b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -30,7 +30,7 @@ namespace llvm { ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, StringRef CalleeName) { if (CalleeName.empty()) - return getChildContext(CallSite); + return getHottestChildContext(CallSite); uint32_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); @@ -40,18 +40,22 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, } ContextTrieNode * -ContextTrieNode::getChildContext(const LineLocation &CallSite) { +ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { // CSFDO-TODO: This could be slow, change AllChildContext so we can // do point look up for child node by call site alone. 
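The getHottestChildContext change above replaces the old ambiguity check (bail out when more than one child matches the call site) with a scan that keeps the child whose attached profile has the largest total sample count. A minimal standalone sketch of that selection, using simplified stand-in types rather than the real ContextTrieNode/FunctionSamples classes:

    #include <cstdint>
    #include <map>

    // Stand-ins for FunctionSamples and ContextTrieNode, for illustration only.
    struct Samples { uint64_t TotalSamples; };
    struct Node {
      unsigned CallSiteLoc;   // stand-in for LineLocation
      Samples *FuncSamples;   // may be null when no profile is attached
    };

    // Return the child at CallSite whose profile has the largest total sample
    // count, or nullptr if no child at that call site carries a profile.
    Node *getHottestChild(std::map<uint32_t, Node> &Children, unsigned CallSite) {
      Node *Hottest = nullptr;
      uint64_t MaxSamples = 0;
      for (auto &It : Children) {
        Node &Child = It.second;
        if (Child.CallSiteLoc != CallSite || !Child.FuncSamples)
          continue;
        if (Child.FuncSamples->TotalSamples > MaxSamples) {
          Hottest = &Child;
          MaxSamples = Child.FuncSamples->TotalSamples;
        }
      }
      return Hottest;
    }
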
- // CSFDO-TODO: Return the child with max count for indirect call + // Retrieve the child node with max count for indirect call ContextTrieNode *ChildNodeRet = nullptr; + uint64_t MaxCalleeSamples = 0; for (auto &It : AllChildContext) { ContextTrieNode &ChildNode = It.second; - if (ChildNode.CallSiteLoc == CallSite) { - if (ChildNodeRet) - return nullptr; - else - ChildNodeRet = &ChildNode; + if (ChildNode.CallSiteLoc != CallSite) + continue; + FunctionSamples *Samples = ChildNode.getFunctionSamples(); + if (!Samples) + continue; + if (Samples->getTotalSamples() > MaxCalleeSamples) { + ChildNodeRet = &ChildNode; + MaxCalleeSamples = Samples->getTotalSamples(); } } @@ -179,7 +183,7 @@ SampleContextTracker::SampleContextTracker( SampleContext Context(FuncSample.first(), RawContext); LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n"); if (!Context.isBaseContext()) - FuncToCtxtProfileSet[Context.getName()].insert(FSamples); + FuncToCtxtProfileSet[Context.getNameWithoutContext()].insert(FSamples); ContextTrieNode *NewNode = getOrCreateContextPath(Context, true); assert(!NewNode->getFunctionSamples() && "New node can't have sample profile"); @@ -191,12 +195,12 @@ FunctionSamples * SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n"); - // CSFDO-TODO: We use CalleeName to differentiate indirect call - // We need to get sample for indirect callee too. DILocation *DIL = Inst.getDebugLoc(); if (!DIL) return nullptr; + // For indirect call, CalleeName will be empty, in which case the context + // profile for callee with largest total samples will be returned. ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName); if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); @@ -209,6 +213,26 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, return nullptr; } +std::vector<const FunctionSamples *> +SampleContextTracker::getIndirectCalleeContextSamplesFor( + const DILocation *DIL) { + std::vector<const FunctionSamples *> R; + if (!DIL) + return R; + + ContextTrieNode *CallerNode = getContextFor(DIL); + LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + if (ChildNode.getCallSiteLoc() != CallSite) + continue; + if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples()) + R.push_back(CalleeSamples); + } + + return R; +} + FunctionSamples * SampleContextTracker::getContextSamplesFor(const DILocation *DIL) { assert(DIL && "Expect non-null location"); @@ -239,6 +263,17 @@ SampleContextTracker::getContextSamplesFor(const SampleContext &Context) { return Node->getFunctionSamples(); } +SampleContextTracker::ContextSamplesTy & +SampleContextTracker::getAllContextSamplesFor(const Function &Func) { + StringRef CanonName = FunctionSamples::getCanonicalFnName(Func); + return FuncToCtxtProfileSet[CanonName]; +} + +SampleContextTracker::ContextSamplesTy & +SampleContextTracker::getAllContextSamplesFor(StringRef Name) { + return FuncToCtxtProfileSet[Name]; +} + FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func, bool MergeContext) { StringRef CanonName = FunctionSamples::getCanonicalFnName(Func); @@ -295,11 +330,6 @@ void SampleContextTracker::promoteMergeContextSamplesTree( const Instruction &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Promoting and merging 
context tree for instr: \n" << Inst << "\n"); - // CSFDO-TODO: We also need to promote context profile from indirect - // calls. We won't have callee names from those from call instr. - if (CalleeName.empty()) - return; - // Get the caller context for the call instruction, we don't use callee // name from call because there can be context from indirect calls too. DILocation *DIL = Inst.getDebugLoc(); @@ -308,8 +338,23 @@ void SampleContextTracker::promoteMergeContextSamplesTree( return; // Get the context that needs to be promoted - LineLocation CallSite(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()); + LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + // For indirect call, CalleeName will be empty, in which case we need to + // promote all non-inlined child context profiles. + if (CalleeName.empty()) { + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode *NodeToPromo = &It.second; + if (CallSite != NodeToPromo->getCallSiteLoc()) + continue; + FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples(); + if (FromSamples && FromSamples->getContext().hasState(InlinedContext)) + continue; + promoteMergeContextSamplesTree(*NodeToPromo); + } + return; + } + + // Get the context for the given callee that needs to be promoted ContextTrieNode *NodeToPromo = CallerNode->getChildContext(CallSite, CalleeName); if (!NodeToPromo) @@ -329,6 +374,8 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( LLVM_DEBUG(dbgs() << " Found context tree root to promote: " << FromSamples->getContext() << "\n"); + assert(!FromSamples->getContext().hasState(InlinedContext) && + "Shouldn't promote inlined context profile"); StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext(); return promoteMergeContextSamplesTree(NodeToPromo, RootContext, ContextStrToRemove); @@ -361,18 +408,14 @@ SampleContextTracker::getCalleeContextFor(const DILocation *DIL, StringRef CalleeName) { assert(DIL && "Expect non-null location"); - // CSSPGO-TODO: need to support indirect callee - if (CalleeName.empty()) - return nullptr; - ContextTrieNode *CallContext = getContextFor(DIL); if (!CallContext) return nullptr; + // When CalleeName is empty, the child context profile with max + // total samples will be returned. return CallContext->getChildContext( - LineLocation(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()), - CalleeName); + FunctionSamples::getCallSiteIdentifier(DIL), CalleeName); } ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) { @@ -386,8 +429,8 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) { if (Name.empty()) Name = PrevDIL->getScope()->getSubprogram()->getName(); S.push_back( - std::make_pair(LineLocation(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()), Name)); + std::make_pair(FunctionSamples::getCallSiteIdentifier(DIL), + PrevDIL->getScope()->getSubprogram()->getLinkageName())); PrevDIL = DIL; } @@ -518,4 +561,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( return *ToNode; } +// Replace call graph edges with dynamic call edges from the profile. +void SampleContextTracker::addCallGraphEdges(CallGraph &CG, + StringMap<Function *> &SymbolMap) { + // Add profile call edges to the call graph. 
+ std::queue<ContextTrieNode *> NodeQueue; + NodeQueue.push(&RootContext); + while (!NodeQueue.empty()) { + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + Function *F = SymbolMap.lookup(Node->getFuncName()); + for (auto &I : Node->getAllChildContext()) { + ContextTrieNode *ChildNode = &I.second; + NodeQueue.push(ChildNode); + if (F && !F->isDeclaration()) { + Function *Callee = SymbolMap.lookup(ChildNode->getFuncName()); + if (Callee && !Callee->isDeclaration()) + CG[F]->addCalledFunction(nullptr, CG[Callee]); + } + } + } +} } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 264ac4065e8c..a6a419bfe742 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -107,6 +108,16 @@ STATISTIC(NumCSNotInlined, STATISTIC(NumMismatchedProfile, "Number of functions with CFG mismatched profile"); STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile"); +STATISTIC(NumDuplicatedInlinesite, + "Number of inlined callsites with a partial distribution factor"); + +STATISTIC(NumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(NumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + NumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); // Command line option to specify the file to read samples from. This is // mainly used for debugging. @@ -166,11 +177,53 @@ static cl::opt<bool> ProfileTopDownLoad( "order of call graph during sample profile loading. It only " "works for new pass manager. ")); +static cl::opt<bool> UseProfileIndirectCallEdges( + "use-profile-indirect-call-edges", cl::init(true), cl::Hidden, + cl::desc("Considering indirect call samples from profile when top-down " + "processing functions. 
Only CSSPGO is supported.")); + +static cl::opt<bool> UseProfileTopDownOrder( + "use-profile-top-down-order", cl::init(false), cl::Hidden, + cl::desc("Process functions in one SCC in a top-down order " + "based on the input profile.")); + static cl::opt<bool> ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size.")); +static cl::opt<int> ProfileInlineGrowthLimit( + "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), + cl::desc("The size growth ratio limit for proirity-based sample profile " + "loader inlining.")); + +static cl::opt<int> ProfileInlineLimitMin( + "sample-profile-inline-limit-min", cl::Hidden, cl::init(100), + cl::desc("The lower bound of size growth limit for " + "proirity-based sample profile loader inlining.")); + +static cl::opt<int> ProfileInlineLimitMax( + "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), + cl::desc("The upper bound of size growth limit for " + "proirity-based sample profile loader inlining.")); + +static cl::opt<int> ProfileICPThreshold( + "sample-profile-icp-threshold", cl::Hidden, cl::init(5), + cl::desc( + "Relative hotness threshold for indirect " + "call promotion in proirity-based sample profile loader inlining.")); + +static cl::opt<int> SampleHotCallSiteThreshold( + "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), + cl::desc("Hot callsite threshold for proirity-based sample profile loader " + "inlining.")); + +static cl::opt<bool> CallsitePrioritizedInline( + "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Use call site prioritized inlining for sample profile loader." + "Currently only CSSPGO is supported.")); + static cl::opt<int> SampleColdCallSiteThreshold( "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites")); @@ -313,6 +366,38 @@ private: DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; }; +// Inline candidate used by iterative callsite prioritized inliner +struct InlineCandidate { + CallBase *CallInstr; + const FunctionSamples *CalleeSamples; + // Prorated callsite count, which will be used to guide inlining. For example, + // if a callsite is duplicated in LTO prelink, then in LTO postlink the two + // copies will get their own distribution factors and their prorated counts + // will be used to decide if they should be inlined independently. + uint64_t CallsiteCount; + // Call site distribution factor to prorate the profile samples for a + // duplicated callsite. Default value is 1.0. + float CallsiteDistribution; +}; + +// Inline candidate comparer using call site weight +struct CandidateComparer { + bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using CandidateQueue = + PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, + CandidateComparer>; + /// Sample profile pass. 
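The CandidateComparer above makes the CandidateQueue a max-heap keyed on the prorated callsite count, with the callee profile GUID as a deterministic tie-breaker. A small illustration of the same ordering with std::priority_queue and a simplified candidate type (the GUID field stands in for CalleeSamples->getGUID()):

    #include <cstdint>
    #include <queue>
    #include <vector>

    struct Candidate {
      uint64_t CallsiteCount; // prorated callsite count
      uint64_t GUID;          // stand-in for the callee profile GUID
    };

    struct Comparer {
      bool operator()(const Candidate &L, const Candidate &R) const {
        if (L.CallsiteCount != R.CallsiteCount)
          return L.CallsiteCount < R.CallsiteCount; // hotter candidate pops first
        return L.GUID < R.GUID; // stable order among equally hot candidates
      }
    };

    int main() {
      std::priority_queue<Candidate, std::vector<Candidate>, Comparer> Queue;
      Queue.push({100, 7});
      Queue.push({500, 3});
      Queue.push({100, 9});
      // Pop order: {500, 3}, then {100, 9}, then {100, 7}.
      return 0;
    }
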
/// /// This pass reads profile data from the file specified by @@ -350,9 +435,21 @@ protected: findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineCallInstruction(CallBase &CB); + // Attempt to promote indirect call and also inline the promoted call + bool tryPromoteAndInlineCandidate( + Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, + uint64_t &Sum, DenseSet<Instruction *> &PromotedInsns, + SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + InlineCost shouldInlineCandidate(InlineCandidate &Candidate); + bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); + bool + tryInlineCandidate(InlineCandidate &Candidate, + SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); + bool + inlineHotFunctionsWithPriority(Function &F, + DenseSet<GlobalValue::GUID> &InlinedGUIDs); // Inline cold/small functions in addition to hot ones bool shouldInlineColdCallee(CallBase &CallInst); void emitOptimizationRemarksForInlineCandidates( @@ -371,6 +468,8 @@ protected: uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); + void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples); + void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); @@ -808,7 +907,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) { const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0); if (R) { - uint64_t Samples = R.get(); + uint64_t Samples = R.get() * Probe->Factor; bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples); if (FirstMark) { ORE->emit([&]() { @@ -816,13 +915,17 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) { Remark << "Applied " << ore::NV("NumSamples", Samples); Remark << " samples from profile (ProbeId="; Remark << ore::NV("ProbeId", Probe->Id); + Remark << ", Factor="; + Remark << ore::NV("Factor", Probe->Factor); + Remark << ", OriginalSamples="; + Remark << ore::NV("OriginalSamples", R.get()); Remark << ")"; return Remark; }); } - LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst - << " - weight: " << R.get() << ")\n"); + << " - weight: " << R.get() << " - factor: " + << format("%0.2f", Probe->Factor) << ")\n"); return Samples; } return R; @@ -918,6 +1021,31 @@ SampleProfileLoader::findIndirectCallFunctionSamples( return R; } + auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { + assert(L && R && "Expect non-null FunctionSamples"); + if (L->getEntrySamples() != R->getEntrySamples()) + return L->getEntrySamples() > R->getEntrySamples(); + return FunctionSamples::getGUID(L->getName()) < + FunctionSamples::getGUID(R->getName()); + }; + + if (ProfileIsCS) { + auto CalleeSamples = + ContextTracker->getIndirectCalleeContextSamplesFor(DIL); + if (CalleeSamples.empty()) + return R; + + // For CSSPGO, we only use target context profile's entry count + // as that already includes both inlined callee and non-inlined ones.. 
+ Sum = 0; + for (const auto *const FS : CalleeSamples) { + Sum += FS->getEntrySamples(); + R.push_back(FS); + } + llvm::sort(R, FSCompare); + return R; + } + const FunctionSamples *FS = findFunctionSamples(Inst); if (FS == nullptr) return R; @@ -935,12 +1063,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples( Sum += NameFS.second.getEntrySamples(); R.push_back(&NameFS.second); } - llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) { - if (L->getEntrySamples() != R->getEntrySamples()) - return L->getEntrySamples() > R->getEntrySamples(); - return FunctionSamples::getGUID(L->getName()) < - FunctionSamples::getGUID(R->getName()); - }); + llvm::sort(R, FSCompare); } return R; } @@ -977,42 +1100,64 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { return it.first->second; } -bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) { - if (ExternalInlineAdvisor) { - auto Advice = ExternalInlineAdvisor->getAdvice(CB); - if (!Advice->isInliningRecommended()) { - Advice->recordUnattemptedInlining(); - return false; +/// Attempt to promote indirect call and also inline the promoted call. +/// +/// \param F Caller function. +/// \param Candidate ICP and inline candidate. +/// \param Sum Sum of target counts for indirect call. +/// \param PromotedInsns Map to keep track of indirect call already processed. +/// \param Candidate ICP and inline candidate. +/// \param InlinedCallSite Output vector for new call sites exposed after +/// inlining. +bool SampleProfileLoader::tryPromoteAndInlineCandidate( + Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, + DenseSet<Instruction *> &PromotedInsns, + SmallVector<CallBase *, 8> *InlinedCallSite) { + const char *Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. + // If it is a recursive call, we do not inline it as it could bloat + // the code exponentially. There is way to better handle this, e.g. + // clone the caller first, and inline the cloned caller if it is + // recursive. As llvm does not inline recursive calls, we will + // simply ignore it instead of handling it explicitly. + auto R = SymbolMap.find(Candidate.CalleeSamples->getFuncName()); + if (R != SymbolMap.end() && R->getValue() && + !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && + R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && + isLegalToPromote(*Candidate.CallInstr, R->getValue(), &Reason)) { + auto *DI = + &pgo::promoteIndirectCall(*Candidate.CallInstr, R->getValue(), + Candidate.CallsiteCount, Sum, false, ORE); + if (DI) { + Sum -= Candidate.CallsiteCount; + // Prorate the indirect callsite distribution. + // Do not update the promoted direct callsite distribution at this + // point since the original distribution combined with the callee + // profile will be used to prorate callsites from the callee if + // inlined. Once not inlined, the direct callsite distribution should + // be prorated so that the it will reflect the real callsite counts. + setProbeDistributionFactor(*Candidate.CallInstr, + Candidate.CallsiteDistribution * Sum / + SumOrigin); + PromotedInsns.insert(Candidate.CallInstr); + Candidate.CallInstr = DI; + if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) { + bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite); + if (!Inlined) { + // Prorate the direct callsite distribution so that it reflects real + // callsite counts. 
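The proration described in the comment above amounts to splitting the callsite's current distribution factor between the residual indirect call and the newly promoted direct call, in proportion to the remaining and promoted counts; the setProbeDistributionFactor calls that follow in the hunk apply those values. A sketch of the arithmetic with names of my own choosing:

    #include <cstdint>

    struct ProratedFactors {
      float IndirectCall; // factor left on the residual indirect callsite
      float DirectCall;   // factor for the promoted call if it is not inlined
    };

    // Factor: the callsite's current distribution factor.
    // Count: samples attributed to the promoted target.
    // Sum: remaining indirect-call samples after subtracting Count.
    // SumOrigin: the original total before promotion.
    ProratedFactors prorate(float Factor, uint64_t Count, uint64_t Sum,
                            uint64_t SumOrigin) {
      return {Factor * Sum / SumOrigin, Factor * Count / SumOrigin};
    }

    // Example: Factor = 1.0 and SumOrigin = 1000; promoting a target with
    // Count = 600 leaves Sum = 400, so the indirect callsite keeps a factor
    // of 0.4 and the promoted direct call gets 0.6 if it ends up not inlined.
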
+ setProbeDistributionFactor(*DI, Candidate.CallsiteDistribution * + Candidate.CallsiteCount / + SumOrigin); + } + return Inlined; + } } - // Dummy record, we don't use it for replay. - Advice->recordInlining(); - } - - Function *CalledFunction = CB.getCalledFunction(); - assert(CalledFunction); - DebugLoc DLoc = CB.getDebugLoc(); - BasicBlock *BB = CB.getParent(); - InlineParams Params = getInlineParams(); - Params.ComputeFullInlineCost = true; - // Checks if there is anything in the reachable portion of the callee at - // this callsite that makes this inlining potentially illegal. Need to - // set ComputeFullInlineCost, otherwise getInlineCost may return early - // when cost exceeds threshold without checking all IRs in the callee. - // The acutal cost does not matter because we only checks isNever() to - // see if it is legal to inline the callsite. - InlineCost Cost = - getInlineCost(CB, Params, GetTTI(*CalledFunction), GetAC, GetTLI); - if (Cost.isNever()) { - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) - << "incompatible inlining"); - return false; - } - InlineFunctionInfo IFI(nullptr, GetAC); - if (InlineFunction(CB, IFI).isSuccess()) { - // The call to InlineFunction erases I, so we can't pass it here. - emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, - true, CSINLINE_DEBUG); - return true; + } else { + LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to " + << Candidate.CalleeSamples->getFuncName() << " because " + << Reason << "\n"); } return false; } @@ -1078,10 +1223,11 @@ bool SampleProfileLoader::inlineHotFunctions( "ProfAccForSymsInList should be false when profile-sample-accurate " "is enabled"); - DenseMap<CallBase *, const FunctionSamples *> localNotInlinedCallSites; + DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; bool Changed = false; - while (true) { - bool LocalChanged = false; + bool LocalChanged = true; + while (LocalChanged) { + LocalChanged = false; SmallVector<CallBase *, 10> CIS; for (auto &BB : F) { bool Hot = false; @@ -1095,7 +1241,7 @@ bool SampleProfileLoader::inlineHotFunctions( "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); if (FS->getEntrySamples() > 0 || ProfileIsCS) - localNotInlinedCallSites.try_emplace(CB, FS); + LocalNotInlinedCallSites.try_emplace(CB, FS); if (callsiteIsHot(FS, PSI)) Hot = true; else if (shouldInlineColdCallee(*CB)) @@ -1113,6 +1259,11 @@ bool SampleProfileLoader::inlineHotFunctions( } for (CallBase *I : CIS) { Function *CalledFunction = I->getCalledFunction(); + InlineCandidate Candidate = { + I, + LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I] + : nullptr, + 0 /* dummy count */, 1.0 /* dummy distribution factor */}; // Do not inline recursive calls. if (CalledFunction == &F) continue; @@ -1121,6 +1272,7 @@ bool SampleProfileLoader::inlineHotFunctions( continue; uint64_t Sum; for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { + uint64_t SumOrigin = Sum; if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); @@ -1129,65 +1281,34 @@ bool SampleProfileLoader::inlineHotFunctions( if (!callsiteIsHot(FS, PSI)) continue; - const char *Reason = "Callee function not available"; - // R->getValue() != &F is to prevent promoting a recursive call. - // If it is a recursive call, we do not inline it as it could bloat - // the code exponentially. There is way to better handle this, e.g. 
- // clone the caller first, and inline the cloned caller if it is - // recursive. As llvm does not inline recursive calls, we will - // simply ignore it instead of handling it explicitly. - auto CalleeFunctionName = FS->getFuncName(); - auto R = SymbolMap.find(CalleeFunctionName); - if (R != SymbolMap.end() && R->getValue() && - !R->getValue()->isDeclaration() && - R->getValue()->getSubprogram() && - R->getValue()->hasFnAttribute("use-sample-profile") && - R->getValue() != &F && - isLegalToPromote(*I, R->getValue(), &Reason)) { - uint64_t C = FS->getEntrySamples(); - auto &DI = - pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE); - Sum -= C; - PromotedInsns.insert(I); - // If profile mismatches, we should not attempt to inline DI. - if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) && - inlineCallInstruction(cast<CallBase>(DI))) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined(FS); - localNotInlinedCallSites.erase(I); - LocalChanged = true; - ++NumCSInlined; - } - } else { - LLVM_DEBUG(dbgs() - << "\nFailed to promote indirect call to " - << CalleeFunctionName << " because " << Reason << "\n"); + Candidate = {I, FS, FS->getEntrySamples(), 1.0}; + if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, + PromotedInsns)) { + LocalNotInlinedCallSites.erase(I); + LocalChanged = true; } } } else if (CalledFunction && CalledFunction->getSubprogram() && !CalledFunction->isDeclaration()) { - if (inlineCallInstruction(*I)) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined( - localNotInlinedCallSites[I]); - localNotInlinedCallSites.erase(I); + if (tryInlineCandidate(Candidate)) { + LocalNotInlinedCallSites.erase(I); LocalChanged = true; - ++NumCSInlined; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); } } - if (LocalChanged) { - Changed = true; - } else { - break; - } + Changed |= LocalChanged; } + // For CS profile, profile for not inlined context will be merged when + // base profile is being trieved + if (ProfileIsCS) + return Changed; + // Accumulate not inlined callsite information into notInlinedSamples - for (const auto &Pair : localNotInlinedCallSites) { + for (const auto &Pair : LocalNotInlinedCallSites) { CallBase *I = Pair.getFirst(); Function *Callee = I->getCalledFunction(); if (!Callee || Callee->isDeclaration()) @@ -1232,6 +1353,266 @@ bool SampleProfileLoader::inlineHotFunctions( return Changed; } +bool SampleProfileLoader::tryInlineCandidate( + InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) { + + CallBase &CB = *Candidate.CallInstr; + Function *CalledFunction = CB.getCalledFunction(); + assert(CalledFunction && "Expect a callee with definition"); + DebugLoc DLoc = CB.getDebugLoc(); + BasicBlock *BB = CB.getParent(); + + InlineCost Cost = shouldInlineCandidate(Candidate); + if (Cost.isNever()) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) + << "incompatible inlining"); + return false; + } + + if (!Cost) + return false; + + InlineFunctionInfo IFI(nullptr, GetAC); + if (InlineFunction(CB, IFI).isSuccess()) { + // The call to InlineFunction erases I, so we can't pass it here. + emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, + true, CSINLINE_DEBUG); + + // Now populate the list of newly exposed call sites. 
+ if (InlinedCallSites) { + InlinedCallSites->clear(); + for (auto &I : IFI.InlinedCallSites) + InlinedCallSites->push_back(I); + } + + if (ProfileIsCS) + ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); + ++NumCSInlined; + + // Prorate inlined probes for a duplicated inlining callsite which probably + // has a distribution less than 100%. Samples for an inlinee should be + // distributed among the copies of the original callsite based on each + // callsite's distribution factor for counts accuracy. Note that an inlined + // probe may come with its own distribution factor if it has been duplicated + // in the inlinee body. The two factor are multiplied to reflect the + // aggregation of duplication. + if (Candidate.CallsiteDistribution < 1) { + for (auto &I : IFI.InlinedCallSites) { + if (Optional<PseudoProbe> Probe = extractProbe(*I)) + setProbeDistributionFactor(*I, Probe->Factor * + Candidate.CallsiteDistribution); + } + NumDuplicatedInlinesite++; + } + + return true; + } + return false; +} + +bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, + CallBase *CB) { + assert(CB && "Expect non-null call instruction"); + + if (isa<IntrinsicInst>(CB)) + return false; + + // Find the callee's profile. For indirect call, find hottest target profile. + const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB); + if (!CalleeSamples) + return false; + + float Factor = 1.0; + if (Optional<PseudoProbe> Probe = extractProbe(*CB)) + Factor = Probe->Factor; + + uint64_t CallsiteCount = 0; + ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent()); + if (Weight) + CallsiteCount = Weight.get(); + if (CalleeSamples) + CallsiteCount = std::max( + CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor)); + + *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor}; + return true; +} + +InlineCost +SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { + std::unique_ptr<InlineAdvice> Advice = nullptr; + if (ExternalInlineAdvisor) { + Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr); + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + return InlineCost::getNever("not previously inlined"); + } + Advice->recordInlining(); + return InlineCost::getAlways("previously inlined"); + } + + // Adjust threshold based on call site hotness, only do this for callsite + // prioritized inliner because otherwise cost-benefit check is done earlier. + int SampleThreshold = SampleColdCallSiteThreshold; + if (CallsitePrioritizedInline) { + if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) + SampleThreshold = SampleHotCallSiteThreshold; + else if (!ProfileSizeInline) + return InlineCost::getNever("cold callsite"); + } + + Function *Callee = Candidate.CallInstr->getCalledFunction(); + assert(Callee && "Expect a definition for inline candidate of direct call"); + + InlineParams Params = getInlineParams(); + Params.ComputeFullInlineCost = true; + // Checks if there is anything in the reachable portion of the callee at + // this callsite that makes this inlining potentially illegal. Need to + // set ComputeFullInlineCost, otherwise getInlineCost may return early + // when cost exceeds threshold without checking all IRs in the callee. + // The acutal cost does not matter because we only checks isNever() to + // see if it is legal to inline the callsite. 
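In getInlineCandidate above, the weight used for prioritization is the larger of the block weight at the call and the callee's entry samples scaled by the probe distribution factor. A one-function sketch of that computation (the names here are illustrative, not the loader's API):

    #include <algorithm>
    #include <cstdint>

    // BlockWeight: profile weight of the block containing the call (0 if unknown).
    // CalleeEntrySamples: entry sample count from the callee's profile.
    // Factor: the callsite's probe distribution factor in [0, 1].
    uint64_t candidateCallsiteCount(uint64_t BlockWeight,
                                    uint64_t CalleeEntrySamples, float Factor) {
      return std::max(BlockWeight, uint64_t(CalleeEntrySamples * Factor));
    }

    // Example: candidateCallsiteCount(120, 300, 0.5f) == 150.
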
+ InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params, + GetTTI(*Callee), GetAC, GetTLI); + + // Honor always inline and never inline from call analyzer + if (Cost.isNever() || Cost.isAlways()) + return Cost; + + // For old FDO inliner, we inline the call site as long as cost is not + // "Never". The cost-benefit check is done earlier. + if (!CallsitePrioritizedInline) { + return InlineCost::get(Cost.getCost(), INT_MAX); + } + + // Otherwise only use the cost from call analyzer, but overwite threshold with + // Sample PGO threshold. + return InlineCost::get(Cost.getCost(), SampleThreshold); +} + +bool SampleProfileLoader::inlineHotFunctionsWithPriority( + Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { + DenseSet<Instruction *> PromotedInsns; + assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); + + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. + assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + + // Populating worklist with initial call sites from root inliner, along + // with call site weights. + CandidateQueue CQueue; + InlineCandidate NewCandidate; + for (auto &BB : F) { + for (auto &I : BB.getInstList()) { + auto *CB = dyn_cast<CallBase>(&I); + if (!CB) + continue; + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.push(NewCandidate); + } + } + + // Cap the size growth from profile guided inlining. This is needed even + // though cost of each inline candidate already accounts for callee size, + // because with top-down inlining, we can grow inliner size significantly + // with large number of smaller inlinees each pass the cost check. + assert(ProfileInlineLimitMax >= ProfileInlineLimitMin && + "Max inline size limit should not be smaller than min inline size " + "limit."); + unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + if (ExternalInlineAdvisor) + SizeLimit = std::numeric_limits<unsigned>::max(); + + // Perform iterative BFS call site prioritized inlining + bool Changed = false; + while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { + InlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + CallBase *I = Candidate.CallInstr; + Function *CalledFunction = I->getCalledFunction(); + + if (CalledFunction == &F) + continue; + if (I->isIndirectCall()) { + if (PromotedInsns.count(I)) + continue; + uint64_t Sum; + auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); + uint64_t SumOrigin = Sum; + Sum *= Candidate.CallsiteDistribution; + for (const auto *FS : CalleeSamples) { + // TODO: Consider disable pre-lTO ICP for MonoLTO as well + if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { + FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), + PSI->getOrCompHotCountThreshold()); + continue; + } + uint64_t EntryCountDistributed = + FS->getEntrySamples() * Candidate.CallsiteDistribution; + // In addition to regular inline cost check, we also need to make sure + // ICP isn't introducing excessive speculative checks even if individual + // target looks beneficial to promote and inline. That means we should + // only do ICP when there's a small number dominant targets. 
+ if (EntryCountDistributed < SumOrigin / ProfileICPThreshold) + break; + // TODO: Fix CallAnalyzer to handle all indirect calls. + // For indirect call, we don't run CallAnalyzer to get InlineCost + // before actual inlining. This is because we could see two different + // types from the same definition, which makes CallAnalyzer choke as + // it's expecting matching parameter type on both caller and callee + // side. See example from PR18962 for the triggering cases (the bug was + // fixed, but we generate different types). + if (!PSI->isHotCount(EntryCountDistributed)) + break; + SmallVector<CallBase *, 8> InlinedCallSites; + // Attach function profile for promoted indirect callee, and update + // call site count for the promoted inline candidate too. + Candidate = {I, FS, EntryCountDistributed, + Candidate.CallsiteDistribution}; + if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, + PromotedInsns, &InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } + } else if (CalledFunction && CalledFunction->getSubprogram() && + !CalledFunction->isDeclaration()) { + SmallVector<CallBase *, 8> InlinedCallSites; + if (tryInlineCandidate(Candidate, &InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { + findCalleeFunctionSamples(*I)->findInlinedFunctions( + InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); + } + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++NumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++NumCSInlinedHitMinLimit; + else + ++NumCSInlinedHitGrowthLimit; + } + + return Changed; +} + /// Find equivalence classes for the given block. /// /// This finds all the blocks that are guaranteed to execute the same @@ -1654,6 +2035,14 @@ void SampleProfileLoader::propagateWeights(Function &F) { auto T = FS->findCallTargetMapAt(CallSite); if (!T || T.get().empty()) continue; + // Prorate the callsite counts to reflect what is already done to the + // callsite, such as ICP or calliste cloning. + if (FunctionSamples::ProfileIsProbeBased) { + if (Optional<PseudoProbe> Probe = extractProbe(I)) { + if (Probe->Factor < 1) + T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor); + } + } SmallVector<InstrProfValueData, 2> SortedCallTargets = GetSortedValueDataFromCallTargets(T.get()); uint64_t Sum; @@ -1833,7 +2222,10 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { } DenseSet<GlobalValue::GUID> InlinedGUIDs; - Changed |= inlineHotFunctions(F, InlinedGUIDs); + if (ProfileIsCS && CallsitePrioritizedInline) + Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); + else + Changed |= inlineHotFunctions(F, InlinedGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); @@ -1898,6 +2290,45 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) +// Add inlined profile call edges to the call graph. 
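In the propagateWeights hunk above, indirect-call target counts are adjusted when the enclosing probe carries a partial distribution factor; presumably SampleRecord::adjustCallTargets scales each target's count by that factor. A sketch of that scaling over a plain map (this is an assumption about the helper's behavior, not its actual implementation):

    #include <cstdint>
    #include <map>
    #include <string>

    // Scale every call target's count by the probe's distribution factor.
    std::map<std::string, uint64_t>
    adjustTargets(const std::map<std::string, uint64_t> &Targets, float Factor) {
      std::map<std::string, uint64_t> Adjusted;
      for (const auto &It : Targets)
        Adjusted[It.first] = uint64_t(It.second * Factor);
      return Adjusted;
    }

    // Example: {{"foo", 800}, {"bar", 200}} with Factor = 0.25 becomes
    // {{"foo", 200}, {"bar", 50}}.
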
+void SampleProfileLoader::addCallGraphEdges(CallGraph &CG, + const FunctionSamples &Samples) { + Function *Caller = SymbolMap.lookup(Samples.getFuncName()); + if (!Caller || Caller->isDeclaration()) + return; + + // Skip non-inlined call edges which are not important since top down inlining + // for non-CS profile is to get more precise profile matching, not to enable + // more inlining. + + for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { + for (const auto &InlinedSamples : CallsiteSamples.second) { + Function *Callee = SymbolMap.lookup(InlinedSamples.first); + if (Callee && !Callee->isDeclaration()) + CG[Caller]->addCalledFunction(nullptr, CG[Callee]); + addCallGraphEdges(CG, InlinedSamples.second); + } + } +} + +// Replace call graph edges with dynamic call edges from the profile. +void SampleProfileLoader::replaceCallGraphEdges( + CallGraph &CG, StringMap<Function *> &SymbolMap) { + // Remove static call edges from the call graph except for the ones from the + // root which make the call graph connected. + for (const auto &Node : CG) + if (Node.second.get() != CG.getExternalCallingNode()) + Node.second->removeAllCalledFunctions(); + + // Add profile call edges to the call graph. + if (ProfileIsCS) { + ContextTracker->addCallGraphEdges(CG, SymbolMap); + } else { + for (const auto &Samples : Reader->getProfiles()) + addCallGraphEdges(CG, Samples.second); + } +} + std::vector<Function *> SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { std::vector<Function *> FunctionOrderList; @@ -1920,16 +2351,97 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { } assert(&CG->getModule() == &M); + + // Add indirect call edges from profile to augment the static call graph. + // Functions will be processed in a top-down order defined by the static call + // graph. Adjusting the order by considering indirect call edges from the + // profile (which don't exist in the static call graph) can enable the + // inlining of indirect call targets by processing the caller before them. + // TODO: enable this for non-CS profile and fix the counts returning logic to + // have a full support for indirect calls. + if (UseProfileIndirectCallEdges && ProfileIsCS) { + for (auto &Entry : *CG) { + const auto *F = Entry.first; + if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile")) + continue; + auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName()); + if (AllContexts.empty()) + continue; + + for (const auto &BB : *F) { + for (const auto &I : BB.getInstList()) { + const auto *CB = dyn_cast<CallBase>(&I); + if (!CB || !CB->isIndirectCall()) + continue; + const DebugLoc &DLoc = I.getDebugLoc(); + if (!DLoc) + continue; + auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc); + for (FunctionSamples *Samples : AllContexts) { + if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) { + for (const auto &Target : CallTargets.get()) { + Function *Callee = SymbolMap.lookup(Target.first()); + if (Callee && !Callee->isDeclaration()) + Entry.second->addCalledFunction(nullptr, (*CG)[Callee]); + } + } + } + } + } + } + } + + // Compute a top-down order the profile which is used to sort functions in + // one SCC later. The static processing order computed for an SCC may not + // reflect the call contexts in the context-sensitive profile, thus may cause + // potential inlining to be overlooked. The function order in one SCC is being + // adjusted to a top-down order based on the profile to favor more inlining. 
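The ordering step that follows can be read as: number every function by its position in a walk of the profile-derived call graph, then stable-sort each static SCC's members by that number. A simplified sketch with function names as strings; functions absent from the rank map get rank 0 and sort first, mirroring the default value a DenseMap lookup would produce in the hunk below:

    #include <algorithm>
    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    // Stable-sort one SCC's members by a precomputed profile-based rank.
    void sortSCCByProfileOrder(std::vector<std::string> &SCCMembers,
                               const std::map<std::string, uint64_t> &Rank) {
      std::stable_sort(SCCMembers.begin(), SCCMembers.end(),
                       [&Rank](const std::string &L, const std::string &R) {
                         auto LIt = Rank.find(L);
                         auto RIt = Rank.find(R);
                         uint64_t LRank = LIt == Rank.end() ? 0 : LIt->second;
                         uint64_t RRank = RIt == Rank.end() ? 0 : RIt->second;
                         return LRank < RRank;
                       });
    }
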
+ DenseMap<Function *, uint64_t> ProfileOrderMap; + if (UseProfileTopDownOrder || + (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) { + // Create a static call graph. The call edges are not important since they + // will be replaced by dynamic edges from the profile. + CallGraph ProfileCG(M); + replaceCallGraphEdges(ProfileCG, SymbolMap); + scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG); + uint64_t I = 0; + while (!CGI.isAtEnd()) { + for (CallGraphNode *Node : *CGI) { + if (auto *F = Node->getFunction()) + ProfileOrderMap[F] = ++I; + } + ++CGI; + } + } + scc_iterator<CallGraph *> CGI = scc_begin(CG); while (!CGI.isAtEnd()) { - for (CallGraphNode *node : *CGI) { - auto F = node->getFunction(); + uint64_t Start = FunctionOrderList.size(); + for (CallGraphNode *Node : *CGI) { + auto *F = Node->getFunction(); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); } + + // Sort nodes in SCC based on the profile top-down order. + if (!ProfileOrderMap.empty()) { + std::stable_sort(FunctionOrderList.begin() + Start, + FunctionOrderList.end(), + [&ProfileOrderMap](Function *Left, Function *Right) { + return ProfileOrderMap[Left] < ProfileOrderMap[Right]; + }); + } + ++CGI; } + LLVM_DEBUG({ + dbgs() << "Function processing order:\n"; + for (auto F : reverse(FunctionOrderList)) { + dbgs() << F->getName() << "\n"; + } + }); + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); return FunctionOrderList; } @@ -1978,6 +2490,12 @@ bool SampleProfileLoader::doInitialization(Module &M, ProfileIsCS = true; FunctionSamples::ProfileIsCS = true; + // Enable priority-base inliner and size inline by default for CSSPGO. + if (!ProfileSizeInline.getNumOccurrences()) + ProfileSizeInline = true; + if (!CallsitePrioritizedInline.getNumOccurrences()) + CallsitePrioritizedInline = true; + // Tracker for profiles under different context ContextTracker = std::make_unique<SampleContextTracker>(Reader->getProfiles()); @@ -2075,6 +2593,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { + LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n"); DILocation2SampleMap.clear(); // By default the entry count is initialized to -1, which will be treated // conservatively by getEntryCount as the same as unknown (None). 
This is diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index 7cecd20b78d8..a885c3ee4ded 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -12,6 +12,7 @@ #include "llvm/Transforms/IPO/SampleProfileProbe.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -25,8 +26,10 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/CRC.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <unordered_set> #include <vector> using namespace llvm; @@ -35,6 +38,115 @@ using namespace llvm; STATISTIC(ArtificialDbgLine, "Number of probes that have an artificial debug line"); +static cl::opt<bool> + VerifyPseudoProbe("verify-pseudo-probe", cl::init(false), cl::Hidden, + cl::desc("Do pseudo probe verification")); + +static cl::list<std::string> VerifyPseudoProbeFuncList( + "verify-pseudo-probe-funcs", cl::Hidden, + cl::desc("The option to specify the name of the functions to verify.")); + +static cl::opt<bool> + UpdatePseudoProbe("update-pseudo-probe", cl::init(true), cl::Hidden, + cl::desc("Update pseudo probe distribution factor")); + +bool PseudoProbeVerifier::shouldVerifyFunction(const Function *F) { + // Skip function declaration. + if (F->isDeclaration()) + return false; + // Skip function that will not be emitted into object file. The prevailing + // defintion will be verified instead. + if (F->hasAvailableExternallyLinkage()) + return false; + // Do a name matching. + static std::unordered_set<std::string> VerifyFuncNames( + VerifyPseudoProbeFuncList.begin(), VerifyPseudoProbeFuncList.end()); + return VerifyFuncNames.empty() || VerifyFuncNames.count(F->getName().str()); +} + +void PseudoProbeVerifier::registerCallbacks(PassInstrumentationCallbacks &PIC) { + if (VerifyPseudoProbe) { + PIC.registerAfterPassCallback( + [this](StringRef P, Any IR, const PreservedAnalyses &) { + this->runAfterPass(P, IR); + }); + } +} + +// Callback to run after each transformation for the new pass manager. 
+void PseudoProbeVerifier::runAfterPass(StringRef PassID, Any IR) { + std::string Banner = + "\n*** Pseudo Probe Verification After " + PassID.str() + " ***\n"; + dbgs() << Banner; + if (any_isa<const Module *>(IR)) + runAfterPass(any_cast<const Module *>(IR)); + else if (any_isa<const Function *>(IR)) + runAfterPass(any_cast<const Function *>(IR)); + else if (any_isa<const LazyCallGraph::SCC *>(IR)) + runAfterPass(any_cast<const LazyCallGraph::SCC *>(IR)); + else if (any_isa<const Loop *>(IR)) + runAfterPass(any_cast<const Loop *>(IR)); + else + llvm_unreachable("Unknown IR unit"); +} + +void PseudoProbeVerifier::runAfterPass(const Module *M) { + for (const Function &F : *M) + runAfterPass(&F); +} + +void PseudoProbeVerifier::runAfterPass(const LazyCallGraph::SCC *C) { + for (const LazyCallGraph::Node &N : *C) + runAfterPass(&N.getFunction()); +} + +void PseudoProbeVerifier::runAfterPass(const Function *F) { + if (!shouldVerifyFunction(F)) + return; + ProbeFactorMap ProbeFactors; + for (const auto &BB : *F) + collectProbeFactors(&BB, ProbeFactors); + verifyProbeFactors(F, ProbeFactors); +} + +void PseudoProbeVerifier::runAfterPass(const Loop *L) { + const Function *F = L->getHeader()->getParent(); + runAfterPass(F); +} + +void PseudoProbeVerifier::collectProbeFactors(const BasicBlock *Block, + ProbeFactorMap &ProbeFactors) { + for (const auto &I : *Block) { + if (Optional<PseudoProbe> Probe = extractProbe(I)) + ProbeFactors[Probe->Id] += Probe->Factor; + } +} + +void PseudoProbeVerifier::verifyProbeFactors( + const Function *F, const ProbeFactorMap &ProbeFactors) { + bool BannerPrinted = false; + auto &PrevProbeFactors = FunctionProbeFactors[F->getName()]; + for (const auto &I : ProbeFactors) { + float CurProbeFactor = I.second; + if (PrevProbeFactors.count(I.first)) { + float PrevProbeFactor = PrevProbeFactors[I.first]; + if (std::abs(CurProbeFactor - PrevProbeFactor) > + DistributionFactorVariance) { + if (!BannerPrinted) { + dbgs() << "Function " << F->getName() << ":\n"; + BannerPrinted = true; + } + dbgs() << "Probe " << I.first << "\tprevious factor " + << format("%0.2f", PrevProbeFactor) << "\tcurrent factor " + << format("%0.2f", CurProbeFactor) << "\n"; + } + } + + // Update + PrevProbeFactors[I.first] = I.second; + } +} + PseudoProbeManager::PseudoProbeManager(const Module &M) { if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) { for (const auto *Operand : FuncInfo->operands()) { @@ -201,7 +313,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { Function *ProbeFn = llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe); Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index), - Builder.getInt32(0)}; + Builder.getInt32(0), + Builder.getInt64(PseudoProbeFullDistributionFactor)}; auto *Probe = Builder.CreateCall(ProbeFn, Args); AssignDebugLoc(Probe); } @@ -219,7 +332,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { // Levarge the 32-bit discriminator field of debug data to store the ID and // type of a callsite probe. This gets rid of the dependency on plumbing a // customized metadata through the codegen pipeline. 
- uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(Index, Type); + uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData( + Index, Type, 0, PseudoProbeDwarfDiscriminator::FullDistributionFactor); if (auto DIL = Call->getDebugLoc()) { DIL = DIL->cloneWithDiscriminator(V); Call->setDebugLoc(DIL); @@ -274,3 +388,47 @@ PreservedAnalyses SampleProfileProbePass::run(Module &M, return PreservedAnalyses::none(); } + +void PseudoProbeUpdatePass::runOnFunction(Function &F, + FunctionAnalysisManager &FAM) { + BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + auto BBProfileCount = [&BFI](BasicBlock *BB) { + return BFI.getBlockProfileCount(BB) + ? BFI.getBlockProfileCount(BB).getValue() + : 0; + }; + + // Collect the sum of execution weight for each probe. + ProbeFactorMap ProbeFactors; + for (auto &Block : F) { + for (auto &I : Block) { + if (Optional<PseudoProbe> Probe = extractProbe(I)) + ProbeFactors[Probe->Id] += BBProfileCount(&Block); + } + } + + // Fix up over-counted probes. + for (auto &Block : F) { + for (auto &I : Block) { + if (Optional<PseudoProbe> Probe = extractProbe(I)) { + float Sum = ProbeFactors[Probe->Id]; + if (Sum != 0) + setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum); + } + } + } +} + +PreservedAnalyses PseudoProbeUpdatePass::run(Module &M, + ModuleAnalysisManager &AM) { + if (UpdatePseudoProbe) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + runOnFunction(F, FAM); + } + } + return PreservedAnalyses::none(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 0b53007bb6dc..07e68c44416d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1270,6 +1270,7 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0)); ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1)); if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() && + LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() && (transformZExtICmp(LHS, CI, false) || transformZExtICmp(RHS, CI, false))) { // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index d687ec654438..b211b0813611 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); for (++BBI; BBI != E; ++BBI) - if (BBI->mayWriteToMemory()) + if (BBI->mayWriteToMemory()) { + // Calls that only access inaccessible memory do not block sinking the + // load. + if (auto *CB = dyn_cast<CallBase>(BBI)) + if (CB->onlyAccessesInaccessibleMemory()) + continue; return false; + } // Check for non-address taken alloca. If not address-taken already, it isn't // profitable to do this xform. 
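Going back to the PseudoProbeUpdatePass hunk above: once a probe has been duplicated into several blocks, each copy's factor becomes its block's profile count divided by the sum of the counts of all copies of that probe. A standalone sketch of that normalization over plain vectors (ProbeId and BlockCount stand in for the probe and BFI queries):

    #include <cstdint>
    #include <map>
    #include <vector>

    struct ProbeCopy {
      uint64_t ProbeId;
      uint64_t BlockCount; // profile count of the block holding this copy
      float Factor = 1.0f;
    };

    // Normalize factors so the copies of each probe sum (roughly) to 1.0.
    void updateFactors(std::vector<ProbeCopy> &Copies) {
      std::map<uint64_t, uint64_t> Sum;
      for (const ProbeCopy &C : Copies)
        Sum[C.ProbeId] += C.BlockCount;   // total weight per probe id
      for (ProbeCopy &C : Copies)
        if (Sum[C.ProbeId] != 0)
          C.Factor = float(C.BlockCount) / Sum[C.ProbeId];
    }

    // Example: a probe duplicated into blocks with counts 300 and 100 ends up
    // with factors 0.75 and 0.25.
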
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index c265516213aa..16efe863779a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -345,10 +345,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return false; // Get the constant out of the ICmp, if there is one. + // Only try this when exactly 1 operand is a constant (if both operands + // are constant, the icmp should eventually simplify). Otherwise, we may + // invert the transform that reduces set bits and infinite-loop. + Value *X; const APInt *CmpC; ICmpInst::Predicate Pred; - if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) || - CmpC->getBitWidth() != SelC->getBitWidth()) + if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) || + isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth()) return ShrinkDemandedConstant(I, OpNo, DemandedMask); // If the constant is already the same as the ICmp, leave it as-is. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 518e909e8ab4..828fd49524ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3878,9 +3878,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, } } - // Skip processing debug intrinsics in InstCombine. Processing these call instructions - // consumes non-trivial amount of time and provides no value for the optimization. - if (!isa<DbgInfoIntrinsic>(Inst)) { + // Skip processing debug and pseudo intrinsics in InstCombine. Processing + // these call instructions consumes non-trivial amount of time and + // provides no value for the optimization. + if (!Inst->isDebugOrPseudoInst()) { InstrsForInstCombineWorklist.push_back(Inst); SeenAliasScopes.analyse(Inst); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 2b649732a799..ce4e5e575fbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -325,7 +325,7 @@ void AggressiveDeadCodeElimination::initialize() { bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) { // TODO -- use llvm::isInstructionTriviallyDead - if (I.isEHPad() || I.mayHaveSideEffects()) { + if (I.isEHPad() || I.mayHaveSideEffects() || !I.willReturn()) { // Skip any value profile instrumentation calls if they are // instrumenting constants. if (isInstrumentsConstant(I)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 96aef90c1c1a..10b08b4e2224 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -2076,6 +2076,15 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, ValueMapping[PN] = NewPN; } + // Clone noalias scope declarations in the threaded block. 
When threading a + // loop exit, we would otherwise end up with two identical scope declarations + // visible at the same time. + SmallVector<MDNode *> NoAliasScopes; + DenseMap<MDNode *, MDNode *> ClonedScopes; + LLVMContext &Context = PredBB->getContext(); + identifyNoAliasScopesToClone(BI, BE, NoAliasScopes); + cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context); + // Clone the non-phi instructions of the source basic block into NewBB, // keeping track of the mapping and using it to remap operands in the cloned // instructions. @@ -2084,6 +2093,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, New->setName(BI->getName()); NewBB->getInstList().push_back(New); ValueMapping[&*BI] = New; + adaptNoAliasScopes(New, ClonedScopes, Context); // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 18717394d384..822a786fc7c7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1114,12 +1114,16 @@ void LoopUnswitch::emitPreheaderBranchOnCondition( Loop *L = LI->getLoopFor(I->getParent()); auto *DefiningAccess = MemA->getDefiningAccess(); - // If the defining access is a MemoryPhi in the header, get the incoming - // value for the pre-header as defining access. - if (DefiningAccess->getBlock() == I->getParent()) { + // Get the first defining access before the loop. + while (L->contains(DefiningAccess->getBlock())) { + // If the defining access is a MemoryPhi, get the incoming + // value for the pre-header as defining access. 
if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) { DefiningAccess = MemPhi->getIncomingValueForBlock(L->getLoopPreheader()); + } else { + DefiningAccess = + cast<MemoryDef>(DefiningAccess)->getDefiningAccess(); } } MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(), diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index d111a6ba4241..af510f1a84bf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2524,7 +2524,7 @@ private: NewAI.getAlign(), LI.isVolatile(), LI.getName()); if (AATags) - NewLI->setAAMetadata(AATags); + NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); if (NewLI->isAtomic()) @@ -2563,7 +2563,7 @@ private: IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(), LI.isVolatile(), LI.getName()); if (AATags) - NewLI->setAAMetadata(AATags); + NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); @@ -2626,7 +2626,7 @@ private: } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); if (AATags) - Store->setAAMetadata(AATags); + Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); Pass.DeadInsts.push_back(&SI); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -2650,7 +2650,7 @@ private: Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - Store->setAAMetadata(AATags); + Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); Pass.DeadInsts.push_back(&SI); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); return true; @@ -2720,7 +2720,7 @@ private: NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - NewSI->setAAMetadata(AATags); + NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); if (SI.isVolatile()) NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); if (NewSI->isAtomic()) @@ -2816,7 +2816,7 @@ private: getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, MaybeAlign(getSliceAlign()), II.isVolatile()); if (AATags) - New->setAAMetadata(AATags); + New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return false; } @@ -2885,7 +2885,7 @@ private: StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile()); if (AATags) - New->setAAMetadata(AATags); + New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return !II.isVolatile(); } @@ -3006,7 +3006,7 @@ private: CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign, Size, II.isVolatile()); if (AATags) - New->setAAMetadata(AATags); + New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return false; } @@ -3060,7 +3060,7 @@ private: LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign, II.isVolatile(), "copyload"); if (AATags) - Load->setAAMetadata(AATags); + Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); Src = Load; } @@ -3080,7 +3080,7 @@ private: StoreInst *Store = cast<StoreInst>( IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); if (AATags) - Store->setAAMetadata(AATags); + 
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); return !II.isVolatile(); } @@ -3381,8 +3381,13 @@ private: IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); LoadInst *Load = IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load"); - if (AATags) - Load->setAAMetadata(AATags); + + APInt Offset( + DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); + if (AATags && + GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) + Load->setAAMetadata(AATags.shift(Offset.getZExtValue())); + Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); } @@ -3428,8 +3433,13 @@ private: IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); StoreInst *Store = IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment); - if (AATags) - Store->setAAMetadata(AATags); + + APInt Offset( + DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); + if (AATags && + GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) + Store->setAAMetadata(AATags.shift(Offset.getZExtValue())); + LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index f4afa3ad4623..dba5403f272a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -170,16 +170,6 @@ static bool setRetAndArgsNoUndef(Function &F) { return setRetNoUndef(F) | setArgsNoUndef(F); } -static bool setRetNonNull(Function &F) { - assert(F.getReturnType()->isPointerTy() && - "nonnull applies only to pointers"); - if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull)) - return false; - F.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); - ++NumNonNull; - return true; -} - static bool setReturnedArg(Function &F, unsigned ArgNo) { if (F.hasParamAttribute(ArgNo, Attribute::Returned)) return false; @@ -1005,63 +995,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); return Changed; - case LibFunc_ZdlPvRKSt9nothrow_t: // delete(void*, nothrow) - case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t: // delete(void*, align_val_t, nothrow) - case LibFunc_ZdaPvRKSt9nothrow_t: // delete[](void*, nothrow) - case LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t: // delete[](void*, align_val_t, nothrow) - Changed |= setDoesNotThrow(F); - LLVM_FALLTHROUGH; - case LibFunc_ZdlPv: // delete(void*) - case LibFunc_ZdlPvj: // delete(void*, unsigned int) - case LibFunc_ZdlPvm: // delete(void*, unsigned long) - case LibFunc_ZdaPv: // delete[](void*) - case LibFunc_ZdaPvj: // delete[](void*, unsigned int) - case LibFunc_ZdaPvm: // delete[](void*, unsigned long) - case LibFunc_ZdlPvSt11align_val_t: // delete(void*, align_val_t) - case LibFunc_ZdlPvjSt11align_val_t: // delete(void*, unsigned int, align_val_t) - case LibFunc_ZdlPvmSt11align_val_t: // delete(void*, unsigned long, align_val_t) - case LibFunc_ZdaPvSt11align_val_t: // delete[](void*, align_val_t) - case LibFunc_ZdaPvjSt11align_val_t: // delete[](void*, unsigned int, align_val_t) - case LibFunc_ZdaPvmSt11align_val_t: // delete[](void*, unsigned long, align_val_t); - Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); - Changed |= setArgsNoUndef(F); - Changed |= 
setWillReturn(F); - Changed |= setDoesNotCapture(F, 0); - return Changed; - case LibFunc_ZnwjRKSt9nothrow_t: // new(unsigned int, nothrow) - case LibFunc_ZnwmRKSt9nothrow_t: // new(unsigned long, nothrow) - case LibFunc_ZnajRKSt9nothrow_t: // new[](unsigned int, nothrow) - case LibFunc_ZnamRKSt9nothrow_t: // new[](unsigned long, nothrow) - case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t: // new(unsigned int, align_val_t, nothrow) - case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: // new(unsigned long, align_val_t, nothrow) - case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t: // new[](unsigned int, align_val_t, nothrow) - case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: // new[](unsigned long, align_val_t, nothrow) - // Nothrow operator new may return null pointer - Changed |= setDoesNotThrow(F); - Changed |= setOnlyAccessesInaccessibleMemory(F); - Changed |= setRetNoUndef(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; - case LibFunc_Znwj: // new(unsigned int) - case LibFunc_Znwm: // new(unsigned long) - case LibFunc_Znaj: // new[](unsigned int) - case LibFunc_Znam: // new[](unsigned long) - case LibFunc_ZnwjSt11align_val_t: // new(unsigned int, align_val_t) - case LibFunc_ZnwmSt11align_val_t: // new(unsigned long, align_val_t) - case LibFunc_ZnajSt11align_val_t: // new[](unsigned int, align_val_t) - case LibFunc_ZnamSt11align_val_t: // new[](unsigned long, align_val_t) - case LibFunc_msvc_new_int: // new(unsigned int) - case LibFunc_msvc_new_longlong: // new(unsigned long long) - case LibFunc_msvc_new_array_int: // new[](unsigned int) - case LibFunc_msvc_new_array_longlong: // new[](unsigned long long) - Changed |= setOnlyAccessesInaccessibleMemory(F); - // Operator new always returns a nonnull noalias pointer - Changed |= setRetNoUndef(F); - Changed |= setRetNonNull(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; // TODO: add LibFunc entries for: // case LibFunc_memset_pattern4: // case LibFunc_memset_pattern8: diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp index 51a49574e55d..6ab061510a60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -989,3 +989,11 @@ void llvm::identifyNoAliasScopesToClone( if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) NoAliasDeclScopes.push_back(Decl->getScopeList()); } + +void llvm::identifyNoAliasScopesToClone( + BasicBlock::iterator Start, BasicBlock::iterator End, + SmallVectorImpl<MDNode *> &NoAliasDeclScopes) { + for (Instruction &I : make_range(Start, End)) + if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) + NoAliasDeclScopes.push_back(Decl->getScopeList()); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index 0ac8fa537f4e..3026342cc4a6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -921,14 +921,20 @@ void ScopedAliasMetadataDeepCloner::remap(ValueToValueMapTy &VMap) { if (!I) continue; + // Only update scopes when we find them in the map. If they are not, it is + // because we already handled that instruction before. This is faster than + // tracking which instructions we already updated. 
if (MDNode *M = I->getMetadata(LLVMContext::MD_alias_scope)) - I->setMetadata(LLVMContext::MD_alias_scope, MDMap[M]); + if (MDNode *MNew = MDMap.lookup(M)) + I->setMetadata(LLVMContext::MD_alias_scope, MNew); if (MDNode *M = I->getMetadata(LLVMContext::MD_noalias)) - I->setMetadata(LLVMContext::MD_noalias, MDMap[M]); + if (MDNode *MNew = MDMap.lookup(M)) + I->setMetadata(LLVMContext::MD_noalias, MNew); if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I)) - Decl->setScopeList(MDMap[Decl->getScopeList()]); + if (MDNode *MNew = MDMap.lookup(Decl->getScopeList())) + Decl->setScopeList(MNew); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 477ea458c763..ae26058c210c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -420,13 +420,8 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, return true; } - if (auto *CB = dyn_cast<CallBase>(I)) { - // Treat calls that may not return as alive. - // TODO: Remove the intrinsic escape hatch once all intrinsics set - // willreturn properly. - if (!CB->willReturn() && !isa<IntrinsicInst>(I)) - return false; - } + if (!I->willReturn()) + return false; if (!I->mayHaveSideEffects()) return true; @@ -923,6 +918,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN, /// \param IncomingValues A map from block to value. static void replaceUndefValuesInPhi(PHINode *PN, const IncomingValueMap &IncomingValues) { + SmallVector<unsigned> TrueUndefOps; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); @@ -930,10 +926,31 @@ static void replaceUndefValuesInPhi(PHINode *PN, BasicBlock *BB = PN->getIncomingBlock(i); IncomingValueMap::const_iterator It = IncomingValues.find(BB); - if (It == IncomingValues.end()) continue; + // Keep track of undef/poison incoming values. Those must match, so we fix + // them up below if needed. + // Note: this is conservatively correct, but we could try harder and group + // the undef values per incoming basic block. + if (It == IncomingValues.end()) { + TrueUndefOps.push_back(i); + continue; + } + + // There is a defined value for this incoming block, so map this undef + // incoming value to the defined value. PN->setIncomingValue(i, It->second); } + + // If there are both undef and poison values incoming, then convert those + // values to undef. It is invalid to have different values for the same + // incoming block. 
+ unsigned PoisonCount = count_if(TrueUndefOps, [&](unsigned i) { + return isa<PoisonValue>(PN->getIncomingValue(i)); + }); + if (PoisonCount != 0 && PoisonCount != TrueUndefOps.size()) { + for (unsigned i : TrueUndefOps) + PN->setIncomingValue(i, UndefValue::get(PN->getType())); + } } /// Replace a value flowing from a block to a phi with diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp index cb5fee7d28e6..befacb591762 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -509,7 +509,7 @@ static void cloneLoopBlocks( SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges, SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, - LoopInfo *LI) { + LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes) { BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); BasicBlock *PreHeader = L->getLoopPreheader(); @@ -545,6 +545,15 @@ static void cloneLoopBlocks( } } + { + // Identify what other metadata depends on the cloned version. After + // cloning, replace the metadata with the corrected version for both + // memory instructions and noalias intrinsics. + std::string Ext = (Twine("Peel") + Twine(IterNumber)).str(); + cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks, + Header->getContext(), Ext); + } + // Recursively create the new Loop objects for nested loops, if any, // to preserve LoopInfo. for (Loop *ChildLoop : *L) { @@ -769,13 +778,19 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, uint64_t ExitWeight = 0, FallThroughWeight = 0; initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); + // Identify what noalias metadata is inside the loop: if it is inside the + // loop, the associated metadata must be cloned for each iteration. + SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes; + identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes); + // For each peeled-off iteration, make a copy of the loop. for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector<BasicBlock *, 8> NewBlocks; ValueToValueMapTy VMap; cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, - LoopBlocks, VMap, LVMap, DT, LI); + LoopBlocks, VMap, LVMap, DT, LI, + LoopLocalNoAliasDeclScopes); // Remap to use values from the current iteration instead of the // previous one. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 7cfe17618cde..de9560df9785 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1628,6 +1628,11 @@ static bool canSinkInstructions( I->getType()->isTokenTy()) return false; + // Do not try to sink an instruction in an infinite loop - it can cause + // this algorithm to infinite loop. + if (I->getParent()->getSingleSuccessor() == I->getParent()) + return false; + // Conservatively return false if I is an inline-asm instruction. Sinking // and merging inline-asm instructions can potentially create arguments // that cannot satisfy the inline-asm constraints. 
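The replaceUndefValuesInPhi hunk above leaves unmatched undef incomings alone unless undef and poison are mixed, in which case the poison entries are relaxed to undef so every such entry carries one value. A self-contained model of that rule, in ordinary C++ rather than the LLVM routine (the Incoming enum and helper name are invented for illustration):

#include <cstddef>
#include <vector>

enum class Incoming { Defined, Undef, Poison };

// If the "don't care" incoming values mix undef and poison, demote the poison
// entries to undef so a predecessor block that ends up listed more than once
// sees a single value.
static void unifyDontCareIncomings(std::vector<Incoming> &Values) {
  std::size_t DontCare = 0, Poison = 0;
  for (Incoming V : Values) {
    if (V != Incoming::Defined)
      ++DontCare;
    if (V == Incoming::Poison)
      ++Poison;
  }
  if (Poison != 0 && Poison != DontCare)
    for (Incoming &V : Values)
      if (V == Incoming::Poison)
        V = Incoming::Undef;
}

With entries {Defined, Poison, Undef}, only the poison entry changes; with entries that are all poison, nothing changes, matching the PoisonCount != TrueUndefOps.size() test.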
@@ -1714,13 +1719,13 @@ static bool canSinkInstructions( return true; } -// Assuming canSinkLastInstruction(Blocks) has returned true, sink the last +// Assuming canSinkInstructions(Blocks) has returned true, sink the last // instruction of every block in Blocks to their common successor, commoning // into one instruction. static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) { auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0); - // canSinkLastInstruction returning true guarantees that every block has at + // canSinkInstructions returning true guarantees that every block has at // least one non-terminator instruction. SmallVector<Instruction*,4> Insts; for (auto *BB : Blocks) { @@ -1733,9 +1738,9 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) { } // The only checking we need to do now is that all users of all instructions - // are the same PHI node. canSinkLastInstruction should have checked this but - // it is slightly over-aggressive - it gets confused by commutative instructions - // so double-check it here. + // are the same PHI node. canSinkInstructions should have checked this but + // it is slightly over-aggressive - it gets confused by commutative + // instructions so double-check it here. Instruction *I0 = Insts.front(); if (!I0->user_empty()) { auto *PNUse = dyn_cast<PHINode>(*I0->user_begin()); @@ -1746,11 +1751,11 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) { return false; } - // We don't need to do any more checking here; canSinkLastInstruction should + // We don't need to do any more checking here; canSinkInstructions should // have done it all for us. SmallVector<Value*, 4> NewOperands; for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { - // This check is different to that in canSinkLastInstruction. There, we + // This check is different to that in canSinkInstructions. There, we // cared about the global view once simplifycfg (and instcombine) have // completed - it takes into account PHIs that become trivially // simplifiable. However here we need a more local view; if an operand diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 1795470fa58c..19797e6f7858 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -142,6 +142,10 @@ public: return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); } + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ea0d7673edf6..b456a97aa4ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -372,19 +372,11 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an -/// element of the corresponding vector type at the given vectorization factor. 
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { - // Determine if an array of VF elements of type Ty is "bitcast compatible" - // with a <VF x Ty> vector. - if (VF.isVector()) { - auto *VectorTy = VectorType::get(Ty, VF); - return TypeSize::get(VF.getKnownMinValue() * - DL.getTypeAllocSize(Ty).getFixedValue(), - VF.isScalable()) != DL.getTypeStoreSize(VectorTy); - } - - // If the vectorization factor is one, we just check if an array of type Ty - // requires padding between elements. +/// element of the corresponding vector type. +static bool hasIrregularType(Type *Ty, const DataLayout &DL) { + // Determine if an array of N elements of type Ty is "bitcast compatible" + // with a <N x Ty> vector. + // This is only true if there is no padding between the array elements. return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } @@ -5212,7 +5204,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = getMemInstValueType(I); - if (hasIrregularType(ScalarTy, DL, VF)) + if (hasIrregularType(ScalarTy, DL)) return false; // Check if masking is required. @@ -5259,7 +5251,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); - if (hasIrregularType(ScalarTy, DL, VF)) + if (hasIrregularType(ScalarTy, DL)) return false; return true; @@ -5504,11 +5496,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return None; } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return MaxVF; + return computeFeasibleMaxVF(TC, UserVF); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5546,7 +5536,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return computeFeasibleMaxVF(TC, UserVF); } return None; } @@ -5563,6 +5553,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); assert(!MaxVF.isScalable() && "Scalable vectors do not yet support tail folding"); assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && @@ -8196,8 +8187,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask); - if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. - EdgeMask = Builder.createAnd(EdgeMask, SrcMask); + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The condition is 'SrcMask && EdgeMask', which is equivalent to + // 'select i1 SrcMask, i1 EdgeMask, i1 false'. + // The select version does not introduce new UB if SrcMask is false and + // EdgeMask is poison. Using 'and' here introduces undefined behavior. + VPValue *False = Plan->getOrAddVPValue( + ConstantInt::getFalse(BI->getCondition()->getType())); + EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + } return EdgeMaskCache[Edge] = EdgeMask; } |
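The final hunk forms the edge mask with 'select SrcMask, EdgeMask, false' instead of 'and SrcMask, EdgeMask'. A simplified three-valued model of the difference, written as plain C++ rather than LLVM IR (Bit and the two helpers exist only for this comparison and gloss over the full poison semantics):

enum class Bit { False, True, Poison };

// Models 'and %srcmask, %edgemask': poison in either operand poisons the
// result.
static Bit maskAnd(Bit Src, Bit Edge) {
  if (Src == Bit::Poison || Edge == Bit::Poison)
    return Bit::Poison;
  return (Src == Bit::True && Edge == Bit::True) ? Bit::True : Bit::False;
}

// Models 'select i1 %srcmask, i1 %edgemask, i1 false': a false source mask
// hides a poison edge mask instead of propagating it.
static Bit maskSelect(Bit Src, Bit Edge) {
  if (Src == Bit::Poison)
    return Bit::Poison;
  return Src == Bit::True ? Edge : Bit::False;
}

maskAnd(Bit::False, Bit::Poison) is Poison while maskSelect(Bit::False, Bit::Poison) is False, which is the property the new createSelect-based edge mask relies on.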