Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp  10
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp  118
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp  713
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp  162
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp  1
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp  8
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  8
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  7
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp  2
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp  10
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp  10
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp  38
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp  67
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp  8
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp  12
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp  33
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp  19
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp  19
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h  4
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  40
20 files changed, 1029 insertions, 260 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 30a1f81ad0e1..6730824e860a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (isNoModRef(MRI))
continue;
+ // A pseudo probe call shouldn't change any function attribute since it
+ // doesn't translate to a real instruction. It comes with a memory access
+ // tag to prevent it from being removed by optimizations while not blocking
+ // other instructions from being optimized.
+ if (isa<PseudoProbeInst>(I))
+ continue;
+
if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
// The call could access any memory. If that includes writes, note it.
if (isModSet(MRI))
@@ -1445,8 +1452,7 @@ static bool functionWillReturn(const Function &F) {
// If there are no loops, then the function is willreturn if all calls in
// it are willreturn.
return all_of(instructions(F), [](const Instruction &I) {
- const auto *CB = dyn_cast<CallBase>(&I);
- return !CB || CB->hasFnAttr(Attribute::WillReturn);
+ return I.willReturn();
});
}
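
The rewrite above leans on Instruction::willReturn(). As a rough sketch (an assumption about its behavior, not a quote of the LLVM implementation), the replaced lambda and the new call agree at call sites: only calls explicitly known to return count as willreturn, while other instructions are treated as returning.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Instruction.h"

    // Sketch only: approximates what the replaced lambda computed.
    static bool willReturnLike(const llvm::Instruction &I) {
      if (const auto *CB = llvm::dyn_cast<llvm::CallBase>(&I))
        return CB->hasFnAttr(llvm::Attribute::WillReturn);
      return true; // non-call instructions are treated as returning
    }
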
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index 37fc27e91100..158fa0771c3b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -30,7 +30,7 @@ namespace llvm {
ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
StringRef CalleeName) {
if (CalleeName.empty())
- return getChildContext(CallSite);
+ return getHottestChildContext(CallSite);
uint32_t Hash = nodeHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
@@ -40,18 +40,22 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
}
ContextTrieNode *
-ContextTrieNode::getChildContext(const LineLocation &CallSite) {
+ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) {
// CSFDO-TODO: This could be slow, change AllChildContext so we can
// do point look up for child node by call site alone.
- // CSFDO-TODO: Return the child with max count for indirect call
+ // Retrieve the child node with max count for indirect call
ContextTrieNode *ChildNodeRet = nullptr;
+ uint64_t MaxCalleeSamples = 0;
for (auto &It : AllChildContext) {
ContextTrieNode &ChildNode = It.second;
- if (ChildNode.CallSiteLoc == CallSite) {
- if (ChildNodeRet)
- return nullptr;
- else
- ChildNodeRet = &ChildNode;
+ if (ChildNode.CallSiteLoc != CallSite)
+ continue;
+ FunctionSamples *Samples = ChildNode.getFunctionSamples();
+ if (!Samples)
+ continue;
+ if (Samples->getTotalSamples() > MaxCalleeSamples) {
+ ChildNodeRet = &ChildNode;
+ MaxCalleeSamples = Samples->getTotalSamples();
}
}
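
The loop above keeps the child with the maximum total sample count at a given call site. A minimal standalone sketch of the same scan, using hypothetical stand-in types rather than the real ContextTrieNode:

    #include <cstdint>
    #include <map>

    struct Child {
      uint64_t CallSiteLoc;  // stand-in for LineLocation
      uint64_t TotalSamples; // stand-in for FunctionSamples::getTotalSamples()
    };

    const Child *hottestChildAt(const std::map<uint64_t, Child> &Children,
                                uint64_t CallSite) {
      const Child *Best = nullptr;
      uint64_t MaxSamples = 0;
      for (const auto &It : Children) {
        const Child &C = It.second;
        if (C.CallSiteLoc != CallSite)
          continue;
        // Strict '>' means a zero-sample child is never returned, matching
        // the null-samples guard in the code above.
        if (C.TotalSamples > MaxSamples) {
          Best = &C;
          MaxSamples = C.TotalSamples;
        }
      }
      return Best;
    }
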
@@ -179,7 +183,7 @@ SampleContextTracker::SampleContextTracker(
SampleContext Context(FuncSample.first(), RawContext);
LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n");
if (!Context.isBaseContext())
- FuncToCtxtProfileSet[Context.getName()].insert(FSamples);
+ FuncToCtxtProfileSet[Context.getNameWithoutContext()].insert(FSamples);
ContextTrieNode *NewNode = getOrCreateContextPath(Context, true);
assert(!NewNode->getFunctionSamples() &&
"New node can't have sample profile");
@@ -191,12 +195,12 @@ FunctionSamples *
SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
StringRef CalleeName) {
LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n");
- // CSFDO-TODO: We use CalleeName to differentiate indirect call
- // We need to get sample for indirect callee too.
DILocation *DIL = Inst.getDebugLoc();
if (!DIL)
return nullptr;
+ // For indirect call, CalleeName will be empty, in which case the context
+ // profile for the callee with the largest total samples will be returned.
ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName);
if (CalleeContext) {
FunctionSamples *FSamples = CalleeContext->getFunctionSamples();
@@ -209,6 +213,26 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
return nullptr;
}
+std::vector<const FunctionSamples *>
+SampleContextTracker::getIndirectCalleeContextSamplesFor(
+ const DILocation *DIL) {
+ std::vector<const FunctionSamples *> R;
+ if (!DIL)
+ return R;
+
+ ContextTrieNode *CallerNode = getContextFor(DIL);
+ LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+ for (auto &It : CallerNode->getAllChildContext()) {
+ ContextTrieNode &ChildNode = It.second;
+ if (ChildNode.getCallSiteLoc() != CallSite)
+ continue;
+ if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples())
+ R.push_back(CalleeSamples);
+ }
+
+ return R;
+}
+
FunctionSamples *
SampleContextTracker::getContextSamplesFor(const DILocation *DIL) {
assert(DIL && "Expect non-null location");
@@ -239,6 +263,17 @@ SampleContextTracker::getContextSamplesFor(const SampleContext &Context) {
return Node->getFunctionSamples();
}
+SampleContextTracker::ContextSamplesTy &
+SampleContextTracker::getAllContextSamplesFor(const Function &Func) {
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
+ return FuncToCtxtProfileSet[CanonName];
+}
+
+SampleContextTracker::ContextSamplesTy &
+SampleContextTracker::getAllContextSamplesFor(StringRef Name) {
+ return FuncToCtxtProfileSet[Name];
+}
+
FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func,
bool MergeContext) {
StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
@@ -295,11 +330,6 @@ void SampleContextTracker::promoteMergeContextSamplesTree(
const Instruction &Inst, StringRef CalleeName) {
LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n"
<< Inst << "\n");
- // CSFDO-TODO: We also need to promote context profile from indirect
- // calls. We won't have callee names from those from call instr.
- if (CalleeName.empty())
- return;
-
// Get the caller context for the call instruction, we don't use callee
// name from call because there can be context from indirect calls too.
DILocation *DIL = Inst.getDebugLoc();
@@ -308,8 +338,23 @@ void SampleContextTracker::promoteMergeContextSamplesTree(
return;
// Get the context that needs to be promoted
- LineLocation CallSite(FunctionSamples::getOffset(DIL),
- DIL->getBaseDiscriminator());
+ LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+ // For indirect call, CalleeName will be empty, in which case we need to
+ // promote all non-inlined child context profiles.
+ if (CalleeName.empty()) {
+ for (auto &It : CallerNode->getAllChildContext()) {
+ ContextTrieNode *NodeToPromo = &It.second;
+ if (CallSite != NodeToPromo->getCallSiteLoc())
+ continue;
+ FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples();
+ if (FromSamples && FromSamples->getContext().hasState(InlinedContext))
+ continue;
+ promoteMergeContextSamplesTree(*NodeToPromo);
+ }
+ return;
+ }
+
+ // Get the context for the given callee that needs to be promoted
ContextTrieNode *NodeToPromo =
CallerNode->getChildContext(CallSite, CalleeName);
if (!NodeToPromo)
@@ -329,6 +374,8 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
LLVM_DEBUG(dbgs() << " Found context tree root to promote: "
<< FromSamples->getContext() << "\n");
+ assert(!FromSamples->getContext().hasState(InlinedContext) &&
+ "Shouldn't promote inlined context profile");
StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext();
return promoteMergeContextSamplesTree(NodeToPromo, RootContext,
ContextStrToRemove);
@@ -361,18 +408,14 @@ SampleContextTracker::getCalleeContextFor(const DILocation *DIL,
StringRef CalleeName) {
assert(DIL && "Expect non-null location");
- // CSSPGO-TODO: need to support indirect callee
- if (CalleeName.empty())
- return nullptr;
-
ContextTrieNode *CallContext = getContextFor(DIL);
if (!CallContext)
return nullptr;
+ // When CalleeName is empty, the child context profile with max
+ // total samples will be returned.
return CallContext->getChildContext(
- LineLocation(FunctionSamples::getOffset(DIL),
- DIL->getBaseDiscriminator()),
- CalleeName);
+ FunctionSamples::getCallSiteIdentifier(DIL), CalleeName);
}
ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
@@ -386,8 +429,8 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
if (Name.empty())
Name = PrevDIL->getScope()->getSubprogram()->getName();
S.push_back(
- std::make_pair(LineLocation(FunctionSamples::getOffset(DIL),
- DIL->getBaseDiscriminator()), Name));
+ std::make_pair(FunctionSamples::getCallSiteIdentifier(DIL),
+ PrevDIL->getScope()->getSubprogram()->getLinkageName()));
PrevDIL = DIL;
}
@@ -518,4 +561,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
return *ToNode;
}
+// Replace call graph edges with dynamic call edges from the profile.
+void SampleContextTracker::addCallGraphEdges(CallGraph &CG,
+ StringMap<Function *> &SymbolMap) {
+ // Add profile call edges to the call graph.
+ std::queue<ContextTrieNode *> NodeQueue;
+ NodeQueue.push(&RootContext);
+ while (!NodeQueue.empty()) {
+ ContextTrieNode *Node = NodeQueue.front();
+ NodeQueue.pop();
+ Function *F = SymbolMap.lookup(Node->getFuncName());
+ for (auto &I : Node->getAllChildContext()) {
+ ContextTrieNode *ChildNode = &I.second;
+ NodeQueue.push(ChildNode);
+ if (F && !F->isDeclaration()) {
+ Function *Callee = SymbolMap.lookup(ChildNode->getFuncName());
+ if (Callee && !Callee->isDeclaration())
+ CG[F]->addCalledFunction(nullptr, CG[Callee]);
+ }
+ }
+ }
+}
} // namespace llvm
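
addCallGraphEdges walks the context trie breadth-first and records an edge for each parent/child pair whose functions both resolve to definitions. The traversal pattern, reduced to a hypothetical trie type:

    #include <queue>
    #include <vector>

    struct TrieNode {
      std::vector<TrieNode> Children;
      // payload elided
    };

    void visitBFS(TrieNode &Root) {
      std::queue<TrieNode *> Work;
      Work.push(&Root);
      while (!Work.empty()) {
        TrieNode *Node = Work.front();
        Work.pop();
        for (TrieNode &Child : Node->Children) {
          Work.push(&Child);
          // An edge Node -> Child would be recorded here when both ends
          // resolve to defined functions, as in the code above.
        }
      }
    }
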
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 264ac4065e8c..a6a419bfe742 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
@@ -107,6 +108,16 @@ STATISTIC(NumCSNotInlined,
STATISTIC(NumMismatchedProfile,
"Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
+STATISTIC(NumDuplicatedInlinesite,
+ "Number of inlined callsites with a partial distribution factor");
+
+STATISTIC(NumCSInlinedHitMinLimit,
+ "Number of functions with FDO inline stopped due to min size limit");
+STATISTIC(NumCSInlinedHitMaxLimit,
+ "Number of functions with FDO inline stopped due to max size limit");
+STATISTIC(
+ NumCSInlinedHitGrowthLimit,
+ "Number of functions with FDO inline stopped due to growth size limit");
// Command line option to specify the file to read samples from. This is
// mainly used for debugging.
@@ -166,11 +177,53 @@ static cl::opt<bool> ProfileTopDownLoad(
"order of call graph during sample profile loading. It only "
"works for new pass manager. "));
+static cl::opt<bool> UseProfileIndirectCallEdges(
+ "use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
+ cl::desc("Considering indirect call samples from profile when top-down "
+ "processing functions. Only CSSPGO is supported."));
+
+static cl::opt<bool> UseProfileTopDownOrder(
+ "use-profile-top-down-order", cl::init(false), cl::Hidden,
+ cl::desc("Process functions in one SCC in a top-down order "
+ "based on the input profile."));
+
static cl::opt<bool> ProfileSizeInline(
"sample-profile-inline-size", cl::Hidden, cl::init(false),
cl::desc("Inline cold call sites in profile loader if it's beneficial "
"for code size."));
+static cl::opt<int> ProfileInlineGrowthLimit(
+ "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
+ cl::desc("The size growth ratio limit for proirity-based sample profile "
+ "loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMin(
+ "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
+ cl::desc("The lower bound of size growth limit for "
+ "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMax(
+ "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
+ cl::desc("The upper bound of size growth limit for "
+ "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileICPThreshold(
+ "sample-profile-icp-threshold", cl::Hidden, cl::init(5),
+ cl::desc(
+ "Relative hotness threshold for indirect "
+ "call promotion in proirity-based sample profile loader inlining."));
+
+static cl::opt<int> SampleHotCallSiteThreshold(
+ "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
+ cl::desc("Hot callsite threshold for proirity-based sample profile loader "
+ "inlining."));
+
+static cl::opt<bool> CallsitePrioritizedInline(
+ "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Use call site prioritized inlining for sample profile loader."
+ "Currently only CSSPGO is supported."));
+
static cl::opt<int> SampleColdCallSiteThreshold(
"sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining cold callsites"));
@@ -313,6 +366,38 @@ private:
DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
};
+// Inline candidate used by iterative callsite prioritized inliner
+struct InlineCandidate {
+ CallBase *CallInstr;
+ const FunctionSamples *CalleeSamples;
+ // Prorated callsite count, which will be used to guide inlining. For example,
+ // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
+ // copies will get their own distribution factors and their prorated counts
+ // will be used to decide if they should be inlined independently.
+ uint64_t CallsiteCount;
+ // Call site distribution factor to prorate the profile samples for a
+ // duplicated callsite. Default value is 1.0.
+ float CallsiteDistribution;
+};
+
+// Inline candidate comparer using call site weight
+struct CandidateComparer {
+ bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
+ if (LHS.CallsiteCount != RHS.CallsiteCount)
+ return LHS.CallsiteCount < RHS.CallsiteCount;
+
+ // Tie breaker using GUID so we have stable/deterministic inlining order
+ assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+ "Expect non-null FunctionSamples");
+ return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) <
+ RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName());
+ }
+};
+
+using CandidateQueue =
+ PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
+ CandidateComparer>;
+
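
Assuming llvm::PriorityQueue has std::priority_queue semantics (a max-heap under the comparer), CandidateComparer pops the hottest call site first and breaks count ties by GUID for determinism. The same ordering illustrated with standard types and made-up data:

    #include <cstdint>
    #include <queue>
    #include <vector>

    struct Cand { uint64_t CallsiteCount; uint64_t GUID; };

    struct Cmp {
      bool operator()(const Cand &L, const Cand &R) const {
        if (L.CallsiteCount != R.CallsiteCount)
          return L.CallsiteCount < R.CallsiteCount; // higher count pops first
        return L.GUID < R.GUID; // deterministic tie breaker
      }
    };

    int main() {
      std::priority_queue<Cand, std::vector<Cand>, Cmp> Q;
      Q.push({10, 1});
      Q.push({50, 2});
      Q.push({50, 3});
      // Pop order: {50,3}, {50,2}, {10,1} -- hottest first, GUID breaks ties.
    }
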
/// Sample profile pass.
///
/// This pass reads profile data from the file specified by
@@ -350,9 +435,21 @@ protected:
findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
- bool inlineCallInstruction(CallBase &CB);
+ // Attempt to promote indirect call and also inline the promoted call
+ bool tryPromoteAndInlineCandidate(
+ Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
+ uint64_t &Sum, DenseSet<Instruction *> &PromotedInsns,
+ SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
bool inlineHotFunctions(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
+ bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
+ bool
+ tryInlineCandidate(InlineCandidate &Candidate,
+ SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
+ bool
+ inlineHotFunctionsWithPriority(Function &F,
+ DenseSet<GlobalValue::GUID> &InlinedGUIDs);
// Inline cold/small functions in addition to hot ones
bool shouldInlineColdCallee(CallBase &CallInst);
void emitOptimizationRemarksForInlineCandidates(
@@ -371,6 +468,8 @@ protected:
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(Function &F);
std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
+ void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
+ void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
void clearFunctionData();
@@ -808,7 +907,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
if (R) {
- uint64_t Samples = R.get();
+ uint64_t Samples = R.get() * Probe->Factor;
bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
if (FirstMark) {
ORE->emit([&]() {
@@ -816,13 +915,17 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
Remark << "Applied " << ore::NV("NumSamples", Samples);
Remark << " samples from profile (ProbeId=";
Remark << ore::NV("ProbeId", Probe->Id);
+ Remark << ", Factor=";
+ Remark << ore::NV("Factor", Probe->Factor);
+ Remark << ", OriginalSamples=";
+ Remark << ore::NV("OriginalSamples", R.get());
Remark << ")";
return Remark;
});
}
-
LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst
- << " - weight: " << R.get() << ")\n");
+ << " - weight: " << R.get() << " - factor: "
+ << format("%0.2f", Probe->Factor) << ")\n");
return Samples;
}
return R;
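
The weight is now scaled by the probe's distribution factor, so a callsite that was duplicated (and thus owns only part of the original counts) is credited a proportional share. A tiny worked sketch with made-up numbers:

    #include <cstdint>

    int main() {
      uint64_t RawSamples = 1000; // R.get(), hypothetical value
      float Factor = 0.25f;       // Probe->Factor after duplication, hypothetical
      uint64_t Prorated = static_cast<uint64_t>(RawSamples * Factor); // == 250
      (void)Prorated;
    }
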
@@ -918,6 +1021,31 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
return R;
}
+ auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
+ assert(L && R && "Expect non-null FunctionSamples");
+ if (L->getEntrySamples() != R->getEntrySamples())
+ return L->getEntrySamples() > R->getEntrySamples();
+ return FunctionSamples::getGUID(L->getName()) <
+ FunctionSamples::getGUID(R->getName());
+ };
+
+ if (ProfileIsCS) {
+ auto CalleeSamples =
+ ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
+ if (CalleeSamples.empty())
+ return R;
+
+ // For CSSPGO, we only use target context profile's entry count
+ // as that already includes both inlined callees and non-inlined ones.
+ Sum = 0;
+ for (const auto *const FS : CalleeSamples) {
+ Sum += FS->getEntrySamples();
+ R.push_back(FS);
+ }
+ llvm::sort(R, FSCompare);
+ return R;
+ }
+
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return R;
@@ -935,12 +1063,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
Sum += NameFS.second.getEntrySamples();
R.push_back(&NameFS.second);
}
- llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
- if (L->getEntrySamples() != R->getEntrySamples())
- return L->getEntrySamples() > R->getEntrySamples();
- return FunctionSamples::getGUID(L->getName()) <
- FunctionSamples::getGUID(R->getName());
- });
+ llvm::sort(R, FSCompare);
}
return R;
}
@@ -977,42 +1100,64 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
return it.first->second;
}
-bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) {
- if (ExternalInlineAdvisor) {
- auto Advice = ExternalInlineAdvisor->getAdvice(CB);
- if (!Advice->isInliningRecommended()) {
- Advice->recordUnattemptedInlining();
- return false;
+/// Attempt to promote indirect call and also inline the promoted call.
+///
+/// \param F Caller function.
+/// \param Candidate ICP and inline candidate.
+/// \param SumOrigin Original sum of target counts for indirect call.
+/// \param Sum Sum of target counts for indirect call.
+/// \param PromotedInsns Set to keep track of indirect calls already processed.
+/// \param InlinedCallSite Output vector for new call sites exposed after
+/// inlining.
+bool SampleProfileLoader::tryPromoteAndInlineCandidate(
+ Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
+ DenseSet<Instruction *> &PromotedInsns,
+ SmallVector<CallBase *, 8> *InlinedCallSite) {
+ const char *Reason = "Callee function not available";
+ // R->getValue() != &F is to prevent promoting a recursive call.
+ // If it is a recursive call, we do not inline it as it could bloat
+ // the code exponentially. There is a way to better handle this, e.g.
+ // clone the caller first, and inline the cloned caller if it is
+ // recursive. As llvm does not inline recursive calls, we will
+ // simply ignore it instead of handling it explicitly.
+ auto R = SymbolMap.find(Candidate.CalleeSamples->getFuncName());
+ if (R != SymbolMap.end() && R->getValue() &&
+ !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
+ R->getValue()->hasFnAttribute("use-sample-profile") &&
+ R->getValue() != &F &&
+ isLegalToPromote(*Candidate.CallInstr, R->getValue(), &Reason)) {
+ auto *DI =
+ &pgo::promoteIndirectCall(*Candidate.CallInstr, R->getValue(),
+ Candidate.CallsiteCount, Sum, false, ORE);
+ if (DI) {
+ Sum -= Candidate.CallsiteCount;
+ // Prorate the indirect callsite distribution.
+ // Do not update the promoted direct callsite distribution at this
+ // point since the original distribution combined with the callee
+ // profile will be used to prorate callsites from the callee if
+ // inlined. Once not inlined, the direct callsite distribution should
+ // be prorated so that it will reflect the real callsite counts.
+ setProbeDistributionFactor(*Candidate.CallInstr,
+ Candidate.CallsiteDistribution * Sum /
+ SumOrigin);
+ PromotedInsns.insert(Candidate.CallInstr);
+ Candidate.CallInstr = DI;
+ if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
+ bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
+ if (!Inlined) {
+ // Prorate the direct callsite distribution so that it reflects real
+ // callsite counts.
+ setProbeDistributionFactor(*DI, Candidate.CallsiteDistribution *
+ Candidate.CallsiteCount /
+ SumOrigin);
+ }
+ return Inlined;
+ }
}
- // Dummy record, we don't use it for replay.
- Advice->recordInlining();
- }
-
- Function *CalledFunction = CB.getCalledFunction();
- assert(CalledFunction);
- DebugLoc DLoc = CB.getDebugLoc();
- BasicBlock *BB = CB.getParent();
- InlineParams Params = getInlineParams();
- Params.ComputeFullInlineCost = true;
- // Checks if there is anything in the reachable portion of the callee at
- // this callsite that makes this inlining potentially illegal. Need to
- // set ComputeFullInlineCost, otherwise getInlineCost may return early
- // when cost exceeds threshold without checking all IRs in the callee.
- // The acutal cost does not matter because we only checks isNever() to
- // see if it is legal to inline the callsite.
- InlineCost Cost =
- getInlineCost(CB, Params, GetTTI(*CalledFunction), GetAC, GetTLI);
- if (Cost.isNever()) {
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
- << "incompatible inlining");
- return false;
- }
- InlineFunctionInfo IFI(nullptr, GetAC);
- if (InlineFunction(CB, IFI).isSuccess()) {
- // The call to InlineFunction erases I, so we can't pass it here.
- emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
- true, CSINLINE_DEBUG);
- return true;
+ } else {
+ LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
+ << Candidate.CalleeSamples->getFuncName() << " because "
+ << Reason << "\n");
}
return false;
}
@@ -1078,10 +1223,11 @@ bool SampleProfileLoader::inlineHotFunctions(
"ProfAccForSymsInList should be false when profile-sample-accurate "
"is enabled");
- DenseMap<CallBase *, const FunctionSamples *> localNotInlinedCallSites;
+ DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
bool Changed = false;
- while (true) {
- bool LocalChanged = false;
+ bool LocalChanged = true;
+ while (LocalChanged) {
+ LocalChanged = false;
SmallVector<CallBase *, 10> CIS;
for (auto &BB : F) {
bool Hot = false;
@@ -1095,7 +1241,7 @@ bool SampleProfileLoader::inlineHotFunctions(
"GUIDToFuncNameMap has to be populated");
AllCandidates.push_back(CB);
if (FS->getEntrySamples() > 0 || ProfileIsCS)
- localNotInlinedCallSites.try_emplace(CB, FS);
+ LocalNotInlinedCallSites.try_emplace(CB, FS);
if (callsiteIsHot(FS, PSI))
Hot = true;
else if (shouldInlineColdCallee(*CB))
@@ -1113,6 +1259,11 @@ bool SampleProfileLoader::inlineHotFunctions(
}
for (CallBase *I : CIS) {
Function *CalledFunction = I->getCalledFunction();
+ InlineCandidate Candidate = {
+ I,
+ LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
+ : nullptr,
+ 0 /* dummy count */, 1.0 /* dummy distribution factor */};
// Do not inline recursive calls.
if (CalledFunction == &F)
continue;
@@ -1121,6 +1272,7 @@ bool SampleProfileLoader::inlineHotFunctions(
continue;
uint64_t Sum;
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
+ uint64_t SumOrigin = Sum;
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
PSI->getOrCompHotCountThreshold());
@@ -1129,65 +1281,34 @@ bool SampleProfileLoader::inlineHotFunctions(
if (!callsiteIsHot(FS, PSI))
continue;
- const char *Reason = "Callee function not available";
- // R->getValue() != &F is to prevent promoting a recursive call.
- // If it is a recursive call, we do not inline it as it could bloat
- // the code exponentially. There is way to better handle this, e.g.
- // clone the caller first, and inline the cloned caller if it is
- // recursive. As llvm does not inline recursive calls, we will
- // simply ignore it instead of handling it explicitly.
- auto CalleeFunctionName = FS->getFuncName();
- auto R = SymbolMap.find(CalleeFunctionName);
- if (R != SymbolMap.end() && R->getValue() &&
- !R->getValue()->isDeclaration() &&
- R->getValue()->getSubprogram() &&
- R->getValue()->hasFnAttribute("use-sample-profile") &&
- R->getValue() != &F &&
- isLegalToPromote(*I, R->getValue(), &Reason)) {
- uint64_t C = FS->getEntrySamples();
- auto &DI =
- pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE);
- Sum -= C;
- PromotedInsns.insert(I);
- // If profile mismatches, we should not attempt to inline DI.
- if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
- inlineCallInstruction(cast<CallBase>(DI))) {
- if (ProfileIsCS)
- ContextTracker->markContextSamplesInlined(FS);
- localNotInlinedCallSites.erase(I);
- LocalChanged = true;
- ++NumCSInlined;
- }
- } else {
- LLVM_DEBUG(dbgs()
- << "\nFailed to promote indirect call to "
- << CalleeFunctionName << " because " << Reason << "\n");
+ Candidate = {I, FS, FS->getEntrySamples(), 1.0};
+ if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
+ PromotedInsns)) {
+ LocalNotInlinedCallSites.erase(I);
+ LocalChanged = true;
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
!CalledFunction->isDeclaration()) {
- if (inlineCallInstruction(*I)) {
- if (ProfileIsCS)
- ContextTracker->markContextSamplesInlined(
- localNotInlinedCallSites[I]);
- localNotInlinedCallSites.erase(I);
+ if (tryInlineCandidate(Candidate)) {
+ LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
- ++NumCSInlined;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
}
}
- if (LocalChanged) {
- Changed = true;
- } else {
- break;
- }
+ Changed |= LocalChanged;
}
+ // For CS profile, the profile for a not-inlined context will be merged
+ // when the base profile is being retrieved.
+ if (ProfileIsCS)
+ return Changed;
+
// Accumulate not inlined callsite information into notInlinedSamples
- for (const auto &Pair : localNotInlinedCallSites) {
+ for (const auto &Pair : LocalNotInlinedCallSites) {
CallBase *I = Pair.getFirst();
Function *Callee = I->getCalledFunction();
if (!Callee || Callee->isDeclaration())
@@ -1232,6 +1353,266 @@ bool SampleProfileLoader::inlineHotFunctions(
return Changed;
}
+bool SampleProfileLoader::tryInlineCandidate(
+ InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
+
+ CallBase &CB = *Candidate.CallInstr;
+ Function *CalledFunction = CB.getCalledFunction();
+ assert(CalledFunction && "Expect a callee with definition");
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *BB = CB.getParent();
+
+ InlineCost Cost = shouldInlineCandidate(Candidate);
+ if (Cost.isNever()) {
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
+ << "incompatible inlining");
+ return false;
+ }
+
+ if (!Cost)
+ return false;
+
+ InlineFunctionInfo IFI(nullptr, GetAC);
+ if (InlineFunction(CB, IFI).isSuccess()) {
+ // The call to InlineFunction erases I, so we can't pass it here.
+ emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
+ true, CSINLINE_DEBUG);
+
+ // Now populate the list of newly exposed call sites.
+ if (InlinedCallSites) {
+ InlinedCallSites->clear();
+ for (auto &I : IFI.InlinedCallSites)
+ InlinedCallSites->push_back(I);
+ }
+
+ if (ProfileIsCS)
+ ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
+ ++NumCSInlined;
+
+ // Prorate inlined probes for a duplicated inlining callsite which probably
+ // has a distribution less than 100%. Samples for an inlinee should be
+ // distributed among the copies of the original callsite based on each
+ // callsite's distribution factor for counts accuracy. Note that an inlined
+ // probe may come with its own distribution factor if it has been duplicated
+ // in the inlinee body. The two factors are multiplied to reflect the
+ // aggregation of duplication.
+ if (Candidate.CallsiteDistribution < 1) {
+ for (auto &I : IFI.InlinedCallSites) {
+ if (Optional<PseudoProbe> Probe = extractProbe(*I))
+ setProbeDistributionFactor(*I, Probe->Factor *
+ Candidate.CallsiteDistribution);
+ }
+ NumDuplicatedInlinesite++;
+ }
+
+ return true;
+ }
+ return false;
+}
+
+bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
+ CallBase *CB) {
+ assert(CB && "Expect non-null call instruction");
+
+ if (isa<IntrinsicInst>(CB))
+ return false;
+
+ // Find the callee's profile. For indirect call, find hottest target profile.
+ const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
+ if (!CalleeSamples)
+ return false;
+
+ float Factor = 1.0;
+ if (Optional<PseudoProbe> Probe = extractProbe(*CB))
+ Factor = Probe->Factor;
+
+ uint64_t CallsiteCount = 0;
+ ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
+ if (Weight)
+ CallsiteCount = Weight.get();
+ if (CalleeSamples)
+ CallsiteCount = std::max(
+ CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor));
+
+ *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
+ return true;
+}
+
+InlineCost
+SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
+ std::unique_ptr<InlineAdvice> Advice = nullptr;
+ if (ExternalInlineAdvisor) {
+ Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ return InlineCost::getNever("not previously inlined");
+ }
+ Advice->recordInlining();
+ return InlineCost::getAlways("previously inlined");
+ }
+
+ // Adjust threshold based on call site hotness. Only do this for the
+ // callsite prioritized inliner; otherwise the cost-benefit check is done
+ // earlier.
+ int SampleThreshold = SampleColdCallSiteThreshold;
+ if (CallsitePrioritizedInline) {
+ if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
+ SampleThreshold = SampleHotCallSiteThreshold;
+ else if (!ProfileSizeInline)
+ return InlineCost::getNever("cold callsite");
+ }
+
+ Function *Callee = Candidate.CallInstr->getCalledFunction();
+ assert(Callee && "Expect a definition for inline candidate of direct call");
+
+ InlineParams Params = getInlineParams();
+ Params.ComputeFullInlineCost = true;
+ // Checks if there is anything in the reachable portion of the callee at
+ // this callsite that makes this inlining potentially illegal. Need to
+ // set ComputeFullInlineCost, otherwise getInlineCost may return early
+ // when cost exceeds threshold without checking all IRs in the callee.
+ // The actual cost does not matter because we only check isNever() to
+ // see if it is legal to inline the callsite.
+ InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
+ GetTTI(*Callee), GetAC, GetTLI);
+
+ // Honor always inline and never inline from call analyzer
+ if (Cost.isNever() || Cost.isAlways())
+ return Cost;
+
+ // For old FDO inliner, we inline the call site as long as cost is not
+ // "Never". The cost-benefit check is done earlier.
+ if (!CallsitePrioritizedInline) {
+ return InlineCost::get(Cost.getCost(), INT_MAX);
+ }
+
+ // Otherwise only use the cost from call analyzer, but overwrite the
+ // threshold with the
+ // Sample PGO threshold.
+ return InlineCost::get(Cost.getCost(), SampleThreshold);
+}
+
+bool SampleProfileLoader::inlineHotFunctionsWithPriority(
+ Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
+ assert(ProfileIsCS && "Priority-based inliner only works with CSSPGO now");
+
+ // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
+ // Profile symbol list is ignored when profile-sample-accurate is on.
+ assert((!ProfAccForSymsInList ||
+ (!ProfileSampleAccurate &&
+ !F.hasFnAttribute("profile-sample-accurate"))) &&
+ "ProfAccForSymsInList should be false when profile-sample-accurate "
+ "is enabled");
+
+ // Populate the worklist with initial call sites from the root inliner, along
+ // with call site weights.
+ CandidateQueue CQueue;
+ InlineCandidate NewCandidate;
+ for (auto &BB : F) {
+ for (auto &I : BB.getInstList()) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.push(NewCandidate);
+ }
+ }
+
+ // Cap the size growth from profile guided inlining. This is needed even
+ // though the cost of each inline candidate already accounts for callee
+ // size, because with top-down inlining, we can grow the inliner size
+ // significantly with a large number of smaller inlinees that each pass the
+ // cost check.
+ assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
+ "Max inline size limit should not be smaller than min inline size "
+ "limit.");
+ unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
+ SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
+ SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
+ if (ExternalInlineAdvisor)
+ SizeLimit = std::numeric_limits<unsigned>::max();
+
+ // Perform iterative BFS call site prioritized inlining
+ bool Changed = false;
+ while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
+ InlineCandidate Candidate = CQueue.top();
+ CQueue.pop();
+ CallBase *I = Candidate.CallInstr;
+ Function *CalledFunction = I->getCalledFunction();
+
+ if (CalledFunction == &F)
+ continue;
+ if (I->isIndirectCall()) {
+ if (PromotedInsns.count(I))
+ continue;
+ uint64_t Sum;
+ auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
+ uint64_t SumOrigin = Sum;
+ Sum *= Candidate.CallsiteDistribution;
+ for (const auto *FS : CalleeSamples) {
+ // TODO: Consider disabling pre-LTO ICP for MonoLTO as well
+ if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+ FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+ PSI->getOrCompHotCountThreshold());
+ continue;
+ }
+ uint64_t EntryCountDistributed =
+ FS->getEntrySamples() * Candidate.CallsiteDistribution;
+ // In addition to the regular inline cost check, we also need to make sure
+ // ICP isn't introducing excessive speculative checks even if an individual
+ // target looks beneficial to promote and inline. That means we should
+ // only do ICP when there is a small number of dominant targets.
+ if (EntryCountDistributed < SumOrigin / ProfileICPThreshold)
+ break;
+ // TODO: Fix CallAnalyzer to handle all indirect calls.
+ // For indirect call, we don't run CallAnalyzer to get InlineCost
+ // before actual inlining. This is because we could see two different
+ // types from the same definition, which makes CallAnalyzer choke as
+ // it's expecting matching parameter type on both caller and callee
+ // side. See example from PR18962 for the triggering cases (the bug was
+ // fixed, but we generate different types).
+ if (!PSI->isHotCount(EntryCountDistributed))
+ break;
+ SmallVector<CallBase *, 8> InlinedCallSites;
+ // Attach function profile for promoted indirect callee, and update
+ // call site count for the promoted inline candidate too.
+ Candidate = {I, FS, EntryCountDistributed,
+ Candidate.CallsiteDistribution};
+ if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
+ PromotedInsns, &InlinedCallSites)) {
+ for (auto *CB : InlinedCallSites) {
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.emplace(NewCandidate);
+ }
+ Changed = true;
+ }
+ }
+ } else if (CalledFunction && CalledFunction->getSubprogram() &&
+ !CalledFunction->isDeclaration()) {
+ SmallVector<CallBase *, 8> InlinedCallSites;
+ if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
+ for (auto *CB : InlinedCallSites) {
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.emplace(NewCandidate);
+ }
+ Changed = true;
+ }
+ } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+ findCalleeFunctionSamples(*I)->findInlinedFunctions(
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
+ }
+ }
+
+ if (!CQueue.empty()) {
+ if (SizeLimit == (unsigned)ProfileInlineLimitMax)
+ ++NumCSInlinedHitMaxLimit;
+ else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
+ ++NumCSInlinedHitMinLimit;
+ else
+ ++NumCSInlinedHitGrowthLimit;
+ }
+
+ return Changed;
+}
+
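
The cap computed in inlineHotFunctionsWithPriority is the function's instruction count scaled by the growth ratio, clamped into [min, max]. Spelled out with the option defaults (12, 100, 10000):

    #include <algorithm>

    unsigned inlineSizeLimit(unsigned FuncInstCount) {
      unsigned Limit = FuncInstCount * 12; // ProfileInlineGrowthLimit default
      Limit = std::min(Limit, 10000u);     // ProfileInlineLimitMax default
      Limit = std::max(Limit, 100u);       // ProfileInlineLimitMin default
      return Limit;
    }
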
/// Find equivalence classes for the given block.
///
/// This finds all the blocks that are guaranteed to execute the same
@@ -1654,6 +2035,14 @@ void SampleProfileLoader::propagateWeights(Function &F) {
auto T = FS->findCallTargetMapAt(CallSite);
if (!T || T.get().empty())
continue;
+ // Prorate the callsite counts to reflect what is already done to the
+ // callsite, such as ICP or callsite cloning.
+ if (FunctionSamples::ProfileIsProbeBased) {
+ if (Optional<PseudoProbe> Probe = extractProbe(I)) {
+ if (Probe->Factor < 1)
+ T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
+ }
+ }
SmallVector<InstrProfValueData, 2> SortedCallTargets =
GetSortedValueDataFromCallTargets(T.get());
uint64_t Sum;
@@ -1833,7 +2222,10 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
}
DenseSet<GlobalValue::GUID> InlinedGUIDs;
- Changed |= inlineHotFunctions(F, InlinedGUIDs);
+ if (ProfileIsCS && CallsitePrioritizedInline)
+ Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
+ else
+ Changed |= inlineHotFunctions(F, InlinedGUIDs);
// Compute basic block weights.
Changed |= computeBlockWeights(F);
@@ -1898,6 +2290,45 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
+// Add inlined profile call edges to the call graph.
+void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
+ const FunctionSamples &Samples) {
+ Function *Caller = SymbolMap.lookup(Samples.getFuncName());
+ if (!Caller || Caller->isDeclaration())
+ return;
+
+ // Skip non-inlined call edges, which are not important since top-down inlining
+ // for non-CS profile is to get more precise profile matching, not to enable
+ // more inlining.
+
+ for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
+ for (const auto &InlinedSamples : CallsiteSamples.second) {
+ Function *Callee = SymbolMap.lookup(InlinedSamples.first);
+ if (Callee && !Callee->isDeclaration())
+ CG[Caller]->addCalledFunction(nullptr, CG[Callee]);
+ addCallGraphEdges(CG, InlinedSamples.second);
+ }
+ }
+}
+
+// Replace call graph edges with dynamic call edges from the profile.
+void SampleProfileLoader::replaceCallGraphEdges(
+ CallGraph &CG, StringMap<Function *> &SymbolMap) {
+ // Remove static call edges from the call graph except for the ones from the
+ // root which make the call graph connected.
+ for (const auto &Node : CG)
+ if (Node.second.get() != CG.getExternalCallingNode())
+ Node.second->removeAllCalledFunctions();
+
+ // Add profile call edges to the call graph.
+ if (ProfileIsCS) {
+ ContextTracker->addCallGraphEdges(CG, SymbolMap);
+ } else {
+ for (const auto &Samples : Reader->getProfiles())
+ addCallGraphEdges(CG, Samples.second);
+ }
+}
+
std::vector<Function *>
SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
std::vector<Function *> FunctionOrderList;
@@ -1920,16 +2351,97 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
}
assert(&CG->getModule() == &M);
+
+ // Add indirect call edges from profile to augment the static call graph.
+ // Functions will be processed in a top-down order defined by the static call
+ // graph. Adjusting the order by considering indirect call edges from the
+ // profile (which don't exist in the static call graph) can enable the
+ // inlining of indirect call targets by processing the caller before them.
+ // TODO: enable this for non-CS profile and fix the counts returning logic to
+ // have a full support for indirect calls.
+ if (UseProfileIndirectCallEdges && ProfileIsCS) {
+ for (auto &Entry : *CG) {
+ const auto *F = Entry.first;
+ if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
+ continue;
+ auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName());
+ if (AllContexts.empty())
+ continue;
+
+ for (const auto &BB : *F) {
+ for (const auto &I : BB.getInstList()) {
+ const auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB || !CB->isIndirectCall())
+ continue;
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc);
+ for (FunctionSamples *Samples : AllContexts) {
+ if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) {
+ for (const auto &Target : CallTargets.get()) {
+ Function *Callee = SymbolMap.lookup(Target.first());
+ if (Callee && !Callee->isDeclaration())
+ Entry.second->addCalledFunction(nullptr, (*CG)[Callee]);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Compute a top-down order from the profile, which is used to sort functions in
+ // one SCC later. The static processing order computed for an SCC may not
+ // reflect the call contexts in the context-sensitive profile, thus may cause
+ // potential inlining to be overlooked. The function order in one SCC is being
+ // adjusted to a top-down order based on the profile to favor more inlining.
+ DenseMap<Function *, uint64_t> ProfileOrderMap;
+ if (UseProfileTopDownOrder ||
+ (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) {
+ // Create a static call graph. The call edges are not important since they
+ // will be replaced by dynamic edges from the profile.
+ CallGraph ProfileCG(M);
+ replaceCallGraphEdges(ProfileCG, SymbolMap);
+ scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG);
+ uint64_t I = 0;
+ while (!CGI.isAtEnd()) {
+ for (CallGraphNode *Node : *CGI) {
+ if (auto *F = Node->getFunction())
+ ProfileOrderMap[F] = ++I;
+ }
+ ++CGI;
+ }
+ }
+
scc_iterator<CallGraph *> CGI = scc_begin(CG);
while (!CGI.isAtEnd()) {
- for (CallGraphNode *node : *CGI) {
- auto F = node->getFunction();
+ uint64_t Start = FunctionOrderList.size();
+ for (CallGraphNode *Node : *CGI) {
+ auto *F = Node->getFunction();
if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(F);
}
+
+ // Sort nodes in SCC based on the profile top-down order.
+ if (!ProfileOrderMap.empty()) {
+ std::stable_sort(FunctionOrderList.begin() + Start,
+ FunctionOrderList.end(),
+ [&ProfileOrderMap](Function *Left, Function *Right) {
+ return ProfileOrderMap[Left] < ProfileOrderMap[Right];
+ });
+ }
+
++CGI;
}
+ LLVM_DEBUG({
+ dbgs() << "Function processing order:\n";
+ for (auto F : reverse(FunctionOrderList)) {
+ dbgs() << F->getName() << "\n";
+ }
+ });
+
std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
return FunctionOrderList;
}
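
Each SCC's slice of the worklist is stable-sorted by the rank computed from the profile call graph, leaving earlier SCCs in place. The core pattern with hypothetical stand-in data:

    #include <algorithm>
    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    void sortSCCByProfileOrder(std::vector<std::string> &Order, size_t SCCStart,
                               std::map<std::string, uint64_t> &Rank) {
      // operator[] default-inserts rank 0, so functions absent from the
      // profile order sort ahead of ranked ones, as with the DenseMap above.
      std::stable_sort(Order.begin() + SCCStart, Order.end(),
                       [&Rank](const std::string &L, const std::string &R) {
                         return Rank[L] < Rank[R];
                       });
    }
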
@@ -1978,6 +2490,12 @@ bool SampleProfileLoader::doInitialization(Module &M,
ProfileIsCS = true;
FunctionSamples::ProfileIsCS = true;
+ // Enable the priority-based inliner and size inlining by default for CSSPGO.
+ if (!ProfileSizeInline.getNumOccurrences())
+ ProfileSizeInline = true;
+ if (!CallsitePrioritizedInline.getNumOccurrences())
+ CallsitePrioritizedInline = true;
+
// Tracker for profiles under different context
ContextTracker =
std::make_unique<SampleContextTracker>(Reader->getProfiles());
@@ -2075,6 +2593,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
}
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
+ LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
DILocation2SampleMap.clear();
// By default the entry count is initialized to -1, which will be treated
// conservatively by getEntryCount as the same as unknown (None). This is
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 7cecd20b78d8..a885c3ee4ded 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -12,6 +12,7 @@
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -25,8 +26,10 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/CRC.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <unordered_set>
#include <vector>
using namespace llvm;
@@ -35,6 +38,115 @@ using namespace llvm;
STATISTIC(ArtificialDbgLine,
"Number of probes that have an artificial debug line");
+static cl::opt<bool>
+ VerifyPseudoProbe("verify-pseudo-probe", cl::init(false), cl::Hidden,
+ cl::desc("Do pseudo probe verification"));
+
+static cl::list<std::string> VerifyPseudoProbeFuncList(
+ "verify-pseudo-probe-funcs", cl::Hidden,
+ cl::desc("The option to specify the name of the functions to verify."));
+
+static cl::opt<bool>
+ UpdatePseudoProbe("update-pseudo-probe", cl::init(true), cl::Hidden,
+ cl::desc("Update pseudo probe distribution factor"));
+
+bool PseudoProbeVerifier::shouldVerifyFunction(const Function *F) {
+ // Skip function declaration.
+ if (F->isDeclaration())
+ return false;
+ // Skip functions that will not be emitted into the object file. The
+ // prevailing definition will be verified instead.
+ if (F->hasAvailableExternallyLinkage())
+ return false;
+ // Do a name matching.
+ static std::unordered_set<std::string> VerifyFuncNames(
+ VerifyPseudoProbeFuncList.begin(), VerifyPseudoProbeFuncList.end());
+ return VerifyFuncNames.empty() || VerifyFuncNames.count(F->getName().str());
+}
+
+void PseudoProbeVerifier::registerCallbacks(PassInstrumentationCallbacks &PIC) {
+ if (VerifyPseudoProbe) {
+ PIC.registerAfterPassCallback(
+ [this](StringRef P, Any IR, const PreservedAnalyses &) {
+ this->runAfterPass(P, IR);
+ });
+ }
+}
+
+// Callback to run after each transformation for the new pass manager.
+void PseudoProbeVerifier::runAfterPass(StringRef PassID, Any IR) {
+ std::string Banner =
+ "\n*** Pseudo Probe Verification After " + PassID.str() + " ***\n";
+ dbgs() << Banner;
+ if (any_isa<const Module *>(IR))
+ runAfterPass(any_cast<const Module *>(IR));
+ else if (any_isa<const Function *>(IR))
+ runAfterPass(any_cast<const Function *>(IR));
+ else if (any_isa<const LazyCallGraph::SCC *>(IR))
+ runAfterPass(any_cast<const LazyCallGraph::SCC *>(IR));
+ else if (any_isa<const Loop *>(IR))
+ runAfterPass(any_cast<const Loop *>(IR));
+ else
+ llvm_unreachable("Unknown IR unit");
+}
+
+void PseudoProbeVerifier::runAfterPass(const Module *M) {
+ for (const Function &F : *M)
+ runAfterPass(&F);
+}
+
+void PseudoProbeVerifier::runAfterPass(const LazyCallGraph::SCC *C) {
+ for (const LazyCallGraph::Node &N : *C)
+ runAfterPass(&N.getFunction());
+}
+
+void PseudoProbeVerifier::runAfterPass(const Function *F) {
+ if (!shouldVerifyFunction(F))
+ return;
+ ProbeFactorMap ProbeFactors;
+ for (const auto &BB : *F)
+ collectProbeFactors(&BB, ProbeFactors);
+ verifyProbeFactors(F, ProbeFactors);
+}
+
+void PseudoProbeVerifier::runAfterPass(const Loop *L) {
+ const Function *F = L->getHeader()->getParent();
+ runAfterPass(F);
+}
+
+void PseudoProbeVerifier::collectProbeFactors(const BasicBlock *Block,
+ ProbeFactorMap &ProbeFactors) {
+ for (const auto &I : *Block) {
+ if (Optional<PseudoProbe> Probe = extractProbe(I))
+ ProbeFactors[Probe->Id] += Probe->Factor;
+ }
+}
+
+void PseudoProbeVerifier::verifyProbeFactors(
+ const Function *F, const ProbeFactorMap &ProbeFactors) {
+ bool BannerPrinted = false;
+ auto &PrevProbeFactors = FunctionProbeFactors[F->getName()];
+ for (const auto &I : ProbeFactors) {
+ float CurProbeFactor = I.second;
+ if (PrevProbeFactors.count(I.first)) {
+ float PrevProbeFactor = PrevProbeFactors[I.first];
+ if (std::abs(CurProbeFactor - PrevProbeFactor) >
+ DistributionFactorVariance) {
+ if (!BannerPrinted) {
+ dbgs() << "Function " << F->getName() << ":\n";
+ BannerPrinted = true;
+ }
+ dbgs() << "Probe " << I.first << "\tprevious factor "
+ << format("%0.2f", PrevProbeFactor) << "\tcurrent factor "
+ << format("%0.2f", CurProbeFactor) << "\n";
+ }
+ }
+
+ // Update the recorded factor for the next round of verification.
+ PrevProbeFactors[I.first] = I.second;
+ }
+}
+
PseudoProbeManager::PseudoProbeManager(const Module &M) {
if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) {
for (const auto *Operand : FuncInfo->operands()) {
@@ -201,7 +313,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
Function *ProbeFn =
llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe);
Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index),
- Builder.getInt32(0)};
+ Builder.getInt32(0),
+ Builder.getInt64(PseudoProbeFullDistributionFactor)};
auto *Probe = Builder.CreateCall(ProbeFn, Args);
AssignDebugLoc(Probe);
}
@@ -219,7 +332,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
// Leverage the 32-bit discriminator field of debug data to store the ID and
// type of a callsite probe. This gets rid of the dependency on plumbing a
// customized metadata through the codegen pipeline.
- uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(Index, Type);
+ uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(
+ Index, Type, 0, PseudoProbeDwarfDiscriminator::FullDistributionFactor);
if (auto DIL = Call->getDebugLoc()) {
DIL = DIL->cloneWithDiscriminator(V);
Call->setDebugLoc(DIL);
@@ -274,3 +388,47 @@ PreservedAnalyses SampleProfileProbePass::run(Module &M,
return PreservedAnalyses::none();
}
+
+void PseudoProbeUpdatePass::runOnFunction(Function &F,
+ FunctionAnalysisManager &FAM) {
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto BBProfileCount = [&BFI](BasicBlock *BB) {
+ return BFI.getBlockProfileCount(BB)
+ ? BFI.getBlockProfileCount(BB).getValue()
+ : 0;
+ };
+
+ // Collect the sum of execution weight for each probe.
+ ProbeFactorMap ProbeFactors;
+ for (auto &Block : F) {
+ for (auto &I : Block) {
+ if (Optional<PseudoProbe> Probe = extractProbe(I))
+ ProbeFactors[Probe->Id] += BBProfileCount(&Block);
+ }
+ }
+
+ // Fix up over-counted probes.
+ for (auto &Block : F) {
+ for (auto &I : Block) {
+ if (Optional<PseudoProbe> Probe = extractProbe(I)) {
+ float Sum = ProbeFactors[Probe->Id];
+ if (Sum != 0)
+ setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum);
+ }
+ }
+ }
+}
+
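
The fix-up above assigns each duplicated probe copy the fraction of the total execution weight its block carries, so the copies together account for the original samples exactly once. With hypothetical block counts of 300 and 100, the factors come out to 0.75 and 0.25:

    #include <cstdint>

    float probeFactor(uint64_t BlockCount, uint64_t SumAcrossCopies) {
      // The pass leaves the factor untouched when the sum is zero; 0.0f here
      // is just a placeholder for the sketch.
      return SumAcrossCopies ? static_cast<float>(BlockCount) / SumAcrossCopies
                             : 0.0f;
    }
    // probeFactor(300, 400) == 0.75f; probeFactor(100, 400) == 0.25f
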
+PreservedAnalyses PseudoProbeUpdatePass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (UpdatePseudoProbe) {
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ runOnFunction(F, FAM);
+ }
+ }
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0b53007bb6dc..07e68c44416d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1270,6 +1270,7 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
(transformZExtICmp(LHS, CI, false) ||
transformZExtICmp(RHS, CI, false))) {
// zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index d687ec654438..b211b0813611 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
for (++BBI; BBI != E; ++BBI)
- if (BBI->mayWriteToMemory())
+ if (BBI->mayWriteToMemory()) {
+ // Calls that only access inaccessible memory do not block sinking the
+ // load.
+ if (auto *CB = dyn_cast<CallBase>(BBI))
+ if (CB->onlyAccessesInaccessibleMemory())
+ continue;
return false;
+ }
// Check for non-address taken alloca. If not address-taken already, it isn't
// profitable to do this xform.
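
Extracted for illustration (a paraphrase of the guard added above, not a separate API): a write that can only touch inaccessible memory, such as a runtime bookkeeping call, cannot clobber the loaded location, so it no longer blocks sinking the load.

    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Instruction.h"

    static bool blocksLoadSinking(const llvm::Instruction &I) {
      if (!I.mayWriteToMemory())
        return false;
      if (const auto *CB = llvm::dyn_cast<llvm::CallBase>(&I))
        if (CB->onlyAccessesInaccessibleMemory())
          return false; // cannot alias the loaded address
      return true;
    }
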
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index c265516213aa..16efe863779a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -345,10 +345,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return false;
// Get the constant out of the ICmp, if there is one.
+ // Only try this when exactly 1 operand is a constant (if both operands
+ // are constant, the icmp should eventually simplify). Otherwise, we may
+ // invert the transform that reduces set bits and infinite-loop.
+ Value *X;
const APInt *CmpC;
ICmpInst::Predicate Pred;
- if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) ||
- CmpC->getBitWidth() != SelC->getBitWidth())
+ if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
+ isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
// If the constant is already the same as the ICmp, leave it as-is.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 518e909e8ab4..828fd49524ec 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3878,9 +3878,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
}
}
- // Skip processing debug intrinsics in InstCombine. Processing these call instructions
- // consumes non-trivial amount of time and provides no value for the optimization.
- if (!isa<DbgInfoIntrinsic>(Inst)) {
+ // Skip processing debug and pseudo intrinsics in InstCombine. Processing
+    // these call instructions consumes a non-trivial amount of time and
+ // provides no value for the optimization.
+ if (!Inst->isDebugOrPseudoInst()) {
InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp
index 2b649732a799..ce4e5e575fbf 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -325,7 +325,7 @@ void AggressiveDeadCodeElimination::initialize() {
bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
// TODO -- use llvm::isInstructionTriviallyDead
- if (I.isEHPad() || I.mayHaveSideEffects()) {
+ if (I.isEHPad() || I.mayHaveSideEffects() || !I.willReturn()) {
// Skip any value profile instrumentation calls if they are
// instrumenting constants.
if (isInstrumentsConstant(I))
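
Keying liveness on willReturn() matters because an instruction can be free of side effects and still be observable by never returning. A source-level illustration (ours, not from the patch):

    // spin() writes nothing and its result is unused, but it diverges.
    // A DCE that only asks mayHaveSideEffects() could delete the call and
    // make the caller terminate, changing behavior; the willReturn() check
    // above keeps the call alive.
    static int spin() {
      volatile bool KeepGoing = true; // volatile read: a legal way to spin
      while (KeepGoing) {
      }
      return 0;
    }

    void caller() { spin(); }
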
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 96aef90c1c1a..10b08b4e2224 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2076,6 +2076,15 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
ValueMapping[PN] = NewPN;
}
+ // Clone noalias scope declarations in the threaded block. When threading a
+  // loop exit, we would otherwise end up with two identical scope declarations
+ // visible at the same time.
+ SmallVector<MDNode *> NoAliasScopes;
+ DenseMap<MDNode *, MDNode *> ClonedScopes;
+ LLVMContext &Context = PredBB->getContext();
+ identifyNoAliasScopesToClone(BI, BE, NoAliasScopes);
+ cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
+
// Clone the non-phi instructions of the source basic block into NewBB,
// keeping track of the mapping and using it to remap operands in the cloned
// instructions.
@@ -2084,6 +2093,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
New->setName(BI->getName());
NewBB->getInstList().push_back(New);
ValueMapping[&*BI] = New;
+ adaptNoAliasScopes(New, ClonedScopes, Context);
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
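
Duplicated scope declarations are a correctness issue, not a cosmetic one: if the original block and its threaded clone declare the same noalias scope, alias analysis may treat accesses from the two copies as provably non-aliasing when they can in fact overlap. The three utilities used above are declared in llvm/Transforms/Utils/Cloning.h; a condensed sketch of the pattern (the wrapper function is ours):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    using namespace llvm;

    // Give an instruction cloned out of [BI, BE) fresh noalias scopes so the
    // original and the clone are never taken to be the same scope domain.
    static void refreshClonedScopes(BasicBlock::iterator BI,
                                    BasicBlock::iterator BE,
                                    Instruction *New, LLVMContext &Context) {
      SmallVector<MDNode *> NoAliasScopes;
      DenseMap<MDNode *, MDNode *> ClonedScopes;
      identifyNoAliasScopesToClone(BI, BE, NoAliasScopes);
      cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
      adaptNoAliasScopes(New, ClonedScopes, Context);
    }
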
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 18717394d384..822a786fc7c7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1114,12 +1114,16 @@ void LoopUnswitch::emitPreheaderBranchOnCondition(
Loop *L = LI->getLoopFor(I->getParent());
auto *DefiningAccess = MemA->getDefiningAccess();
- // If the defining access is a MemoryPhi in the header, get the incoming
- // value for the pre-header as defining access.
- if (DefiningAccess->getBlock() == I->getParent()) {
+ // Get the first defining access before the loop.
+ while (L->contains(DefiningAccess->getBlock())) {
+ // If the defining access is a MemoryPhi, get the incoming
+ // value for the pre-header as defining access.
if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
DefiningAccess =
MemPhi->getIncomingValueForBlock(L->getLoopPreheader());
+ } else {
+ DefiningAccess =
+ cast<MemoryDef>(DefiningAccess)->getDefiningAccess();
}
}
MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(),
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
index d111a6ba4241..af510f1a84bf 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2524,7 +2524,7 @@ private:
NewAI.getAlign(), LI.isVolatile(),
LI.getName());
if (AATags)
- NewLI->setAAMetadata(AATags);
+ NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
if (NewLI->isAtomic())
@@ -2563,7 +2563,7 @@ private:
IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(), LI.isVolatile(), LI.getName());
if (AATags)
- NewLI->setAAMetadata(AATags);
+ NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2626,7 +2626,7 @@ private:
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
if (AATags)
- Store->setAAMetadata(AATags);
+ Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
@@ -2650,7 +2650,7 @@ private:
Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
if (AATags)
- Store->setAAMetadata(AATags);
+ Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
@@ -2720,7 +2720,7 @@ private:
NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
if (AATags)
- NewSI->setAAMetadata(AATags);
+ NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
if (NewSI->isAtomic())
@@ -2816,7 +2816,7 @@ private:
getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
MaybeAlign(getSliceAlign()), II.isVolatile());
if (AATags)
- New->setAAMetadata(AATags);
+ New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
@@ -2885,7 +2885,7 @@ private:
StoreInst *New =
IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
if (AATags)
- New->setAAMetadata(AATags);
+ New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return !II.isVolatile();
}
@@ -3006,7 +3006,7 @@ private:
CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
Size, II.isVolatile());
if (AATags)
- New->setAAMetadata(AATags);
+ New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
@@ -3060,7 +3060,7 @@ private:
LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
II.isVolatile(), "copyload");
if (AATags)
- Load->setAAMetadata(AATags);
+ Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Src = Load;
}
@@ -3080,7 +3080,7 @@ private:
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
if (AATags)
- Store->setAAMetadata(AATags);
+ Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return !II.isVolatile();
}
@@ -3381,8 +3381,13 @@ private:
IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
LoadInst *Load =
IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
- if (AATags)
- Load->setAAMetadata(AATags);
+
+ APInt Offset(
+ DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
+ if (AATags &&
+ GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
+ Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
+
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
}
@@ -3428,8 +3433,13 @@ private:
IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
StoreInst *Store =
IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
- if (AATags)
- Store->setAAMetadata(AATags);
+
+ APInt Offset(
+ DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
+ if (AATags &&
+ GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
+ Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
+
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
}
};
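
The recurring AATags.shift(NewBeginOffset - BeginOffset) matters mostly for !tbaa.struct metadata, whose entries carry byte offsets relative to the start of the original access: once SROA rewrites the access to cover only a slice, those offsets must be rebased or they would describe the wrong fields. A plain C++ model of the bookkeeping (our types; it assumes shifting rebases offsets by subtracting the slice offset, and that all fields lie inside the slice):

    #include <cstdint>
    #include <vector>

    // Model of a !tbaa.struct entry: byte offset + size + an opaque tag.
    struct TBAAStructField {
      uint64_t Offset, Size;
      unsigned Tag;
    };

    // Rebase every field offset onto an access that starts SliceOffset
    // bytes into the original one.
    std::vector<TBAAStructField>
    shiftFields(std::vector<TBAAStructField> Fields, uint64_t SliceOffset) {
      for (TBAAStructField &F : Fields)
        F.Offset -= SliceOffset;
      return Fields;
    }
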
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index f4afa3ad4623..dba5403f272a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -170,16 +170,6 @@ static bool setRetAndArgsNoUndef(Function &F) {
return setRetNoUndef(F) | setArgsNoUndef(F);
}
-static bool setRetNonNull(Function &F) {
- assert(F.getReturnType()->isPointerTy() &&
- "nonnull applies only to pointers");
- if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull))
- return false;
- F.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- ++NumNonNull;
- return true;
-}
-
static bool setReturnedArg(Function &F, unsigned ArgNo) {
if (F.hasParamAttribute(ArgNo, Attribute::Returned))
return false;
@@ -1005,63 +995,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc_ZdlPvRKSt9nothrow_t: // delete(void*, nothrow)
- case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t: // delete(void*, align_val_t, nothrow)
- case LibFunc_ZdaPvRKSt9nothrow_t: // delete[](void*, nothrow)
- case LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t: // delete[](void*, align_val_t, nothrow)
- Changed |= setDoesNotThrow(F);
- LLVM_FALLTHROUGH;
- case LibFunc_ZdlPv: // delete(void*)
- case LibFunc_ZdlPvj: // delete(void*, unsigned int)
- case LibFunc_ZdlPvm: // delete(void*, unsigned long)
- case LibFunc_ZdaPv: // delete[](void*)
- case LibFunc_ZdaPvj: // delete[](void*, unsigned int)
- case LibFunc_ZdaPvm: // delete[](void*, unsigned long)
- case LibFunc_ZdlPvSt11align_val_t: // delete(void*, align_val_t)
- case LibFunc_ZdlPvjSt11align_val_t: // delete(void*, unsigned int, align_val_t)
- case LibFunc_ZdlPvmSt11align_val_t: // delete(void*, unsigned long, align_val_t)
- case LibFunc_ZdaPvSt11align_val_t: // delete[](void*, align_val_t)
- case LibFunc_ZdaPvjSt11align_val_t: // delete[](void*, unsigned int, align_val_t)
- case LibFunc_ZdaPvmSt11align_val_t: // delete[](void*, unsigned long, align_val_t);
- Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
- Changed |= setArgsNoUndef(F);
- Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_ZnwjRKSt9nothrow_t: // new(unsigned int, nothrow)
- case LibFunc_ZnwmRKSt9nothrow_t: // new(unsigned long, nothrow)
- case LibFunc_ZnajRKSt9nothrow_t: // new[](unsigned int, nothrow)
- case LibFunc_ZnamRKSt9nothrow_t: // new[](unsigned long, nothrow)
- case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t: // new(unsigned int, align_val_t, nothrow)
- case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: // new(unsigned long, align_val_t, nothrow)
- case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t: // new[](unsigned int, align_val_t, nothrow)
- case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: // new[](unsigned long, align_val_t, nothrow)
- // Nothrow operator new may return null pointer
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- Changed |= setRetNoUndef(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_Znwj: // new(unsigned int)
- case LibFunc_Znwm: // new(unsigned long)
- case LibFunc_Znaj: // new[](unsigned int)
- case LibFunc_Znam: // new[](unsigned long)
- case LibFunc_ZnwjSt11align_val_t: // new(unsigned int, align_val_t)
- case LibFunc_ZnwmSt11align_val_t: // new(unsigned long, align_val_t)
- case LibFunc_ZnajSt11align_val_t: // new[](unsigned int, align_val_t)
- case LibFunc_ZnamSt11align_val_t: // new[](unsigned long, align_val_t)
- case LibFunc_msvc_new_int: // new(unsigned int)
- case LibFunc_msvc_new_longlong: // new(unsigned long long)
- case LibFunc_msvc_new_array_int: // new[](unsigned int)
- case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- // Operator new always returns a nonnull noalias pointer
- Changed |= setRetNoUndef(F);
- Changed |= setRetNonNull(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
// TODO: add LibFunc entries for:
// case LibFunc_memset_pattern4:
// case LibFunc_memset_pattern8:
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 51a49574e55d..6ab061510a60 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -989,3 +989,11 @@ void llvm::identifyNoAliasScopesToClone(
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
NoAliasDeclScopes.push_back(Decl->getScopeList());
}
+
+void llvm::identifyNoAliasScopesToClone(
+ BasicBlock::iterator Start, BasicBlock::iterator End,
+ SmallVectorImpl<MDNode *> &NoAliasDeclScopes) {
+ for (Instruction &I : make_range(Start, End))
+ if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+ NoAliasDeclScopes.push_back(Decl->getScopeList());
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 0ac8fa537f4e..3026342cc4a6 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -921,14 +921,20 @@ void ScopedAliasMetadataDeepCloner::remap(ValueToValueMapTy &VMap) {
if (!I)
continue;
+    // Only update scopes when we find them in the map. If a scope is not
+    // found, we already handled that instruction before; checking the map is
+    // faster than tracking which instructions we already updated.
if (MDNode *M = I->getMetadata(LLVMContext::MD_alias_scope))
- I->setMetadata(LLVMContext::MD_alias_scope, MDMap[M]);
+ if (MDNode *MNew = MDMap.lookup(M))
+ I->setMetadata(LLVMContext::MD_alias_scope, MNew);
if (MDNode *M = I->getMetadata(LLVMContext::MD_noalias))
- I->setMetadata(LLVMContext::MD_noalias, MDMap[M]);
+ if (MDNode *MNew = MDMap.lookup(M))
+ I->setMetadata(LLVMContext::MD_noalias, MNew);
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
- Decl->setScopeList(MDMap[Decl->getScopeList()]);
+ if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
+ Decl->setScopeList(MNew);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
index 477ea458c763..ae26058c210c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
@@ -420,13 +420,8 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
return true;
}
- if (auto *CB = dyn_cast<CallBase>(I)) {
- // Treat calls that may not return as alive.
- // TODO: Remove the intrinsic escape hatch once all intrinsics set
- // willreturn properly.
- if (!CB->willReturn() && !isa<IntrinsicInst>(I))
- return false;
- }
+ if (!I->willReturn())
+ return false;
if (!I->mayHaveSideEffects())
return true;
@@ -923,6 +918,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN,
/// \param IncomingValues A map from block to value.
static void replaceUndefValuesInPhi(PHINode *PN,
const IncomingValueMap &IncomingValues) {
+ SmallVector<unsigned> TrueUndefOps;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *V = PN->getIncomingValue(i);
@@ -930,10 +926,31 @@ static void replaceUndefValuesInPhi(PHINode *PN,
BasicBlock *BB = PN->getIncomingBlock(i);
IncomingValueMap::const_iterator It = IncomingValues.find(BB);
- if (It == IncomingValues.end()) continue;
+ // Keep track of undef/poison incoming values. Those must match, so we fix
+ // them up below if needed.
+ // Note: this is conservatively correct, but we could try harder and group
+ // the undef values per incoming basic block.
+ if (It == IncomingValues.end()) {
+ TrueUndefOps.push_back(i);
+ continue;
+ }
+
+ // There is a defined value for this incoming block, so map this undef
+ // incoming value to the defined value.
PN->setIncomingValue(i, It->second);
}
+
+ // If there are both undef and poison values incoming, then convert those
+ // values to undef. It is invalid to have different values for the same
+ // incoming block.
+ unsigned PoisonCount = count_if(TrueUndefOps, [&](unsigned i) {
+ return isa<PoisonValue>(PN->getIncomingValue(i));
+ });
+ if (PoisonCount != 0 && PoisonCount != TrueUndefOps.size()) {
+ for (unsigned i : TrueUndefOps)
+ PN->setIncomingValue(i, UndefValue::get(PN->getType()));
+ }
}
/// Replace a value flowing from a block to a phi with
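
The undef/poison fix-up above enforces a phi invariant: when the same predecessor block feeds a phi through several entries, every entry for that block must carry the same value, and undef and poison are different constants. Since poison may always be refined to undef (but not the reverse), a mix is unified to undef. A plain C++ model of the decision (our types, not the LLVM classes):

    #include <vector>

    // Undef/Poison mark incoming slots with no defined replacement value.
    enum class Incoming { Defined, Undef, Poison };

    void unifyUndefs(std::vector<Incoming> &Slots) {
      unsigned Poisons = 0, TrueUndefs = 0;
      for (Incoming S : Slots) {
        if (S == Incoming::Poison)
          ++Poisons;
        if (S == Incoming::Poison || S == Incoming::Undef)
          ++TrueUndefs;
      }
      // Mixed undef and poison: demote the poison slots to undef, matching
      // the PoisonCount logic above.
      if (Poisons != 0 && Poisons != TrueUndefs)
        for (Incoming &S : Slots)
          if (S == Incoming::Poison)
            S = Incoming::Undef;
    }
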
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp
index cb5fee7d28e6..befacb591762 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -509,7 +509,7 @@ static void cloneLoopBlocks(
SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
- LoopInfo *LI) {
+ LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
BasicBlock *PreHeader = L->getLoopPreheader();
@@ -545,6 +545,15 @@ static void cloneLoopBlocks(
}
}
+ {
+    // Clone the noalias scope declarations used by this iteration and
+    // rewrite both the memory instructions and the noalias intrinsics in
+    // the new blocks to refer to the cloned scopes.
+ std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
+ cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
+ Header->getContext(), Ext);
+ }
+
// Recursively create the new Loop objects for nested loops, if any,
// to preserve LoopInfo.
for (Loop *ChildLoop : *L) {
@@ -769,13 +778,19 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
uint64_t ExitWeight = 0, FallThroughWeight = 0;
initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);
+  // Identify the noalias scope declarations that live inside the loop;
+  // their metadata must be cloned for each peeled iteration.
+ SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
+ identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
+
// For each peeled-off iteration, make a copy of the loop.
for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
SmallVector<BasicBlock *, 8> NewBlocks;
ValueToValueMapTy VMap;
cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
- LoopBlocks, VMap, LVMap, DT, LI);
+ LoopBlocks, VMap, LVMap, DT, LI,
+ LoopLocalNoAliasDeclScopes);
// Remap to use values from the current iteration instead of the
// previous one.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7cfe17618cde..de9560df9785 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1628,6 +1628,11 @@ static bool canSinkInstructions(
I->getType()->isTokenTy())
return false;
+    // Do not try to sink an instruction whose block is its own single
+    // successor (an infinite self-loop): it can make this algorithm loop forever.
+ if (I->getParent()->getSingleSuccessor() == I->getParent())
+ return false;
+
// Conservatively return false if I is an inline-asm instruction. Sinking
// and merging inline-asm instructions can potentially create arguments
// that cannot satisfy the inline-asm constraints.
@@ -1714,13 +1719,13 @@ static bool canSinkInstructions(
return true;
}
-// Assuming canSinkLastInstruction(Blocks) has returned true, sink the last
+// Assuming canSinkInstructions(Blocks) has returned true, sink the last
// instruction of every block in Blocks to their common successor, commoning
// into one instruction.
static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
- // canSinkLastInstruction returning true guarantees that every block has at
+ // canSinkInstructions returning true guarantees that every block has at
// least one non-terminator instruction.
SmallVector<Instruction*,4> Insts;
for (auto *BB : Blocks) {
@@ -1733,9 +1738,9 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
}
// The only checking we need to do now is that all users of all instructions
- // are the same PHI node. canSinkLastInstruction should have checked this but
- // it is slightly over-aggressive - it gets confused by commutative instructions
- // so double-check it here.
+ // are the same PHI node. canSinkInstructions should have checked this but
+ // it is slightly over-aggressive - it gets confused by commutative
+ // instructions so double-check it here.
Instruction *I0 = Insts.front();
if (!I0->user_empty()) {
auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
@@ -1746,11 +1751,11 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
return false;
}
- // We don't need to do any more checking here; canSinkLastInstruction should
+ // We don't need to do any more checking here; canSinkInstructions should
// have done it all for us.
SmallVector<Value*, 4> NewOperands;
for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
- // This check is different to that in canSinkLastInstruction. There, we
+ // This check is different to that in canSinkInstructions. There, we
// cared about the global view once simplifycfg (and instcombine) have
// completed - it takes into account PHIs that become trivially
// simplifiable. However here we need a more local view; if an operand
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1795470fa58c..19797e6f7858 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -142,6 +142,10 @@ public:
return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
}
+ VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
+ return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea0d7673edf6..b456a97aa4ec 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -372,19 +372,11 @@ static Type *getMemInstValueType(Value *I) {
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
-/// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
- // Determine if an array of VF elements of type Ty is "bitcast compatible"
- // with a <VF x Ty> vector.
- if (VF.isVector()) {
- auto *VectorTy = VectorType::get(Ty, VF);
- return TypeSize::get(VF.getKnownMinValue() *
- DL.getTypeAllocSize(Ty).getFixedValue(),
- VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
- }
-
- // If the vectorization factor is one, we just check if an array of type Ty
- // requires padding between elements.
+/// element of the corresponding vector type.
+static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
+ // Determine if an array of N elements of type Ty is "bitcast compatible"
+ // with a <N x Ty> vector.
+ // This is only true if there is no padding between the array elements.
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
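
An "irregular" type is one whose allocated size differs from its stored size, i.e. an array of it contains padding, so the array cannot be reinterpreted as a vector. x86_fp80 is the classic case: 80 bits stored but (on common x86-64 ABIs) 128 bits allocated. A hedged standalone probe using the real DataLayout API (the layout string is a typical x86-64 one, an assumption on our part):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
      Type *F80 = Type::getX86_FP80Ty(Ctx);
      // Prints size=80 alloc=128: the type is irregular, so accesses to it
      // will be scalarized rather than widened.
      outs() << "size=" << DL.getTypeSizeInBits(F80).getFixedSize()
             << " alloc=" << DL.getTypeAllocSizeInBits(F80).getFixedSize()
             << "\n";
    }
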
@@ -5212,7 +5204,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = getMemInstValueType(I);
- if (hasIrregularType(ScalarTy, DL, VF))
+ if (hasIrregularType(ScalarTy, DL))
return false;
// Check if masking is required.
@@ -5259,7 +5251,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
- if (hasIrregularType(ScalarTy, DL, VF))
+ if (hasIrregularType(ScalarTy, DL))
return false;
return true;
@@ -5504,11 +5496,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
- ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
-
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return MaxVF;
+ return computeFeasibleMaxVF(TC, UserVF);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5546,7 +5536,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return MaxVF;
+ return computeFeasibleMaxVF(TC, UserVF);
}
return None;
}
@@ -5563,6 +5553,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
+ ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
assert(!MaxVF.isScalable() &&
"Scalable vectors do not yet support tail folding");
assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
@@ -8196,8 +8187,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (BI->getSuccessor(0) != Dst)
EdgeMask = Builder.createNot(EdgeMask);
- if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
- EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+ if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+ // The condition is 'SrcMask && EdgeMask', which is equivalent to
+ // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ // The select version does not introduce new UB if SrcMask is false and
+ // EdgeMask is poison. Using 'and' here introduces undefined behavior.
+ VPValue *False = Plan->getOrAddVPValue(
+ ConstantInt::getFalse(BI->getCondition()->getType()));
+ EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
+ }
return EdgeMaskCache[Edge] = EdgeMask;
}
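
The select form is the faithful encoding because 'and' propagates poison from either operand, while a select with a false condition never consults its true arm: with SrcMask = false and EdgeMask = poison, 'and' poisons the whole mask, but 'select SrcMask, EdgeMask, false' yields the desired false. A small truth-model of the difference (plain C++; poison modeled as an empty optional):

    #include <optional>

    using I1 = std::optional<bool>; // nullopt models poison

    // 'and i1' is poison if either operand is poison.
    I1 andI1(I1 A, I1 B) {
      if (!A || !B)
        return std::nullopt;
      return *A && *B;
    }

    // 'select i1 C, i1 T, i1 false' ignores T when C is false; a poison
    // condition still yields poison.
    I1 selectI1(I1 C, I1 T) {
      if (!C)
        return std::nullopt;
      return *C ? T : I1(false);
    }

    // andI1(false, nullopt)    -> nullopt (poison leaks into the mask)
    // selectI1(false, nullopt) -> false   (lane correctly masked off)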