author    Dimitry Andric <dim@FreeBSD.org>    2021-12-02 21:49:08 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2022-05-14 11:43:49 +0000
commit    4824e7fd18a1223177218d4aec1b3c6c5c4a444e (patch)
tree      5ca6493b1b0bf6a41f257794c0116d5e50fbf37c /contrib/llvm-project/llvm/lib/Transforms
parent    5e801ac66d24704442eba426ed13c3effb8a34e7 (diff)
parent    f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp | 132
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 37
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp | 3
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp | 41
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 90
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 2
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 69
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 1
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp | 2
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 4
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 23
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 3
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 4
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 66
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 2
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 4
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 54
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 127
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 3
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp | 67
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 14
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 5
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 189
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp | 9
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 3
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 71
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 18
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp | 12
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp | 2
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp | 7
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp | 57
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 6
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp | 4
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp | 3
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp | 462
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp | 4
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 68
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 52
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 980
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 612
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp | 20
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h | 23
43 files changed, 2179 insertions, 1178 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b2c2efed7db8..ba7589c2bf60 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -275,94 +276,64 @@ CleanupPointerRootUsers(GlobalVariable *GV,
/// We just marked GV constant. Loop over all users of the global, cleaning up
/// the obvious ones. This is largely just a quick scan over the use list to
/// clean up the easy and obvious cruft. This returns true if it made a change.
-static bool CleanupConstantGlobalUsers(
- Value *V, Constant *Init, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
+ const DataLayout &DL) {
+ Constant *Init = GV->getInitializer();
+ SmallVector<User *, 8> WorkList(GV->users());
+ SmallPtrSet<User *, 8> Visited;
bool Changed = false;
- // Note that we need to use a weak value handle for the worklist items. When
- // we delete a constant array, we may also be holding pointer to one of its
- // elements (or an element of one of its elements if we're dealing with an
- // array of arrays) in the worklist.
- SmallVector<WeakTrackingVH, 8> WorkList(V->users());
+
+ SmallVector<WeakTrackingVH> MaybeDeadInsts;
+ auto EraseFromParent = [&](Instruction *I) {
+ for (Value *Op : I->operands())
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ MaybeDeadInsts.push_back(OpI);
+ I->eraseFromParent();
+ Changed = true;
+ };
while (!WorkList.empty()) {
- Value *UV = WorkList.pop_back_val();
- if (!UV)
+ User *U = WorkList.pop_back_val();
+ if (!Visited.insert(U).second)
continue;
- User *U = cast<User>(UV);
+ if (auto *BO = dyn_cast<BitCastOperator>(U))
+ append_range(WorkList, BO->users());
+ if (auto *ASC = dyn_cast<AddrSpaceCastOperator>(U))
+ append_range(WorkList, ASC->users());
+ else if (auto *GEP = dyn_cast<GEPOperator>(U))
+ append_range(WorkList, GEP->users());
+ else if (auto *LI = dyn_cast<LoadInst>(U)) {
+ // A load from zeroinitializer is always zeroinitializer, regardless of
+ // any applied offset.
+ if (Init->isNullValue()) {
+ LI->replaceAllUsesWith(Constant::getNullValue(LI->getType()));
+ EraseFromParent(LI);
+ continue;
+ }
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (Init) {
- if (auto *Casted =
- ConstantFoldLoadThroughBitcast(Init, LI->getType(), DL)) {
- // Replace the load with the initializer.
- LI->replaceAllUsesWith(Casted);
- LI->eraseFromParent();
- Changed = true;
+ Value *PtrOp = LI->getPointerOperand();
+ APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true);
+ if (PtrOp == GV) {
+ if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(),
+ Offset, DL)) {
+ LI->replaceAllUsesWith(Value);
+ EraseFromParent(LI);
}
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
// Store must be unreachable or storing Init into the global.
- SI->eraseFromParent();
- Changed = true;
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- Constant *SubInit = nullptr;
- if (Init)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(
- Init, CE, V->getType()->getPointerElementType(), DL);
- Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
- } else if ((CE->getOpcode() == Instruction::BitCast &&
- CE->getType()->isPointerTy()) ||
- CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Pointer cast, delete any stores and memsets to the global.
- Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
- }
-
- if (CE->use_empty()) {
- CE->destroyConstant();
- Changed = true;
- }
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // Do not transform "gepinst (gep constexpr (GV))" here, because forming
- // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
- // and will invalidate our notion of what Init is.
- Constant *SubInit = nullptr;
- if (!isa<ConstantExpr>(GEP->getOperand(0))) {
- ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
- ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
- if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(
- Init, CE, V->getType()->getPointerElementType(), DL);
-
- // If the initializer is an all-null value and we have an inbounds GEP,
- // we already know what the result of any load from that GEP is.
- // TODO: Handle splats.
- if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
- SubInit = Constant::getNullValue(GEP->getResultElementType());
- }
- Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);
-
- if (GEP->use_empty()) {
- GEP->eraseFromParent();
- Changed = true;
- }
+ EraseFromParent(SI);
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
- if (MI->getRawDest() == V) {
- MI->eraseFromParent();
- Changed = true;
- }
-
- } else if (Constant *C = dyn_cast<Constant>(U)) {
- // If we have a chain of dead constantexprs or other things dangling from
- // us, and if they are all dead, nuke them without remorse.
- if (isSafeToDestroyConstant(C)) {
- C->destroyConstant();
- CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
- return true;
- }
+ if (getUnderlyingObject(MI->getRawDest()) == GV)
+ EraseFromParent(MI);
}
}
+
+ Changed |=
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(MaybeDeadInsts);
+ GV->removeDeadConstantUsers();
return Changed;
}
@@ -889,7 +860,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(
Changed |= CleanupPointerRootUsers(GV, GetTLI);
} else {
Changed = true;
- CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
+ CleanupConstantGlobalUsers(GV, DL);
}
if (GV->use_empty()) {
LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
@@ -1557,8 +1528,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
} else {
// Delete any stores we can find to the global. We may not be able to
// make it completely dead though.
- Changed =
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+ Changed = CleanupConstantGlobalUsers(GV, DL);
}
// If the global is dead now, delete it.
@@ -1583,7 +1553,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
}
// Clean up any obviously simplifiable users now.
- Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+ Changed |= CleanupConstantGlobalUsers(GV, DL);
// If the global is dead now, just nuke it.
if (GV->use_empty()) {
@@ -1628,7 +1598,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
GV->setInitializer(SOVConstant);
// Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+ CleanupConstantGlobalUsers(GV, DL);
if (GV->use_empty()) {
LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
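
For context on the rewrite above: CleanupConstantGlobalUsers now resolves loads by accumulating constant GEP offsets and folding the value directly out of the initializer. A minimal standalone C++ analogue of that byte-offset lookup (a sketch with hypothetical names, not the LLVM API):

    #include <cstdint>
    #include <cstring>
    #include <optional>
    #include <vector>

    // Fold a load of Width bytes at Offset from a constant initializer,
    // mirroring the simple flat-array case of ConstantFoldLoadFromConst.
    std::optional<uint64_t> foldLoadFromConst(const std::vector<uint8_t> &Init,
                                              uint64_t Offset, unsigned Width) {
      if (Width > sizeof(uint64_t) || Offset + Width > Init.size())
        return std::nullopt; // out-of-bounds or oversized loads are not folded
      uint64_t V = 0;
      std::memcpy(&V, Init.data() + Offset, Width); // assumes little-endian host
      return V;
    }
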
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index f342c35fa283..055ee6b50296 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1885,6 +1885,7 @@ private:
OMPRTL___kmpc_barrier_simple_generic);
ExternalizationRAII ThreadId(OMPInfoCache,
OMPRTL___kmpc_get_hardware_thread_id_in_block);
+ ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size);
registerAAs(IsModulePass);
@@ -3727,12 +3728,37 @@ struct AAKernelInfoFunction : AAKernelInfo {
CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ bool UsedAssumedInformationFromReachingKernels = false;
if (!IsKernelEntry) {
- updateReachingKernelEntries(A);
updateParallelLevels(A);
+ bool AllReachingKernelsKnown = true;
+ updateReachingKernelEntries(A, AllReachingKernelsKnown);
+ UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
+
if (!ParallelLevels.isValidState())
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ else if (!ReachingKernelEntries.isValidState())
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ else if (!SPMDCompatibilityTracker.empty()) {
+ // Check if all reaching kernels agree on the mode as we can otherwise
+ // not guard instructions. We might not be sure about the mode so we
+ // cannot fix the internal spmd-zation state either.
+ int SPMD = 0, Generic = 0;
+ for (auto *Kernel : ReachingKernelEntries) {
+ auto &CBAA = A.getAAFor<AAKernelInfo>(
+ *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
+ if (CBAA.SPMDCompatibilityTracker.isValidState() &&
+ CBAA.SPMDCompatibilityTracker.isAssumed())
+ ++SPMD;
+ else
+ ++Generic;
+ if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
+ UsedAssumedInformationFromReachingKernels = true;
+ }
+ if (SPMD != 0 && Generic != 0)
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ }
}
// Callback to check a call instruction.
@@ -3779,7 +3805,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
// If we haven't used any assumed information for the SPMD state we can fix
// it.
if (!UsedAssumedInformationInCheckRWInst &&
- !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed)
+ !UsedAssumedInformationInCheckCallInst &&
+ !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
return StateBefore == getState() ? ChangeStatus::UNCHANGED
@@ -3788,7 +3815,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
private:
/// Update info regarding reaching kernels.
- void updateReachingKernelEntries(Attributor &A) {
+ void updateReachingKernelEntries(Attributor &A,
+ bool &AllReachingKernelsKnown) {
auto PredCallSite = [&](AbstractCallSite ACS) {
Function *Caller = ACS.getInstruction()->getFunction();
@@ -3808,10 +3836,9 @@ private:
return true;
};
- bool AllCallSitesKnown;
if (!A.checkForAllCallSites(PredCallSite, *this,
true /* RequireAllCallSites */,
- AllCallSitesKnown))
+ AllReachingKernelsKnown))
ReachingKernelEntries.indicatePessimisticFixpoint();
}
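
The mode-agreement check added above effectively takes a vote across all reaching kernels; a mixed vote forces a pessimistic fixpoint because guarded execution cannot be arranged consistently for both modes. A toy C++ model of that decision (hypothetical sketch, not the Attributor API):

    #include <cassert>
    #include <vector>

    enum class Mode { SPMD, Generic };

    // Pessimize unless every reaching kernel agrees on one execution mode.
    bool mustPessimize(const std::vector<Mode> &ReachingKernels) {
      int SPMD = 0, Generic = 0;
      for (Mode M : ReachingKernels)
        (M == Mode::SPMD ? SPMD : Generic)++;
      return SPMD != 0 && Generic != 0; // disagreement: cannot guard safely
    }

    int main() {
      assert(!mustPessimize({Mode::SPMD, Mode::SPMD}));
      assert(mustPessimize({Mode::SPMD, Mode::Generic}));
      return 0;
    }
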
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 7402e399a88a..2d717475ce7f 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -641,8 +641,7 @@ PartialInlinerImpl::computeOutliningInfo(Function &F) const {
if (!CandidateFound)
return std::unique_ptr<FunctionOutliningInfo>();
- // Do sanity check of the entries: threre should not
- // be any successors (not in the entry set) other than
+ // There should not be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}
assert(OutliningInfo->Entries[0] == &F.front() &&
"Function Entry must be the first in Entries vector");
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
index a961c47a7501..b8fac9d47763 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -84,6 +84,7 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
#include <algorithm>
@@ -173,6 +174,9 @@ static cl::opt<bool>
cl::desc("Process functions in a top-down order "
"defined by the profiled call graph when "
"-sample-profile-top-down-load is on."));
+cl::opt<bool>
+ SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
+ cl::desc("Sort profiled recursion by edge weights."));
static cl::opt<bool> ProfileSizeInline(
"sample-profile-inline-size", cl::Hidden, cl::init(false),
@@ -1648,6 +1652,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
Instruction *MaxDestInst;
+ // Since profi treats multiple edges (multiway branches) as a single edge,
+ // we need to distribute the computed weight among the branches. We do
+ // this by evenly splitting the edge weight among destinations.
+ DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;
+ std::vector<uint64_t> EdgeIndex;
+ if (SampleProfileUseProfi) {
+ EdgeIndex.resize(TI->getNumSuccessors());
+ for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+ const BasicBlock *Succ = TI->getSuccessor(I);
+ EdgeIndex[I] = EdgeMultiplicity[Succ];
+ EdgeMultiplicity[Succ]++;
+ }
+ }
for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
BasicBlock *Succ = TI->getSuccessor(I);
Edge E = std::make_pair(BB, Succ);
@@ -1660,9 +1677,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
- // Weight is added by one to avoid propagation errors introduced by
- // 0 weights.
- Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ if (!SampleProfileUseProfi) {
+ // Weight is added by one to avoid propagation errors introduced by
+ // 0 weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ } else {
+ // Profi creates proper weights that do not require "+1" adjustments but
+ // we evenly split the weight among branches with the same destination.
+ uint64_t W = Weight / EdgeMultiplicity[Succ];
+ // Rounding up, if needed, so that first branches are hotter.
+ if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
+ W++;
+ Weights.push_back(static_cast<uint32_t>(W));
+ }
if (Weight != 0) {
if (Weight > MaxWeight) {
MaxWeight = Weight;
@@ -1853,7 +1880,13 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);
scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
while (!CGI.isAtEnd()) {
- for (ProfiledCallGraphNode *Node : *CGI) {
+ auto Range = *CGI;
+ if (SortProfiledSCC) {
+ // Sort nodes in one SCC based on callsite hotness.
+ scc_member_iterator<ProfiledCallGraph *> SI(*CGI);
+ Range = *SI;
+ }
+ for (auto *Node : Range) {
Function *F = SymbolMap.lookup(Node->Name);
if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(F);
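
The edge-weight splitting added above rounds up for the first EdgeIndex branches so the remainder goes to earlier (hotter) branches. A small standalone sketch of that distribution with hypothetical inputs:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      uint64_t Weight = 10;       // total weight computed by profi for one edge
      uint64_t Multiplicity = 3;  // branches sharing the same destination
      std::vector<uint64_t> Split;
      for (uint64_t Index = 0; Index < Multiplicity; ++Index) {
        uint64_t W = Weight / Multiplicity;
        if (Index < Weight % Multiplicity) // round up so first branches are hotter
          W++;
        Split.push_back(W);
      }
      for (uint64_t W : Split)
        std::printf("%llu ", (unsigned long long)W); // prints: 4 3 3
      return 0;
    }
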
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 06c9bf650f37..dc55b5a31596 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1727,16 +1727,18 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
(Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *A, *B, *C;
+ Value *A, *B, *C, *X, *Y;
// (~(A | B) & C) | ... --> ...
// (~(A & B) | C) & ... --> ...
// TODO: One use checks are conservative. We just need to check that a total
// number of multiple used values does not exceed reduction
// in operations.
- if (match(Op0, m_c_BinOp(FlippedOpcode,
- m_Not(m_BinOp(Opcode, m_Value(A), m_Value(B))),
- m_Value(C)))) {
+ if (match(Op0,
+ m_c_BinOp(FlippedOpcode,
+ m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A),
+ m_Value(B)))),
+ m_Value(C)))) {
// (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
// (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A)
if (match(Op1,
@@ -1776,6 +1778,21 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)))))))
return BinaryOperator::CreateNot(Builder.CreateBinOp(
Opcode, Builder.CreateBinOp(FlippedOpcode, A, C), B));
+
+ // (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B)))
+ // Note, the pattern with swapped and/or is not handled because the
+ // result is more undefined than a source:
+ // (~(A & B) | C) & ~(C & (A ^ B)) --> (A ^ B ^ C) | ~(A | C) is invalid.
+ if (Opcode == Instruction::Or && Op0->hasOneUse() &&
+ match(Op1, m_OneUse(m_Not(m_CombineAnd(
+ m_Value(Y),
+ m_c_BinOp(Opcode, m_Specific(C),
+ m_c_Xor(m_Specific(A), m_Specific(B)))))))) {
+ // X = ~(A | B)
+ // Y = C | (A ^ B)
+ Value *Or = cast<BinaryOperator>(X)->getOperand(0);
+ return BinaryOperator::CreateNot(Builder.CreateAnd(Or, Y));
+ }
}
return nullptr;
@@ -2061,7 +2078,14 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
return CastedAnd;
+ if (Instruction *Sel = foldBinopOfSextBoolToSelect(I))
+ return Sel;
+
// and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>.
+ // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold
+ // with binop identity constant. But creating a select with non-constant
+ // arm may not be reversible due to poison semantics. Is that a good
+ // canonicalization?
Value *A;
if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
A->getType()->isIntOrIntVectorTy(1))
@@ -2322,11 +2346,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) {
Value *Cond;
Value *NotB;
if (match(A, m_SExt(m_Value(Cond))) &&
- Cond->getType()->isIntOrIntVectorTy(1) &&
- match(B, m_OneUse(m_Not(m_Value(NotB))))) {
- NotB = peekThroughBitcast(NotB, true);
- if (match(NotB, m_SExt(m_Specific(Cond))))
+ Cond->getType()->isIntOrIntVectorTy(1)) {
+ // A = sext i1 Cond; B = sext (not (i1 Cond))
+ if (match(B, m_SExt(m_Not(m_Specific(Cond)))))
return Cond;
+
+ // A = sext i1 Cond; B = not ({bitcast} (sext (i1 Cond)))
+ // TODO: The one-use checks are unnecessary or misplaced. If the caller
+ // checked for uses on logic ops/casts, that should be enough to
+ // make this transform worthwhile.
+ if (match(B, m_OneUse(m_Not(m_Value(NotB))))) {
+ NotB = peekThroughBitcast(NotB, true);
+ if (match(NotB, m_SExt(m_Specific(Cond))))
+ return Cond;
+ }
}
// All scalar (and most vector) possibilities should be handled now.
@@ -2569,7 +2602,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (I.getType()->isIntOrIntVectorTy(1)) {
+ Type *Ty = I.getType();
+ if (Ty->isIntOrIntVectorTy(1)) {
if (auto *SI0 = dyn_cast<SelectInst>(Op0)) {
if (auto *I =
foldAndOrOfSelectUsingImpliedCond(Op1, *SI0, /* IsAnd */ false))
@@ -2602,7 +2636,16 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
// The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
Value *Or = Builder.CreateOr(X, Y);
- return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
+ return BinaryOperator::CreateXor(Or, ConstantInt::get(Ty, *CV));
+ }
+
+ // If the operands have no common bits set:
+ // or (mul X, Y), X --> add (mul X, Y), X --> mul X, (Y + 1)
+ if (match(&I,
+ m_c_Or(m_OneUse(m_Mul(m_Value(X), m_Value(Y))), m_Deferred(X))) &&
+ haveNoCommonBitsSet(Op0, Op1, DL)) {
+ Value *IncrementY = Builder.CreateAdd(Y, ConstantInt::get(Ty, 1));
+ return BinaryOperator::CreateMul(X, IncrementY);
}
// (A & C) | (B & D)
@@ -2635,14 +2678,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// iff (C0 & C1) == 0 and (X & ~C0) == 0
if (match(A, m_c_Or(m_Value(X), m_Specific(B))) &&
MaskedValueIsZero(X, ~*C0, 0, &I)) {
- Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ Constant *C01 = ConstantInt::get(Ty, *C0 | *C1);
return BinaryOperator::CreateAnd(A, C01);
}
// (A & C0) | ((X | A) & C1) --> (X | A) & (C0 | C1)
// iff (C0 & C1) == 0 and (X & ~C1) == 0
if (match(B, m_c_Or(m_Value(X), m_Specific(A))) &&
MaskedValueIsZero(X, ~*C1, 0, &I)) {
- Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ Constant *C01 = ConstantInt::get(Ty, *C0 | *C1);
return BinaryOperator::CreateAnd(B, C01);
}
// ((X | C2) & C0) | ((X | C3) & C1) --> (X | C2 | C3) & (C0 | C1)
@@ -2652,7 +2695,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
match(B, m_Or(m_Specific(X), m_APInt(C3))) &&
(*C2 & ~*C0).isZero() && (*C3 & ~*C1).isZero()) {
Value *Or = Builder.CreateOr(X, *C2 | *C3, "bitfield");
- Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ Constant *C01 = ConstantInt::get(Ty, *C0 | *C1);
return BinaryOperator::CreateAnd(Or, C01);
}
}
@@ -2788,13 +2831,20 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
return CastedOr;
+ if (Instruction *Sel = foldBinopOfSextBoolToSelect(I))
+ return Sel;
+
// or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
+ // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold
+ // with binop identity constant. But creating a select with non-constant
+ // arm may not be reversible due to poison semantics. Is that a good
+ // canonicalization?
if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+ return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op1);
if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+ return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op0);
// Note: If we've gotten to the point of visiting the outer OR, then the
// inner one couldn't be simplified. If it was a constant, then it won't
@@ -2826,7 +2876,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X.
{
Value *X, *Y;
- Type *Ty = I.getType();
if (match(&I, m_c_Or(m_OneUse(m_AShr(
m_NSWSub(m_Value(Y), m_Value(X)),
m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
@@ -2876,7 +2925,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (match(&I, m_c_Or(m_Add(m_Shl(m_One(), m_Value(X)), m_AllOnes()),
m_Shl(m_One(), m_Deferred(X)))) &&
match(&I, m_c_Or(m_OneUse(m_Value()), m_Value()))) {
- Type *Ty = X->getType();
Value *Sub = Builder.CreateSub(
ConstantInt::get(Ty, Ty->getScalarSizeInBits() - 1), X);
return BinaryOperator::CreateLShr(Constant::getAllOnesValue(Ty), Sub);
@@ -3601,6 +3649,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A))))
return BinaryOperator::CreateOr(A, B);
+ // (~A | B) ^ A --> ~(A & B)
+ if (match(Op0, m_OneUse(m_c_Or(m_Not(m_Specific(Op1)), m_Value(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(Op1, B));
+
+ // A ^ (~A | B) --> ~(A & B)
+ if (match(Op1, m_OneUse(m_c_Or(m_Not(m_Specific(Op0)), m_Value(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(Op0, B));
+
// (A | B) ^ (A | C) --> (B ^ C) & ~A -- There are 4 commuted variants.
// TODO: Loosen one-use restriction if common operand is a constant.
Value *D;
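
The new folds above are plain Boolean identities, so they can be confirmed by exhaustive checking over small integers. A minimal sketch verifying (~A | B) ^ A == ~(A & B) and the (~(A | B) & C) | ~(C | (A ^ B)) fold:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned a = 0; a < 16; ++a)
        for (unsigned b = 0; b < 16; ++b)
          for (unsigned c = 0; c < 16; ++c) {
            uint8_t A = a, B = b, C = c;
            // (~A | B) ^ A --> ~(A & B)
            assert((uint8_t)((~A | B) ^ A) == (uint8_t)~(A & B));
            // (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B)))
            assert((uint8_t)((~(A | B) & C) | ~(C | (A ^ B))) ==
                   (uint8_t)~((A | B) & (C | (A ^ B))));
          }
      return 0;
    }
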
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index bfa7bfa2290a..7da2669e1d13 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2641,7 +2641,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
ArgNo++;
}
- assert(ArgNo == Call.arg_size() && "sanity check");
+ assert(ArgNo == Call.arg_size() && "Call arguments not processed correctly.");
if (!ArgNos.empty()) {
AttributeList AS = Call.getAttributes();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ca87477c5d81..33f217659c01 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2771,7 +2771,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))),
m_Value(Y), m_ConstantInt(IndexC)))) &&
DestTy->isIntegerTy() && X->getType() == DestTy &&
- isDesirableIntType(BitWidth)) {
+ Y->getType()->isIntegerTy() && isDesirableIntType(BitWidth)) {
// Adjust for big endian - the LSBs are at the high index.
if (DL.isBigEndian())
IndexC = SrcVTy->getNumElements() - 1 - IndexC;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7a9e177f19da..ed53b88aed61 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1894,23 +1895,6 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
}
- // (X & C2) == 0 -> (trunc X) >= 0
- // (X & C2) != 0 -> (trunc X) < 0
- // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
- const APInt *C2;
- if (And->hasOneUse() && C.isZero() && match(Y, m_APInt(C2))) {
- int32_t ExactLogBase2 = C2->exactLogBase2();
- if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
- Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
- if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
- NTy = VectorType::get(NTy, AndVTy->getElementCount());
- Value *Trunc = Builder.CreateTrunc(X, NTy);
- auto NewPred =
- Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT;
- return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
- }
- }
-
return nullptr;
}
@@ -2803,7 +2787,8 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
PredB, cast<Constant>(RHS2));
if (!FlippedStrictness)
return false;
- assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
+ assert(FlippedStrictness->first == ICmpInst::ICMP_SGE &&
+ "basic correctness failure");
RHS2 = FlippedStrictness->second;
// And kind-of perform the result swap.
std::swap(Less, Greater);
@@ -4614,7 +4599,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
static Instruction *foldICmpWithTrunc(ICmpInst &ICmp,
InstCombiner::BuilderTy &Builder) {
- const ICmpInst::Predicate Pred = ICmp.getPredicate();
+ ICmpInst::Predicate Pred = ICmp.getPredicate();
Value *Op0 = ICmp.getOperand(0), *Op1 = ICmp.getOperand(1);
// Try to canonicalize trunc + compare-to-constant into a mask + cmp.
@@ -4624,41 +4609,31 @@ static Instruction *foldICmpWithTrunc(ICmpInst &ICmp,
if (!match(Op0, m_OneUse(m_Trunc(m_Value(X)))) || !match(Op1, m_APInt(C)))
return nullptr;
+ // This matches patterns corresponding to tests of the signbit as well as:
+ // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?)
+ // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?)
+ APInt Mask;
+ if (decomposeBitTestICmp(Op0, Op1, Pred, X, Mask, true /* WithTrunc */)) {
+ Value *And = Builder.CreateAnd(X, Mask);
+ Constant *Zero = ConstantInt::getNullValue(X->getType());
+ return new ICmpInst(Pred, And, Zero);
+ }
+
unsigned SrcBits = X->getType()->getScalarSizeInBits();
- if (Pred == ICmpInst::ICMP_ULT) {
- if (C->isPowerOf2()) {
- // If C is a power-of-2 (one set bit):
- // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?)
- Constant *MaskC = ConstantInt::get(X->getType(), (-*C).zext(SrcBits));
- Value *And = Builder.CreateAnd(X, MaskC);
- Constant *Zero = ConstantInt::getNullValue(X->getType());
- return new ICmpInst(ICmpInst::ICMP_EQ, And, Zero);
- }
+ if (Pred == ICmpInst::ICMP_ULT && C->isNegatedPowerOf2()) {
// If C is a negative power-of-2 (high-bit mask):
// (trunc X) u< C --> (X & C) != C (are any masked-high-bits clear?)
- if (C->isNegatedPowerOf2()) {
- Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits));
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC);
- }
+ Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC);
}
- if (Pred == ICmpInst::ICMP_UGT) {
- // If C is a low-bit-mask (C+1 is a power-of-2):
- // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?)
- if (C->isMask()) {
- Constant *MaskC = ConstantInt::get(X->getType(), (~*C).zext(SrcBits));
- Value *And = Builder.CreateAnd(X, MaskC);
- Constant *Zero = ConstantInt::getNullValue(X->getType());
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
+ if (Pred == ICmpInst::ICMP_UGT && (~*C).isPowerOf2()) {
// If C is not-of-power-of-2 (one clear bit):
// (trunc X) u> C --> (X & (C+1)) == C+1 (are all masked-high-bits set?)
- if ((~*C).isPowerOf2()) {
- Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits));
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC);
- }
+ Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC);
}
return nullptr;
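
The consolidated trunc-compare folds above can likewise be spot-checked exhaustively. A minimal sketch, assuming an i16 value truncated to i8, for (trunc X) u< C --> (X & -C) == 0 when C is a power of two:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t C = 16;               // power-of-2 constant in the trunc type
      const uint16_t Mask = (uint8_t)-C;  // zext of -C into the source type: 0x00F0
      for (unsigned x = 0; x <= 0xFFFF; ++x) {
        uint16_t X = (uint16_t)x;
        bool Orig = (uint8_t)X < C;     // (trunc X) u< C
        bool Folded = (X & Mask) == 0;  // (X & -C) == 0
        assert(Orig == Folded);
      }
      return 0;
    }
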
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 72e1b21e8d49..20c75188ec9f 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -319,6 +319,7 @@ private:
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
+ Instruction *foldBinopOfSextBoolToSelect(BinaryOperator &I);
Instruction *narrowBinOp(TruncInst &Trunc);
Instruction *narrowMaskedBinOp(BinaryOperator &And);
Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
index 7dc516c6fdc3..42ba4a34a5a9 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -403,7 +403,7 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
NonNegatedOps.emplace_back(Op); // Just record which operand that was.
}
assert((NegatedOps.size() + NonNegatedOps.size()) == 2 &&
- "Internal consistency sanity check.");
+ "Internal consistency check failed.");
// Did we manage to sink negation into both of the operands?
if (NegatedOps.size() == 2) // Then we get to keep the `add`!
return Builder.CreateAdd(NegatedOps[0], NegatedOps[1],
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 4a1e82ae9c1d..518d3952dce5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -246,12 +246,16 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
static unsigned getSelectFoldableOperands(BinaryOperator *I) {
switch (I->getOpcode()) {
case Instruction::Add:
+ case Instruction::FAdd:
case Instruction::Mul:
+ case Instruction::FMul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
return 3; // Can fold through either operand.
case Instruction::Sub: // Can only fold on the amount subtracted.
+ case Instruction::FSub:
+ case Instruction::FDiv: // Can only fold on the divisor amount.
case Instruction::Shl: // Can only fold on the shift amount.
case Instruction::LShr:
case Instruction::AShr:
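
getSelectFoldableOperands returns a bitmask of the operand positions a select may be folded through; the commutative ops report both, while FSub/FDiv (like Sub and the shifts) fold only through the second operand. A hypothetical illustration of the shape of the FDiv rewrite, select C, (fdiv X, 2.0), X -> fdiv X, (select C, 2.0, 1.0); the real fold has additional constraints (constant operands, FP semantics) not modeled here:

    #include <cassert>

    // x / (c ? 2.0f : 1.0f) models folding the select into the divisor;
    // dividing by 1.0 is exact in IEEE arithmetic, so the false arm yields x.
    float foldedDiv(bool c, float x) { return x / (c ? 2.0f : 1.0f); }

    int main() {
      assert(foldedDiv(true, 8.0f) == 4.0f);
      assert(foldedDiv(false, 8.0f) == 8.0f);
      return 0;
    }
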
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 47b6dcb67a78..1f81624f79e7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -967,6 +967,29 @@ Value *InstCombinerImpl::dyn_castNegVal(Value *V) const {
return nullptr;
}
+/// A binop with a constant operand and a sign-extended boolean operand may be
+/// converted into a select of constants by applying the binary operation to
+/// the constant with the two possible values of the extended boolean (0 or -1).
+Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
+ // TODO: Handle non-commutative binop (constant is operand 0).
+ // TODO: Handle zext.
+ // TODO: Peek through 'not' of cast.
+ Value *BO0 = BO.getOperand(0);
+ Value *BO1 = BO.getOperand(1);
+ Value *X;
+ Constant *C;
+ if (!match(BO0, m_SExt(m_Value(X))) || !match(BO1, m_ImmConstant(C)) ||
+ !X->getType()->isIntOrIntVectorTy(1))
+ return nullptr;
+
+ // bo (sext i1 X), C --> select X, (bo -1, C), (bo 0, C)
+ Constant *Ones = ConstantInt::getAllOnesValue(BO.getType());
+ Constant *Zero = ConstantInt::getNullValue(BO.getType());
+ Constant *TVal = ConstantExpr::get(BO.getOpcode(), Ones, C);
+ Constant *FVal = ConstantExpr::get(BO.getOpcode(), Zero, C);
+ return SelectInst::Create(X, TVal, FVal);
+}
+
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
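
The new foldBinopOfSextBoolToSelect turns, for example, add (sext i1 X), 5 into select X, 4, 5, since sext of an i1 is either -1 or 0. A quick C++ model of that equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t C = 5;
      for (bool X : {false, true}) {
        int32_t Sext = X ? -1 : 0;               // sext i1 X
        int32_t Orig = Sext + C;                 // add (sext i1 X), C
        int32_t Folded = X ? (-1 + C) : (0 + C); // select X, (add -1, C), (add 0, C)
        assert(Orig == Folded);
      }
      return 0;
    }
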
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b56329ad76ae..bd2dc8d639fc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This file is a part of AddressSanitizer, an address sanity checker.
+// This file is a part of AddressSanitizer, an address basic correctness
+// checker.
// Details of the algorithm:
// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
//
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 62c265e40dab..8d3bc1383e96 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This file is a part of HWAddressSanitizer, an address sanity checker
-/// based on tagged addressing.
+/// This file is a part of HWAddressSanitizer, an address basic correctness
+/// checker based on tagged addressing.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 36a66e096382..d1d3b8ffdf7a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -64,10 +64,10 @@ cl::opt<bool> DoHashBasedCounterSplit(
cl::desc("Rename counter variable of a comdat function based on cfg hash"),
cl::init(true));
-cl::opt<bool> RuntimeCounterRelocation(
- "runtime-counter-relocation",
- cl::desc("Enable relocating counters at runtime."),
- cl::init(false));
+cl::opt<bool>
+ RuntimeCounterRelocation("runtime-counter-relocation",
+ cl::desc("Enable relocating counters at runtime."),
+ cl::init(false));
cl::opt<bool> ValueProfileStaticAlloc(
"vp-static-alloc",
@@ -331,8 +331,9 @@ private:
// Check whether the loop satisfies the basic conditions needed to perform
// Counter Promotions.
- bool isPromotionPossible(Loop *LP,
- const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
+ bool
+ isPromotionPossible(Loop *LP,
+ const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
// We can't insert into a catchswitch.
if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
return isa<CatchSwitchInst>(Exit->getTerminator());
@@ -421,13 +422,13 @@ PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
}
char InstrProfilingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_BEGIN(InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_END(InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false,
+ false)
ModulePass *
llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
@@ -634,13 +635,9 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
GlobalVariable *Name = Ind->getName();
uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
uint64_t Index = Ind->getIndex()->getZExtValue();
- auto It = ProfileDataMap.find(Name);
- if (It == ProfileDataMap.end()) {
- PerFunctionProfileData PD;
- PD.NumValueSites[ValueKind] = Index + 1;
- ProfileDataMap[Name] = PD;
- } else if (It->second.NumValueSites[ValueKind] <= Index)
- It->second.NumValueSites[ValueKind] = Index + 1;
+ auto &PD = ProfileDataMap[Name];
+ PD.NumValueSites[ValueKind] =
+ std::max(PD.NumValueSites[ValueKind], (uint32_t)(Index + 1));
}
void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
@@ -703,14 +700,15 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (!LI) {
IRBuilder<> Builder(&I);
- GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
+ GlobalVariable *Bias =
+ M->getGlobalVariable(getInstrProfCounterBiasVarName());
if (!Bias) {
// Compiler must define this variable when runtime counter relocation
// is being used. Runtime has a weak external reference that is used
// to check whether that's the case or not.
- Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(Int64Ty),
- getInstrProfCounterBiasVarName());
+ Bias = new GlobalVariable(
+ *M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(Int64Ty), getInstrProfCounterBiasVarName());
Bias->setVisibility(GlobalVariable::HiddenVisibility);
// A definition that's weak (linkonce_odr) without being in a COMDAT
// section wouldn't lead to link errors, but it would lead to a dead
@@ -839,8 +837,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
return false;
// Use linker script magic to get data/cnts/name start/end.
if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
- TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
- TT.isOSWindows())
+ TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || TT.isOSWindows())
return false;
return true;
@@ -849,13 +846,9 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
GlobalVariable *
InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
GlobalVariable *NamePtr = Inc->getName();
- auto It = ProfileDataMap.find(NamePtr);
- PerFunctionProfileData PD;
- if (It != ProfileDataMap.end()) {
- if (It->second.RegionCounters)
- return It->second.RegionCounters;
- PD = It->second;
- }
+ auto &PD = ProfileDataMap[NamePtr];
+ if (PD.RegionCounters)
+ return PD.RegionCounters;
// Match the linkage and visibility of the name global.
Function *Fn = Inc->getParent()->getParent();
@@ -922,6 +915,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
CounterPtr->setAlignment(Align(8));
MaybeSetComdat(CounterPtr);
CounterPtr->setLinkage(Linkage);
+ PD.RegionCounters = CounterPtr;
auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
// Allocate statically the array of pointers to value profile nodes for
@@ -1000,9 +994,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
MaybeSetComdat(Data);
Data->setLinkage(Linkage);
- PD.RegionCounters = CounterPtr;
PD.DataVar = Data;
- ProfileDataMap[NamePtr] = PD;
// Mark the data variable as used so that it isn't stripped out.
CompilerUsedVars.push_back(Data);
@@ -1013,7 +1005,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// Collect the referenced names to be used by emitNameData.
ReferencedNames.push_back(NamePtr);
- return CounterPtr;
+ return PD.RegionCounters;
}
void InstrProfiling::emitVNodes() {
@@ -1078,8 +1070,8 @@ void InstrProfiling::emitNameData() {
}
auto &Ctx = M->getContext();
- auto *NamesVal = ConstantDataArray::getString(
- Ctx, StringRef(CompressedNameStr), false);
+ auto *NamesVal =
+ ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false);
NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
GlobalValue::PrivateLinkage, NamesVal,
getInstrProfNamesVarName());
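
The computeNumValueSiteCounts simplification above leans on DenseMap::operator[] value-initializing missing entries to zero, so the find/insert dance collapses to a single std::max update. The same pattern with std::map, as a sketch:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, uint32_t> NumValueSites;
      auto Record = [&](const std::string &Name, uint64_t Index) {
        // operator[] default-constructs a zero count on first access.
        uint32_t &N = NumValueSites[Name];
        N = std::max(N, (uint32_t)(Index + 1));
      };
      Record("f", 3);
      Record("f", 1);
      assert(NumValueSites["f"] == 4);
      return 0;
    }
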
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index f98e39d751f4..180012198c42 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -110,7 +110,7 @@ namespace {
/// the module.
struct ThreadSanitizer {
ThreadSanitizer() {
- // Sanity check options and warn user.
+ // Check options and warn user.
if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) {
errs()
<< "warning: Option -tsan-compound-read-before-write has no effect "
diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 74e4eb07b219..4921209f041b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -94,11 +94,9 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
return false;
} else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
// For calls, just check the arguments (and not the callee operand).
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
- const Value *Op = *OI;
+ for (const Value *Op : CS->args())
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
- }
return false;
} else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// Special-case stores, because we don't care about the stored value, just
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index ca9567dc7ac8..a3fd97079b1d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -52,6 +52,11 @@ using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
+static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned(
+ "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden,
+ cl::desc("Enables canonicalization of signed relational predicates to "
+ "unsigned (e.g. sgt => ugt)"));
+
STATISTIC(NumPhis, "Number of phis propagated");
STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
STATISTIC(NumSelects, "Number of selects propagated");
@@ -64,7 +69,8 @@ STATISTIC(NumSDivSRemsNarrowed,
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumUDivURemsNarrowed,
"Number of udivs/urems whose width was decreased");
-STATISTIC(NumAShrs, "Number of ashr converted to lshr");
+STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr");
+STATISTIC(NumAShrsRemoved, "Number of ashr removed");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumSExt, "Number of sext converted to zext");
STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned");
@@ -297,6 +303,9 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
}
static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
+ if (!CanonicalizeICmpPredicatesToUnsigned)
+ return false;
+
// Only for signed relational comparisons of scalar integers.
if (Cmp->getType()->isVectorTy() ||
!Cmp->getOperand(0)->getType()->isIntegerTy())
@@ -376,13 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
// ConstantFoldTerminator() as the underlying SwitchInst can be changed.
SwitchInstProfUpdateWrapper SI(*I);
- APInt Low =
- APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits());
- APInt High =
- APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits());
-
- SwitchInst::CaseIt CI = SI->case_begin();
- for (auto CE = SI->case_end(); CI != CE;) {
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
LazyValueInfo::Tristate State =
LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
@@ -415,28 +418,9 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
break;
}
- // Get Lower/Upper bound from switch cases.
- Low = APIntOps::smin(Case->getValue(), Low);
- High = APIntOps::smax(Case->getValue(), High);
-
// Increment the case iterator since we didn't delete it.
++CI;
}
-
- // Try to simplify default case as unreachable
- if (CI == SI->case_end() && SI->getNumCases() != 0 &&
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) {
- const ConstantRange SIRange =
- LVI->getConstantRange(SI->getCondition(), SI);
-
- // If the numbered switch cases cover the entire range of the condition,
- // then the default case is not reachable.
- if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High &&
- SI->getNumCases() == High - Low + 1) {
- createUnreachableSwitchDefault(SI, &DTU);
- Changed = true;
- }
- }
}
if (Changed)
@@ -688,7 +672,7 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
ArgNo++;
}
- assert(ArgNo == CB.arg_size() && "sanity check");
+ assert(ArgNo == CB.arg_size() && "Call arguments not processed correctly.");
if (ArgNos.empty())
return Changed;
@@ -954,10 +938,22 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy())
return false;
+ ConstantRange LRange = LVI->getConstantRange(SDI->getOperand(0), SDI);
+ unsigned OrigWidth = SDI->getType()->getIntegerBitWidth();
+ ConstantRange NegOneOrZero =
+ ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1));
+ if (NegOneOrZero.contains(LRange)) {
+ // ashr of -1 or 0 never changes the value, so drop the whole instruction
+ ++NumAShrsRemoved;
+ SDI->replaceAllUsesWith(SDI->getOperand(0));
+ SDI->eraseFromParent();
+ return true;
+ }
+
if (!isNonNegative(SDI->getOperand(0), LVI, SDI))
return false;
- ++NumAShrs;
+ ++NumAShrsConverted;
auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
BO->setDebugLoc(SDI->getDebugLoc());
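
The new processAShr case exploits the fact that an arithmetic shift of -1 or 0 is the identity, so when LVI proves the operand lies in [-1, 0] the shift can be deleted outright. A quick check of the identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Assumes >> on a negative int is an arithmetic shift (true on
      // mainstream compilers and guaranteed since C++20).
      for (int32_t V : {-1, 0})
        for (unsigned Sh = 0; Sh < 31; ++Sh)
          assert((V >> Sh) == V); // ashr of -1 or 0 never changes the value
      return 0;
    }
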
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a8ec8bb97970..e0d3a6accadd 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -159,52 +159,22 @@ static cl::opt<unsigned> MemorySSAPathCheckLimit(
cl::desc("The maximum number of blocks to check when trying to prove that "
"all paths to an exit go through a killing block (default = 50)"));
+// This flag allows or disallows DSE to optimize MemorySSA during its
+// traversal. Note that DSE optimizing MemorySSA may impact other passes
+// downstream of the DSE invocation and can lead to issues not being
+// reproducible in isolation (i.e. when MemorySSA is built from scratch). In
+// those cases, the flag can be used to check if DSE's MemorySSA optimizations
+// impact follow-up passes.
+static cl::opt<bool>
+ OptimizeMemorySSA("dse-optimize-memoryssa", cl::init(true), cl::Hidden,
+ cl::desc("Allow DSE to optimize memory accesses."));
+
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
using OverlapIntervalsTy = std::map<int64_t, int64_t>;
using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
-/// Does this instruction write some memory? This only returns true for things
-/// that we can analyze with other helpers below.
-static bool hasAnalyzableMemoryWrite(Instruction *I,
- const TargetLibraryInfo &TLI) {
- if (isa<StoreInst>(I))
- return true;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- case Intrinsic::memcpy_inline:
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::init_trampoline:
- case Intrinsic::lifetime_end:
- case Intrinsic::masked_store:
- return true;
- }
- }
- if (auto *CB = dyn_cast<CallBase>(I)) {
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
- return true;
- default:
- return false;
- }
- }
- }
- return false;
-}
-
/// If the value of this instruction and the memory it writes to is unused, may
/// we delete this instruction?
static bool isRemovable(Instruction *I) {
@@ -214,7 +184,7 @@ static bool isRemovable(Instruction *I) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
+ default: llvm_unreachable("Does not have LocForWrite");
case Intrinsic::lifetime_end:
// Never remove dead lifetime_end's, e.g. because it is followed by a
// free.
@@ -296,6 +266,7 @@ enum OverwriteResult {
OW_End,
OW_PartialEarlierWithFullLater,
OW_MaybePartial,
+ OW_None,
OW_Unknown
};
@@ -841,7 +812,7 @@ struct DSEState {
/// Keep track of instructions (partly) overlapping with killing MemoryDefs per
/// basic block.
- DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+ MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs;
// Class contains self-reference, make sure it's not copied/moved.
DSEState(const DSEState &) = delete;
@@ -889,6 +860,7 @@ struct DSEState {
/// Return OW_MaybePartial if \p KillingI does not completely overwrite
/// \p DeadI, but they both write to the same underlying object. In that
/// case, use isPartialOverwrite to check if \p KillingI partially overwrites
+ /// \p DeadI. Returns 'OW_None' if \p KillingI is known to not overwrite the
/// \p DeadI. Returns 'OW_Unknown' if nothing can be determined.
OverwriteResult isOverwrite(const Instruction *KillingI,
const Instruction *DeadI,
@@ -951,8 +923,16 @@ struct DSEState {
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
- if (DeadUndObj != KillingUndObj)
+ if (DeadUndObj != KillingUndObj) {
+ // Non-aliasing stores to different objects don't overlap. Note that
+ // if the killing store is known to overwrite the whole object (an
+ // out-of-bounds access overwrites the whole object as well), then it is
+ // assumed to completely overwrite any store to the same object, even if
+ // they don't actually alias (see next check).
+ if (AAR == AliasResult::NoAlias)
+ return OW_None;
return OW_Unknown;
+ }
// If the KillingI store is to a recognizable object, get its size.
uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
@@ -1006,9 +986,8 @@ struct DSEState {
return OW_MaybePartial;
}
- // Can reach here only if accesses are known not to overlap. There is no
- // dedicated code to indicate no overlap so signal "unknown".
- return OW_Unknown;
+ // Can reach here only if accesses are known not to overlap.
+ return OW_None;
}
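The payoff of the new OW_None result is easiest to see in a minimal sketch of a
three-way interval classification (illustrative code only; plain offsets and
sizes stand in for the MemoryLocation reasoning in isOverwrite above):

    #include <cstdint>

    // Illustrative mirror of the enum above: "known not to overwrite"
    // (OW_None) is now distinct from "cannot be determined" (OW_Unknown).
    enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_None, OW_Unknown };

    // Classify how [KillingOff, KillingOff + KillingSize) relates to
    // [DeadOff, DeadOff + DeadSize) within the same underlying object.
    OverwriteResult classifyOverlap(int64_t KillingOff, int64_t KillingSize,
                                    int64_t DeadOff, int64_t DeadSize) {
      if (KillingOff <= DeadOff &&
          DeadOff + DeadSize <= KillingOff + KillingSize)
        return OW_Complete;   // the killing store covers the dead store
      if (KillingOff + KillingSize <= DeadOff ||
          DeadOff + DeadSize <= KillingOff)
        return OW_None;       // provably disjoint: no overwrite at all
      return OW_MaybePartial; // the ranges intersect without containment
    }

Because OW_None means "provably no overwrite", the walk below can keep its
CanOptimize flag set when it sees one, whereas an OW_Unknown result must
clear it.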
bool isInvisibleToCallerAfterRet(const Value *V) {
@@ -1304,6 +1283,15 @@ struct DSEState {
Instruction *KillingI = KillingDef->getMemoryInst();
LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
+ // Only optimize the defining access of KillingDef when starting directly at
+ // its defining access. The defining access must also only access KillingLoc.
+ // At the moment we only support instructions with a single write location, so
+ // it should be sufficient to disable optimizations for instructions that
+ // also read from memory.
+ bool CanOptimize = OptimizeMemorySSA &&
+ KillingDef->getDefiningAccess() == StartAccess &&
+ !KillingI->mayReadFromMemory();
+
// Find the next clobbering Mod access for DefLoc, starting at StartAccess.
Optional<MemoryLocation> CurrentLoc;
for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) {
@@ -1345,8 +1333,10 @@ struct DSEState {
Instruction *CurrentI = CurrentDef->getMemoryInst();
if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj),
- TLI))
+ TLI)) {
+ CanOptimize = false;
continue;
+ }
// Before we try to remove anything, check for any extra throwing
// instructions that block us from DSEing
@@ -1380,15 +1370,13 @@ struct DSEState {
return None;
}
- // If Current cannot be analyzed or is not removable, check the next
- // candidate.
- if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI))
- continue;
-
- // If Current does not have an analyzable write location, skip it
+ // If Current does not have an analyzable write location or is not
+ // removable, skip it.
CurrentLoc = getLocForWriteEx(CurrentI);
- if (!CurrentLoc)
+ if (!CurrentLoc || !isRemovable(CurrentI)) {
+ CanOptimize = false;
continue;
+ }
// AliasAnalysis does not account for loops. Limit elimination to
// candidates for which we can guarantee they always store to the same
@@ -1396,6 +1384,7 @@ struct DSEState {
if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n");
WalkerStepLimit -= 1;
+ CanOptimize = false;
continue;
}
@@ -1403,16 +1392,32 @@ struct DSEState {
// If the killing def is a memory terminator (e.g. lifetime.end), check
// the next candidate if the current Current does not write the same
// underlying object as the terminator.
- if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI))
+ if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) {
+ CanOptimize = false;
continue;
+ }
} else {
int64_t KillingOffset = 0;
int64_t DeadOffset = 0;
auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc,
KillingOffset, DeadOffset);
+ if (CanOptimize) {
+ // CurrentDef is the earliest write clobber of KillingDef. Use it as
+ // optimized access. Do not optimize if CurrentDef is already the
+ // defining access of KillingDef.
+ if (CurrentDef != KillingDef->getDefiningAccess() &&
+ (OR == OW_Complete || OR == OW_MaybePartial))
+ KillingDef->setOptimized(CurrentDef);
+
+ // Once a may-aliasing def is encountered, do not set an optimized
+ // access.
+ if (OR != OW_None)
+ CanOptimize = false;
+ }
+
// If Current does not write to the same object as KillingDef, check
// the next candidate.
- if (OR == OW_Unknown)
+ if (OR == OW_Unknown || OR == OW_None)
continue;
else if (OR == OW_MaybePartial) {
// If KillingDef only partially overwrites Current, check the next
@@ -1421,6 +1426,7 @@ struct DSEState {
// which are less likely to be removable in the end.
if (PartialLimit <= 1) {
WalkerStepLimit -= 1;
+ LLVM_DEBUG(dbgs() << " ... reached partial limit ... continue with next access\n");
continue;
}
PartialLimit -= 1;
@@ -1922,7 +1928,14 @@ struct DSEState {
if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) ||
!isRemovable(Def->getMemoryInst()))
continue;
- auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess());
+ MemoryDef *UpperDef;
+ // To conserve compile-time, we avoid walking to the next clobbering def.
+ // Instead, we just try to get the optimized access, if it exists. DSE
+ // will try to optimize defs during the earlier traversal.
+ if (Def->isOptimized())
+ UpperDef = dyn_cast<MemoryDef>(Def->getOptimized());
+ else
+ UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess());
if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef))
continue;
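The compile-time shortcut above, condensed into a hedged sketch (only the
MemorySSA accessors that appear in this hunk are used; the helper name is
illustrative):

    #include "llvm/Analysis/MemorySSA.h"

    // Prefer the access recorded via setOptimized() during the main DSE
    // traversal over re-walking defining accesses one clobber at a time.
    static llvm::MemoryDef *getUpperDef(llvm::MemoryDef *Def) {
      llvm::MemoryAccess *Next = Def->isOptimized()
                                     ? Def->getOptimized()
                                     : Def->getDefiningAccess();
      return llvm::dyn_cast<llvm::MemoryDef>(Next);
    }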
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index ae2fe2767074..7001d330fce0 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1951,7 +1951,6 @@ bool IndVarSimplify::run(Loop *L) {
// using it.
if (!DisableLFTR) {
BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -1987,7 +1986,7 @@ bool IndVarSimplify::run(Loop *L) {
// Avoid high cost expansions. Note: This heuristic is questionable in
// that our definition of "high cost" is not exactly principled.
if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
- TTI, PreHeaderBR))
+ TTI, PreHeader->getTerminator()))
continue;
// Check preconditions for proper SCEVExpander operation. SCEV does not
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp
index bf714d167670..6f97f3e93123 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -486,7 +486,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
// Check that neither this loop nor its parent have had LCSSA broken. LICM is
// specifically moving instructions across the loop boundary and so it is
- // especially in need of sanity checking here.
+ // especially in need of basic functional correctness checking here.
assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) &&
"Parent loop not left in LCSSA form after LICM!");
@@ -1860,6 +1860,7 @@ class LoopPromoter : public LoadAndStorePromoter {
bool UnorderedAtomic;
AAMDNodes AATags;
ICFLoopSafetyInfo &SafetyInfo;
+ bool CanInsertStoresInExitBlocks;
// We're about to add a use of V in a loop exit block. Insert an LCSSA phi
// (if legal) if doing so would add an out-of-loop use to an instruction
@@ -1886,12 +1887,13 @@ public:
SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl,
Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags,
- ICFLoopSafetyInfo &SafetyInfo)
+ ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
- SafetyInfo(SafetyInfo) {}
+ SafetyInfo(SafetyInfo),
+ CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -1903,7 +1905,7 @@ public:
return PointerMustAliases.count(Ptr);
}
- void doExtraRewritesBeforeFinalDeletion() override {
+ void insertStoresInLoopExitBlocks() {
// Insert stores in the loop exit blocks. Each exit block gets a
// store of the live-out values that feed them. Since we've already told
// the SSA updater about the defs in the loop and the preheader
@@ -1937,10 +1939,21 @@ public:
}
}
+ void doExtraRewritesBeforeFinalDeletion() override {
+ if (CanInsertStoresInExitBlocks)
+ insertStoresInLoopExitBlocks();
+ }
+
void instructionDeleted(Instruction *I) const override {
SafetyInfo.removeInstruction(I);
MSSAU->removeMemoryAccess(I);
}
+
+ bool shouldDelete(Instruction *I) const override {
+ if (isa<StoreInst>(I))
+ return CanInsertStoresInExitBlocks;
+ return true;
+ }
};
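The effect of the new shouldDelete() hook, reduced to a toy model (the struct
and flag below are stand-ins for the LLVM classes above, not the real
promoter):

    #include <vector>

    // Toy model: the promoter rewrites every access, but deletes a store
    // only when it was also allowed to materialize replacement stores in the
    // loop exit blocks; otherwise the store stays (load-only promotion).
    struct Access { bool IsStore; bool Deleted = false; };

    void finishPromotion(std::vector<Access> &Accesses,
                         bool CanInsertStoresInExitBlocks) {
      for (Access &A : Accesses) {
        if (A.IsStore && !CanInsertStoresInExitBlocks)
          continue;       // keep the store in place
        A.Deleted = true; // loads (and sinkable stores) become redundant
      }
    }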
bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L,
@@ -2039,6 +2052,7 @@ bool llvm::promoteLoopAccessesToScalars(
bool DereferenceableInPH = false;
bool SafeToInsertStore = false;
+ bool FoundLoadToPromote = false;
SmallVector<Instruction *, 64> LoopUses;
@@ -2067,16 +2081,11 @@ bool llvm::promoteLoopAccessesToScalars(
IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
}
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
+ // Check that all accesses to pointers in the alias set use the same type.
+ // We cannot (yet) promote a memory location that is loaded and stored in
// different sizes. While we are at it, collect alignment and AA info.
+ Type *AccessTy = nullptr;
for (Value *ASIV : PointerMustAliases) {
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
- // different sizes.
- if (SomePtr->getType() != ASIV->getType())
- return false;
-
for (User *U : ASIV->users()) {
// Ignore instructions that are outside the loop.
Instruction *UI = dyn_cast<Instruction>(U);
@@ -2091,6 +2100,7 @@ bool llvm::promoteLoopAccessesToScalars(
SawUnorderedAtomic |= Load->isAtomic();
SawNotAtomic |= !Load->isAtomic();
+ FoundLoadToPromote = true;
Align InstAlignment = Load->getAlign();
@@ -2153,6 +2163,11 @@ bool llvm::promoteLoopAccessesToScalars(
} else
return false; // Not a load or store.
+ if (!AccessTy)
+ AccessTy = getLoadStoreType(UI);
+ else if (AccessTy != getLoadStoreType(UI))
+ return false;
+
// Merge the AA tags.
if (LoopUses.empty()) {
// On the first load/store, just take its AA tags.
@@ -2175,9 +2190,7 @@ bool llvm::promoteLoopAccessesToScalars(
// If we're inserting an atomic load in the preheader, we must be able to
// lower it. We're only guaranteed to be able to lower naturally aligned
// atomics.
- auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
- if (SawUnorderedAtomic &&
- Alignment < MDL.getTypeStoreSize(SomePtrElemType))
+ if (SawUnorderedAtomic && Alignment < MDL.getTypeStoreSize(AccessTy))
return false;
// If we couldn't prove we can hoist the load, bail.
@@ -2199,13 +2212,20 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
- // If we've still failed to prove we can sink the store, give up.
- if (!SafeToInsertStore)
+ // If we've still failed to prove we can sink the store, fall back to
+ // promoting only the load, if possible.
+ if (!SafeToInsertStore && !FoundLoadToPromote)
+ // If we cannot hoist the load either, give up.
return false;
- // Otherwise, this is safe to promote, lets do it!
- LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
+ // Let's do the promotion!
+ if (SafeToInsertStore)
+ LLVM_DEBUG(dbgs() << "LICM: Promoting load/store of the value: " << *SomePtr
+ << '\n');
+ else
+ LLVM_DEBUG(dbgs() << "LICM: Promoting load of the value: " << *SomePtr
+ << '\n');
+
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
LoopUses[0])
@@ -2224,13 +2244,14 @@ bool llvm::promoteLoopAccessesToScalars(
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL,
- Alignment, SawUnorderedAtomic, AATags, *SafetyInfo);
+ Alignment, SawUnorderedAtomic, AATags, *SafetyInfo,
+ SafeToInsertStore);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
LoadInst *PreheaderLoad = new LoadInst(
- SomePtr->getType()->getPointerElementType(), SomePtr,
- SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ AccessTy, SomePtr, SomePtr->getName() + ".promoted",
+ Preheader->getTerminator());
if (SawUnorderedAtomic)
PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
PreheaderLoad->setAlignment(Alignment);
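The access-type discovery added above, extracted into a hedged helper
(getLoadStoreType() is the LLVM helper declared in llvm/IR/Instructions.h; the
wrapper name commonAccessType is illustrative, and all inputs are assumed to
be loads or stores):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instructions.h"

    // All promoted accesses must agree on one access type, discovered from
    // the load/store instructions themselves rather than from the pointer's
    // pointee type (which opaque pointers are phasing out).
    static llvm::Type *
    commonAccessType(llvm::ArrayRef<llvm::Instruction *> Uses) {
      llvm::Type *AccessTy = nullptr;
      for (llvm::Instruction *UI : Uses) {
        llvm::Type *Ty = llvm::getLoadStoreType(UI);
        if (!AccessTy)
          AccessTy = Ty;
        else if (AccessTy != Ty)
          return nullptr; // mixed access types: promotion must bail out
      }
      return AccessTy;
    }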
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 3df4cfe8e4c1..6c783848432b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -49,9 +49,17 @@ void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>::printPipeline(raw_ostream &OS,
function_ref<StringRef(StringRef)>
MapClassName2PassName) {
- for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) {
- auto *P = LoopPasses[Idx].get();
- P->printPipeline(OS, MapClassName2PassName);
+ assert(LoopPasses.size() + LoopNestPasses.size() == IsLoopNestPass.size());
+
+ unsigned IdxLP = 0, IdxLNP = 0;
+ for (unsigned Idx = 0, Size = IsLoopNestPass.size(); Idx != Size; ++Idx) {
+ if (IsLoopNestPass[Idx]) {
+ auto *P = LoopNestPasses[IdxLNP++].get();
+ P->printPipeline(OS, MapClassName2PassName);
+ } else {
+ auto *P = LoopPasses[IdxLP++].get();
+ P->printPipeline(OS, MapClassName2PassName);
+ }
if (Idx + 1 < Size)
OS << ",";
}
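A self-contained model of the interleaving (pass names are invented for
illustration):

    #include <cstdio>
    #include <string>
    #include <vector>

    // One bit per pass records whether it lives in the loop-nest vector or
    // the loop vector, so printing can reproduce the original registration
    // order from the two separate arrays.
    int main() {
      std::vector<std::string> LoopPasses = {"licm", "loop-rotate"};
      std::vector<std::string> LoopNestPasses = {"loop-interchange"};
      std::vector<bool> IsLoopNestPass = {false, true, false};

      size_t IdxLP = 0, IdxLNP = 0;
      for (size_t Idx = 0; Idx < IsLoopNestPass.size(); ++Idx) {
        const std::string &P = IsLoopNestPass[Idx] ? LoopNestPasses[IdxLNP++]
                                                   : LoopPasses[IdxLP++];
        std::printf("%s%s", P.c_str(),
                    Idx + 1 < IsLoopNestPass.size() ? "," : "\n");
      }
      // Prints: licm,loop-interchange,loop-rotate
    }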
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index a87843d658a9..728d63fe2847 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -256,8 +256,8 @@ private:
}
}
- // Sanity check: amount of dead and live loop blocks should match the total
- // number of blocks in loop.
+ // The number of dead and live loop blocks should match the total number
+ // of blocks in the loop.
assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
"Malformed block sets?");
@@ -305,7 +305,6 @@ private:
BlocksInLoopAfterFolding.insert(BB);
}
- // Sanity check: header must be in loop.
assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
"Header not in loop?");
assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 67702520511b..39c8b65968aa 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -806,28 +806,27 @@ static Optional<unsigned> shouldFullUnroll(
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
const TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(FullUnrollTripCount && "should be non-zero!");
- if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
- // When computing the unrolled size, note that BEInsns are not replicated
- // like the rest of the loop body.
- if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) {
- return FullUnrollTripCount;
+ if (FullUnrollTripCount > UP.FullUnrollMaxCount)
+ return None;
- } else {
- // The loop isn't that small, but we still can fully unroll it if that
- // helps to remove a significant number of instructions.
- // To check that, run additional analysis on the loop.
- if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, EphValues, TTI,
- UP.Threshold * UP.MaxPercentThresholdBoost / 100,
- UP.MaxIterationsCountToAnalyze)) {
- unsigned Boost =
- getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
- if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- return FullUnrollTripCount;
- }
- }
- }
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
+ return FullUnrollTripCount;
+
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100,
+ UP.MaxIterationsCountToAnalyze)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
+ return FullUnrollTripCount;
}
return None;
}
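The restructured control flow, reduced to a sketch (std::optional replaces
llvm::Optional here, and the size/threshold model stands in for
UnrollCostEstimator and analyzeLoopUnrollCost):

    #include <cstdint>
    #include <optional>

    std::optional<unsigned> shouldFullUnrollSketch(unsigned TripCount,
                                                   uint64_t UnrolledSize,
                                                   uint64_t Threshold,
                                                   unsigned MaxCount,
                                                   unsigned BoostPercent) {
      if (TripCount > MaxCount)
        return std::nullopt; // early exit replaces the old nested ifs
      if (UnrolledSize < Threshold)
        return TripCount;    // small enough to always unroll
      if (UnrolledSize < Threshold * BoostPercent / 100)
        return TripCount;    // bigger, but the boosted analysis approves
      return std::nullopt;
    }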
@@ -837,51 +836,48 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
const UnrollCostEstimator UCE,
const TargetTransformInfo::UnrollingPreferences &UP) {
+ if (!TripCount)
+ return None;
+
+ if (!UP.Partial) {
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ return 0;
+ }
unsigned count = UP.Count;
- if (TripCount) {
- if (!UP.Partial) {
- LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- count = 0;
- return count;
- }
- if (count == 0)
- count = TripCount;
- if (UP.PartialThreshold != NoThreshold) {
- // Reduce unroll count to be modulo of TripCount for partial unrolling.
- if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
- count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
- (LoopSize - UP.BEInsns);
- if (count > UP.MaxCount)
- count = UP.MaxCount;
- while (count != 0 && TripCount % count != 0)
- count--;
- if (UP.AllowRemainder && count <= 1) {
- // If there is no Count that is modulo of TripCount, set Count to
- // largest power-of-two factor that satisfies the threshold limit.
- // As we'll create fixup loop, do the type of unrolling only if
- // remainder loop is allowed.
- count = UP.DefaultUnrollRuntimeCount;
- while (count != 0 &&
- UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
- count >>= 1;
- }
- if (count < 2) {
- count = 0;
- }
- } else {
- count = TripCount;
- }
+ if (count == 0)
+ count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+ // Reduce unroll count to be modulo of TripCount for partial unrolling.
+ if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
+ count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
if (count > UP.MaxCount)
count = UP.MaxCount;
-
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");
-
- return count;
+ while (count != 0 && TripCount % count != 0)
+ count--;
+ if (UP.AllowRemainder && count <= 1) {
+ // If there is no Count that is modulo of TripCount, set Count to
+ // largest power-of-two factor that satisfies the threshold limit.
+ // As we'll create fixup loop, do the type of unrolling only if
+ // remainder loop is allowed.
+ count = UP.DefaultUnrollRuntimeCount;
+ while (count != 0 &&
+ UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
+ count >>= 1;
+ }
+ if (count < 2) {
+ count = 0;
+ }
+ } else {
+ count = TripCount;
}
+ if (count > UP.MaxCount)
+ count = UP.MaxCount;
- // if didn't return until here, should continue to other priorties
- return None;
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");
+
+ return count;
}
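A runnable toy version of the count-reduction logic (the fallback of 8 stands
in for UP.DefaultUnrollRuntimeCount, and the threshold handling is elided):

    #include <algorithm>
    #include <cstdio>

    // Shrink the candidate count until it divides the trip count; when a
    // remainder loop is allowed, fall back to a power-of-two count instead.
    unsigned partialUnrollCount(unsigned TripCount, unsigned Count,
                                unsigned MaxCount, bool AllowRemainder) {
      Count = std::min(Count, MaxCount);
      while (Count != 0 && TripCount % Count != 0)
        --Count;
      if (AllowRemainder && Count <= 1)
        Count = 8; // the real code also halves this while over threshold
      return Count < 2 ? 0 : Count;
    }

    int main() {
      std::printf("%u\n", partialUnrollCount(12, 8, 8, false)); // prints 6
    }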
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
@@ -900,7 +896,6 @@ bool llvm::computeUnrollCount(
TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
UnrollCostEstimator UCE(*L, LoopSize);
- Optional<unsigned> UnrollFactor;
const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
const bool PragmaFullUnroll = hasUnrollFullPragma(L);
@@ -926,9 +921,8 @@ bool llvm::computeUnrollCount(
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
// 2nd priority is unroll count set by pragma.
- UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP);
-
- if (UnrollFactor) {
+ if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
+ UCE, UP)) {
UP.Count = *UnrollFactor;
if (UserUnrollCount || (PragmaCount > 0)) {
@@ -948,11 +942,20 @@ bool llvm::computeUnrollCount(
}
}
- // 3rd priority is full unroll count.
- // Full unroll makes sense only when TripCount or its upper bound could be
- // statically calculated.
- // Also we need to check if we exceed FullUnrollMaxCount.
+ // 3rd priority is exact full unrolling. This will eliminate all copies
+ // of some exit test.
+ UP.Count = 0;
+ if (TripCount) {
+ UP.Count = TripCount;
+ if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
+ TripCount, UCE, UP)) {
+ UP.Count = *UnrollFactor;
+ UseUpperBound = false;
+ return ExplicitUnroll;
+ }
+ }
+ // 4th priority is bounded unrolling.
// We can unroll by the upper bound amount if it's generally allowed or if
// we know that the loop is executed either the upper bound or zero times.
// (MaxOrZero unrolling keeps only the first loop test, so the number of
@@ -961,37 +964,21 @@ bool llvm::computeUnrollCount(
// number of loop tests goes up which may end up being worse on targets with
// constrained branch predictor resources so is controlled by an option.)
// In addition we only unroll small upper bounds.
- unsigned FullUnrollMaxTripCount = MaxTripCount;
- if (!(UP.UpperBound || MaxOrZero) ||
- FullUnrollMaxTripCount > UnrollMaxUpperBound)
- FullUnrollMaxTripCount = 0;
-
- // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
- // compute the former when the latter is zero.
- unsigned ExactTripCount = TripCount;
- assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
- "ExtractTripCount and UnrollByMaxCount cannot both be non zero.");
-
- unsigned FullUnrollTripCount =
- ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
- UP.Count = FullUnrollTripCount;
-
- UnrollFactor =
- shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP);
-
- // if shouldFullUnroll can do the unrolling, some side parameteres should be
- // set
- if (UnrollFactor) {
- UP.Count = *UnrollFactor;
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- TripCount = FullUnrollTripCount;
- TripMultiple = UP.UpperBound ? 1 : TripMultiple;
- return ExplicitUnroll;
- } else {
- UP.Count = FullUnrollTripCount;
+ // Note that the cost of bounded unrolling is always strictly greater than
+ // the cost of exact full unrolling. As such, if we have an exact count and
+ // found it unprofitable, we'll never choose bounded unrolling.
+ if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
+ MaxTripCount <= UnrollMaxUpperBound) {
+ UP.Count = MaxTripCount;
+ if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
+ MaxTripCount, UCE, UP)) {
+ UP.Count = *UnrollFactor;
+ UseUpperBound = true;
+ return ExplicitUnroll;
+ }
}
- // 4th priority is loop peeling.
+ // 5th priority is loop peeling.
computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold);
if (PP.PeelCount) {
UP.Runtime = false;
@@ -1004,11 +991,9 @@ bool llvm::computeUnrollCount(
if (TripCount)
UP.Partial |= ExplicitUnroll;
- // 5th priority is partial unrolling.
+ // 6th priority is partial unrolling.
// Try partial unroll only when TripCount could be statically calculated.
- UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP);
-
- if (UnrollFactor) {
+ if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
UP.Count = *UnrollFactor;
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
@@ -1049,7 +1034,7 @@ bool llvm::computeUnrollCount(
"because loop has a runtime trip count.";
});
- // 6th priority is runtime unrolling.
+ // 7th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (hasRuntimeUnrollDisablePragma(L)) {
UP.Count = 0;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp
index b0fb8daaba8f..c354fa177a60 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -494,7 +494,7 @@ static bool LinearizeExprTree(Instruction *I,
SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
#ifndef NDEBUG
- SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
+ SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme.
#endif
while (!Worklist.empty()) {
std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
@@ -2313,11 +2313,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
MadeChange |= LinearizeExprTree(I, Tree);
SmallVector<ValueEntry, 8> Ops;
Ops.reserve(Tree.size());
- for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
- RepeatedValue E = Tree[i];
- Ops.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
- }
+ for (const RepeatedValue &E : Tree)
+ Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first));
LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 86d3620c312e..3799d2dd1cf2 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -227,8 +227,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
unsigned IterCnt = 0;
(void)IterCnt;
while (LocalChange) {
- assert(IterCnt++ < 1000 &&
- "Sanity: iterative simplification didn't converge!");
+ assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!");
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6469c899feea..d6d6b1a7fa09 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -235,22 +235,26 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
// These dominator edges will be redirected from Pred.
std::vector<DominatorTree::UpdateType> Updates;
if (DTU) {
- SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB));
+ // To avoid processing the same successor more than once.
+ SmallPtrSet<BasicBlock *, 8> SeenSuccs;
SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB),
succ_end(PredBB));
- Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1);
+ Updates.reserve(Updates.size() + 2 * succ_size(BB) + 1);
// Add insert edges first. Experimentally, for the particular case of two
// blocks that can be merged, with a single successor and single predecessor
// respectively, it is beneficial to have all insert updates first. Deleting
// edges first may lead to unreachable blocks, followed by inserting edges
// making the blocks reachable again. Such DT updates lead to high compile
// times. We add inserts before deletes here to reduce compile time.
- for (BasicBlock *SuccOfBB : SuccsOfBB)
+ for (BasicBlock *SuccOfBB : successors(BB))
// This successor of BB may already be a PredBB's successor.
if (!SuccsOfPredBB.contains(SuccOfBB))
- Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB});
- for (BasicBlock *SuccOfBB : SuccsOfBB)
- Updates.push_back({DominatorTree::Delete, BB, SuccOfBB});
+ if (SeenSuccs.insert(SuccOfBB).second)
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB});
+ SeenSuccs.clear();
+ for (BasicBlock *SuccOfBB : successors(BB))
+ if (SeenSuccs.insert(SuccOfBB).second)
+ Updates.push_back({DominatorTree::Delete, BB, SuccOfBB});
Updates.push_back({DominatorTree::Delete, PredBB, BB});
}
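The dedup-and-order pattern, minus the LLVM types (a terminator such as a
switch can list the same successor several times, hence the "seen" set):

    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
      std::vector<int> SuccsOfBB = {1, 2, 2, 3}; // duplicate switch edges
      std::set<int> Seen;
      std::vector<std::pair<const char *, int>> Updates;
      // Emit all Insert updates before any Delete updates; this ordering
      // keeps the DomTree updater cheap, as the comment above explains.
      for (int S : SuccsOfBB)
        if (Seen.insert(S).second)
          Updates.push_back({"Insert Pred->", S});
      Seen.clear();
      for (int S : SuccsOfBB)
        if (Seen.insert(S).second)
          Updates.push_back({"Delete BB->", S});
      for (auto &U : Updates)
        std::printf("%s%d\n", U.first, U.second);
    }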
@@ -804,14 +808,14 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
if (DTU) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
// Old dominates New. New node dominates all other nodes dominated by Old.
- SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New),
- succ_end(New));
+ SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld;
Updates.push_back({DominatorTree::Insert, Old, New});
- Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfOld.size());
- for (BasicBlock *UniqueSuccessorOfOld : UniqueSuccessorsOfOld) {
- Updates.push_back({DominatorTree::Insert, New, UniqueSuccessorOfOld});
- Updates.push_back({DominatorTree::Delete, Old, UniqueSuccessorOfOld});
- }
+ Updates.reserve(Updates.size() + 2 * succ_size(New));
+ for (BasicBlock *SuccessorOfOld : successors(New))
+ if (UniqueSuccessorsOfOld.insert(SuccessorOfOld).second) {
+ Updates.push_back({DominatorTree::Insert, New, SuccessorOfOld});
+ Updates.push_back({DominatorTree::Delete, Old, SuccessorOfOld});
+ }
DTU->applyUpdates(Updates);
} else if (DT)
@@ -870,14 +874,14 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt,
SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
// New dominates Old. The predecessor nodes of the Old node dominate
// New node.
- SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld(pred_begin(New),
- pred_end(New));
+ SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld;
DTUpdates.push_back({DominatorTree::Insert, New, Old});
- DTUpdates.reserve(DTUpdates.size() + 2 * UniquePredecessorsOfOld.size());
- for (BasicBlock *UniquePredecessorOfOld : UniquePredecessorsOfOld) {
- DTUpdates.push_back({DominatorTree::Insert, UniquePredecessorOfOld, New});
- DTUpdates.push_back({DominatorTree::Delete, UniquePredecessorOfOld, Old});
- }
+ DTUpdates.reserve(DTUpdates.size() + 2 * pred_size(New));
+ for (BasicBlock *PredecessorOfOld : predecessors(New))
+ if (UniquePredecessorsOfOld.insert(PredecessorOfOld).second) {
+ DTUpdates.push_back({DominatorTree::Insert, PredecessorOfOld, New});
+ DTUpdates.push_back({DominatorTree::Delete, PredecessorOfOld, Old});
+ }
DTU->applyUpdates(DTUpdates);
@@ -910,13 +914,14 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
} else {
// Split block expects NewBB to have a non-empty set of predecessors.
SmallVector<DominatorTree::UpdateType, 8> Updates;
- SmallPtrSet<BasicBlock *, 8> UniquePreds(Preds.begin(), Preds.end());
+ SmallPtrSet<BasicBlock *, 8> UniquePreds;
Updates.push_back({DominatorTree::Insert, NewBB, OldBB});
- Updates.reserve(Updates.size() + 2 * UniquePreds.size());
- for (auto *UniquePred : UniquePreds) {
- Updates.push_back({DominatorTree::Insert, UniquePred, NewBB});
- Updates.push_back({DominatorTree::Delete, UniquePred, OldBB});
- }
+ Updates.reserve(Updates.size() + 2 * Preds.size());
+ for (auto *Pred : Preds)
+ if (UniquePreds.insert(Pred).second) {
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ Updates.push_back({DominatorTree::Delete, Pred, OldBB});
+ }
DTU->applyUpdates(Updates);
}
} else if (DT) {
@@ -1376,14 +1381,14 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
BasicBlock *Head = SplitBefore->getParent();
BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
if (DTU) {
- SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail),
- succ_end(Tail));
+ SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead;
Updates.push_back({DominatorTree::Insert, Head, Tail});
- Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfHead.size());
- for (BasicBlock *UniqueSuccessorOfHead : UniqueSuccessorsOfHead) {
- Updates.push_back({DominatorTree::Insert, Tail, UniqueSuccessorOfHead});
- Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead});
- }
+ Updates.reserve(Updates.size() + 2 * succ_size(Tail));
+ for (BasicBlock *SuccessorOfHead : successors(Tail))
+ if (UniqueSuccessorsOfHead.insert(SuccessorOfHead).second) {
+ Updates.push_back({DominatorTree::Insert, Tail, SuccessorOfHead});
+ Updates.push_back({DominatorTree::Delete, Head, SuccessorOfHead});
+ }
}
Instruction *HeadOldTerm = Head->getTerminator();
LLVMContext &C = Head->getContext();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 957935398972..580cfd80141e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -452,18 +452,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_mempcpy:
case LibFunc_memccpy:
+ Changed |= setWillReturn(F);
+ LLVM_FALLTHROUGH;
+ case LibFunc_memcpy_chk:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setWillReturn(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotAlias(F, 1);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc_memcpy_chk:
- Changed |= setDoesNotThrow(F);
- return Changed;
case LibFunc_memalign:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
@@ -1018,9 +1017,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- // TODO: add LibFunc entries for:
- // case LibFunc_memset_pattern4:
- // case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern4:
+ case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 0);
@@ -1029,10 +1027,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_memset:
- Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotThrow(F);
+ LLVM_FALLTHROUGH;
+ case LibFunc_memset_chk:
+ Changed |= setOnlyAccessesArgMemory(F);
Changed |= setOnlyWritesMemory(F, 0);
+ Changed |= setDoesNotThrow(F);
return Changed;
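The shape of both fallthrough refactors in this file, as a hedged sketch (the
enum values and attribute bits are illustrative stand-ins; willreturn stays on
the non-_chk variant only, presumably because the fortified check may abort):

    enum class Fn { Memset, MemsetChk };

    unsigned inferAttrsSketch(Fn F) {
      unsigned Attrs = 0;
      switch (F) {
      case Fn::Memset:
        Attrs |= 1; // willreturn: sound for memset, not for memset_chk
        [[fallthrough]];
      case Fn::MemsetChk:
        Attrs |= 2; // argmemonly
        Attrs |= 4; // writeonly on arg 0
        Attrs |= 8; // nounwind
        break;
      }
      return Attrs;
    }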
// int __nvvm_reflect(const char *)
case LibFunc_nvvm_reflect:
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp
index 200deca4b317..57c273a0e3c5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -135,10 +135,18 @@ std::unique_ptr<Module> llvm::CloneModule(
// Similarly, copy over function bodies now...
//
for (const Function &I : M) {
- if (I.isDeclaration())
+ Function *F = cast<Function>(VMap[&I]);
+
+ if (I.isDeclaration()) {
+ // Copy over metadata for declarations since we're not doing it below in
+ // CloneFunctionInto().
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ I.getAllMetadata(MDs);
+ for (auto MD : MDs)
+ F->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
continue;
+ }
- Function *F = cast<Function>(VMap[&I]);
if (!ShouldCloneDefinition(&I)) {
// Skip after setting the correct linkage for an external reference.
F->setLinkage(GlobalValue::ExternalLinkage);
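The declaration path, condensed into a helper-style sketch (it assumes the
CloneModule setup above, where VMap already maps the old function to F;
getAllMetadata, addMetadata and MapMetadata are the APIs used in the hunk):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"

    // Declarations never reach CloneFunctionInto(), so their metadata
    // attachments must be copied and remapped by hand.
    static void copyDeclMetadata(const llvm::Function &OldF,
                                 llvm::Function &NewF,
                                 llvm::ValueToValueMapTy &VMap) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode *>, 1> MDs;
      OldF.getAllMetadata(MDs);
      for (auto &MD : MDs)
        NewF.addMetadata(MD.first, *llvm::MapMetadata(MD.second, VMap));
    }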
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp
index 4dbcbf80d3da..7c310f16d46e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp
@@ -74,7 +74,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
{}, {}, nullptr, "widenable_cond");
CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
"exiplicit_guard_cond"));
- assert(isWidenableBranch(CheckBI) && "sanity check");
+ assert(isWidenableBranch(CheckBI) && "Branch must be widenable.");
}
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f4776589910f..997667810580 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1218,10 +1218,9 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
if (!RI || !isa<CallBase>(RI->getOperand(0)))
continue;
auto *RetVal = cast<CallBase>(RI->getOperand(0));
- // Sanity check that the cloned RetVal exists and is a call, otherwise we
- // cannot add the attributes on the cloned RetVal.
- // Simplification during inlining could have transformed the cloned
- // instruction.
+ // Check that the cloned RetVal exists and is a call, otherwise we cannot
+ // add the attributes on the cloned RetVal. Simplification during inlining
+ // could have transformed the cloned instruction.
auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
if (!NewRetVal)
continue;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
index 74ab37fadf36..ec926b1f5a94 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp
@@ -529,8 +529,8 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
std::function<void(Value *)> AboutToDeleteCallback) {
unsigned S = 0, E = DeadInsts.size(), Alive = 0;
for (; S != E; ++S) {
- auto *I = cast<Instruction>(DeadInsts[S]);
- if (!isInstructionTriviallyDead(I)) {
+ auto *I = dyn_cast<Instruction>(DeadInsts[S]);
+ if (!I || !isInstructionTriviallyDead(I)) {
DeadInsts[S] = nullptr;
++Alive;
}
@@ -760,15 +760,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
SmallVector<DominatorTree::UpdateType, 32> Updates;
if (DTU) {
- SmallPtrSet<BasicBlock *, 2> PredsOfPredBB(pred_begin(PredBB),
- pred_end(PredBB));
- Updates.reserve(Updates.size() + 2 * PredsOfPredBB.size() + 1);
- for (BasicBlock *PredOfPredBB : PredsOfPredBB)
+ // To avoid processing the same predecessor more than once.
+ SmallPtrSet<BasicBlock *, 2> SeenPreds;
+ Updates.reserve(Updates.size() + 2 * pred_size(PredBB) + 1);
+ for (BasicBlock *PredOfPredBB : predecessors(PredBB))
// This predecessor of PredBB may already have DestBB as a successor.
if (PredOfPredBB != PredBB)
- Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB});
- for (BasicBlock *PredOfPredBB : PredsOfPredBB)
- Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB});
+ if (SeenPreds.insert(PredOfPredBB).second)
+ Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB});
+ SeenPreds.clear();
+ for (BasicBlock *PredOfPredBB : predecessors(PredBB))
+ if (SeenPreds.insert(PredOfPredBB).second)
+ Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB});
Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
}
@@ -1096,16 +1099,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
SmallVector<DominatorTree::UpdateType, 32> Updates;
if (DTU) {
+ // To avoid processing the same predecessor more than once.
+ SmallPtrSet<BasicBlock *, 8> SeenPreds;
// All predecessors of BB will be moved to Succ.
- SmallPtrSet<BasicBlock *, 8> PredsOfBB(pred_begin(BB), pred_end(BB));
SmallPtrSet<BasicBlock *, 8> PredsOfSucc(pred_begin(Succ), pred_end(Succ));
- Updates.reserve(Updates.size() + 2 * PredsOfBB.size() + 1);
- for (auto *PredOfBB : PredsOfBB)
+ Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1);
+ for (auto *PredOfBB : predecessors(BB))
// This predecessor of BB may already have Succ as a successor.
if (!PredsOfSucc.contains(PredOfBB))
- Updates.push_back({DominatorTree::Insert, PredOfBB, Succ});
- for (auto *PredOfBB : PredsOfBB)
- Updates.push_back({DominatorTree::Delete, PredOfBB, BB});
+ if (SeenPreds.insert(PredOfBB).second)
+ Updates.push_back({DominatorTree::Insert, PredOfBB, Succ});
+ SeenPreds.clear();
+ for (auto *PredOfBB : predecessors(BB))
+ if (SeenPreds.insert(PredOfBB).second)
+ Updates.push_back({DominatorTree::Delete, PredOfBB, BB});
Updates.push_back({DominatorTree::Delete, BB, Succ});
}
@@ -2190,26 +2197,6 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
}
-void llvm::createUnreachableSwitchDefault(SwitchInst *Switch,
- DomTreeUpdater *DTU) {
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
- auto *BB = Switch->getParent();
- auto *OrigDefaultBlock = Switch->getDefaultDest();
- OrigDefaultBlock->removePredecessor(BB);
- BasicBlock *NewDefaultBlock = BasicBlock::Create(
- BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
- OrigDefaultBlock);
- new UnreachableInst(Switch->getContext(), NewDefaultBlock);
- Switch->setDefaultDest(&*NewDefaultBlock);
- if (DTU) {
- SmallVector<DominatorTree::UpdateType, 2> Updates;
- Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
- if (!is_contained(successors(BB), OrigDefaultBlock))
- Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
- DTU->applyUpdates(Updates);
- }
-}
-
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
BasicBlock *UnwindEdge,
DomTreeUpdater *DTU) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index a92cb6a313d3..bb719a499a4c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -623,15 +623,13 @@ bool llvm::UnrollRuntimeLoopRemainder(
if (!SE)
return false;
- // Only unroll loops with a computable trip count, and the trip count needs
- // to be an int value (allowing a pointer type is a TODO item).
+ // Only unroll loops with a computable trip count.
// We calculate the backedge count by using getExitCount on the Latch block,
// which is proven to be the only exiting block in this loop. This is same as
// calculating getBackedgeTakenCount on the loop (which computes SCEV for all
// exiting blocks).
const SCEV *BECountSC = SE->getExitCount(L, Latch);
- if (isa<SCEVCouldNotCompute>(BECountSC) ||
- !BECountSC->getType()->isIntegerTy()) {
+ if (isa<SCEVCouldNotCompute>(BECountSC)) {
LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 68572d479742..c8e42acdffb3 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
return Builder.CreateOrReduce(Src);
case RecurKind::Xor:
return Builder.CreateXorReduce(Src);
+ case RecurKind::FMulAdd:
case RecurKind::FAdd:
return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
Src);
@@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
Value *llvm::createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc,
Value *Src, Value *Start) {
- assert(Desc.getRecurrenceKind() == RecurKind::FAdd &&
+ assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+ Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
"Unexpected reduction kind");
assert(Src->getType()->isVectorTy() && "Expected a vector type");
assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 5893ce15b129..7d9992176658 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -446,6 +446,9 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
// Now that everything is rewritten, delete the old instructions from the
// function. They should all be dead now.
for (Instruction *User : Insts) {
+ if (!shouldDelete(User))
+ continue;
+
// If this is a load that still has uses, then the load must have been added
// as a live value in the SSAUpdate data structure for a block (e.g. because
// the loaded value was stored later). In this case, we need to recursively
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
new file mode 100644
index 000000000000..9495e442e0bf
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -0,0 +1,462 @@
+//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a profile inference algorithm. Given incomplete and
+// possibly imprecise block counts, the algorithm reconstructs realistic block
+// and edge counts that satisfy flow conservation rules while minimally
+// modifying the input block counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+#define DEBUG_TYPE "sample-profile-inference"
+
+namespace {
+
+/// A value indicating an infinite flow/capacity/weight of a block/edge.
+/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
+/// during the execution.
+static constexpr int64_t INF = ((int64_t)1) << 50;
+
+/// The minimum-cost maximum flow algorithm.
+///
+/// The algorithm finds the maximum flow of minimum cost on a given (directed)
+/// network using a modified version of the classical Moore-Bellman-Ford
+/// approach. The algorithm applies a number of augmentation iterations in which
+/// flow is sent along paths of positive capacity from the source to the sink.
+/// The worst-case time complexity of the implementation is O(v(f)*m*n),
+/// where m is the number of edges, n is the number of vertices, and v(f) is the
+/// value of the maximum flow. However, the observed running time on typical
+/// instances is sub-quadratic, that is, o(n^2).
+///
+/// The input is a set of edges with specified costs and capacities, and a pair
+/// of nodes (source and sink). The output is the flow along each edge of the
+/// minimum total cost respecting the given edge capacities.
+class MinCostMaxFlow {
+public:
+ // Initialize algorithm's data structures for a network of a given size.
+ void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
+ Source = SourceNode;
+ Target = SinkNode;
+
+ Nodes = std::vector<Node>(NodeCount);
+ Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
+ }
+
+ // Run the algorithm.
+ int64_t run() {
+ // Find an augmenting path and update the flow along the path
+ size_t AugmentationIters = 0;
+ while (findAugmentingPath()) {
+ augmentFlowAlongPath();
+ AugmentationIters++;
+ }
+
+ // Compute the total flow and its cost
+ int64_t TotalCost = 0;
+ int64_t TotalFlow = 0;
+ for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Flow > 0) {
+ TotalCost += Edge.Cost * Edge.Flow;
+ if (Src == Source)
+ TotalFlow += Edge.Flow;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
+ << " iterations with " << TotalFlow << " total flow"
+ << " of " << TotalCost << " cost\n");
+ (void)TotalFlow;
+ return TotalCost;
+ }
+
+ /// Add an edge to the network with a specified capacity and cost.
+ /// Multiple edges between a pair of nodes are allowed but self-edges
+ /// are not supported.
+ void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
+ assert(Capacity > 0 && "adding an edge of zero capacity");
+ assert(Src != Dst && "self-edges are not supported");
+
+ Edge SrcEdge;
+ SrcEdge.Dst = Dst;
+ SrcEdge.Cost = Cost;
+ SrcEdge.Capacity = Capacity;
+ SrcEdge.Flow = 0;
+ SrcEdge.RevEdgeIndex = Edges[Dst].size();
+
+ Edge DstEdge;
+ DstEdge.Dst = Src;
+ DstEdge.Cost = -Cost;
+ DstEdge.Capacity = 0;
+ DstEdge.Flow = 0;
+ DstEdge.RevEdgeIndex = Edges[Src].size();
+
+ Edges[Src].push_back(SrcEdge);
+ Edges[Dst].push_back(DstEdge);
+ }
+
+ /// Add an edge of infinite capacity and a given cost to the network.
+ void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
+ addEdge(Src, Dst, INF, Cost);
+ }
+
+ /// Get the total flow from a given source node.
+ /// Returns a list of pairs (target node, amount of flow to the target).
+ const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
+ std::vector<std::pair<uint64_t, int64_t>> Flow;
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Flow > 0)
+ Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
+ }
+ return Flow;
+ }
+
+ /// Get the total flow between a pair of nodes.
+ int64_t getFlow(uint64_t Src, uint64_t Dst) const {
+ int64_t Flow = 0;
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Dst == Dst) {
+ Flow += Edge.Flow;
+ }
+ }
+ return Flow;
+ }
+
+ /// A cost of increasing a block's count by one.
+ static constexpr int64_t AuxCostInc = 10;
+ /// A cost of decreasing a block's count by one.
+ static constexpr int64_t AuxCostDec = 20;
+ /// A cost of increasing the count of a zero-weight block by one.
+ static constexpr int64_t AuxCostIncZero = 11;
+ /// A cost of increasing the entry block's count by one.
+ static constexpr int64_t AuxCostIncEntry = 40;
+ /// A cost of decreasing the entry block's count by one.
+ static constexpr int64_t AuxCostDecEntry = 10;
+ /// A cost of taking an unlikely jump.
+ static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+
+private:
+ /// Check for existence of an augmenting path with a positive capacity.
+ bool findAugmentingPath() {
+ // Initialize data structures
+ for (auto &Node : Nodes) {
+ Node.Distance = INF;
+ Node.ParentNode = uint64_t(-1);
+ Node.ParentEdgeIndex = uint64_t(-1);
+ Node.Taken = false;
+ }
+
+ std::queue<uint64_t> Queue;
+ Queue.push(Source);
+ Nodes[Source].Distance = 0;
+ Nodes[Source].Taken = true;
+ while (!Queue.empty()) {
+ uint64_t Src = Queue.front();
+ Queue.pop();
+ Nodes[Src].Taken = false;
+ // Although the residual network contains edges with negative costs
+ // (in particular, backward edges), it can be shown that there are no
+ // negative-weight cycles and the following two invariants are maintained:
+ // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
+ // where Dist is the length of the shortest path between two nodes. This
+ // allows us to prune the search space of the path-finding algorithm using
+ // the following early-stop criteria:
+ // -- If we find a path with zero-distance from Source to Target, stop the
+ // search, as the path is the shortest since Dist[Source, Target] >= 0;
+ // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
+ // process node V, as it is guaranteed _not_ to be on a shortest path
+ // from Source to Target; it follows from inequalities
+ // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
+ // >= Dist[Source, V]
+ if (Nodes[Target].Distance == 0)
+ break;
+ if (Nodes[Src].Distance > Nodes[Target].Distance)
+ continue;
+
+ // Process adjacent edges
+ for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
+ auto &Edge = Edges[Src][EdgeIdx];
+ if (Edge.Flow < Edge.Capacity) {
+ uint64_t Dst = Edge.Dst;
+ int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
+ if (Nodes[Dst].Distance > NewDistance) {
+ // Update the distance and the parent node/edge
+ Nodes[Dst].Distance = NewDistance;
+ Nodes[Dst].ParentNode = Src;
+ Nodes[Dst].ParentEdgeIndex = EdgeIdx;
+ // Add the node to the queue, if it is not there yet
+ if (!Nodes[Dst].Taken) {
+ Queue.push(Dst);
+ Nodes[Dst].Taken = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Nodes[Target].Distance != INF;
+ }
+
+ /// Update the current flow along the augmenting path.
+ void augmentFlowAlongPath() {
+ // Find path capacity
+ int64_t PathCapacity = INF;
+ uint64_t Now = Target;
+ while (Now != Source) {
+ uint64_t Pred = Nodes[Now].ParentNode;
+ auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+ PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
+ Now = Pred;
+ }
+
+ assert(PathCapacity > 0 && "found incorrect augmenting path");
+
+ // Update the flow along the path
+ Now = Target;
+ while (Now != Source) {
+ uint64_t Pred = Nodes[Now].ParentNode;
+ auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+ auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
+
+ Edge.Flow += PathCapacity;
+ RevEdge.Flow -= PathCapacity;
+
+ Now = Pred;
+ }
+ }
+
+ /// A node in a flow network.
+ struct Node {
+ /// The cost of the cheapest path from the source to the current node.
+ int64_t Distance;
+ /// The node preceding the current one in the path.
+ uint64_t ParentNode;
+ /// The index of the edge between ParentNode and the current node.
+ uint64_t ParentEdgeIndex;
+ /// An indicator of whether the current node is in a queue.
+ bool Taken;
+ };
+ /// An edge in a flow network.
+ struct Edge {
+ /// The cost of the edge.
+ int64_t Cost;
+ /// The capacity of the edge.
+ int64_t Capacity;
+ /// The current flow on the edge.
+ int64_t Flow;
+ /// The destination node of the edge.
+ uint64_t Dst;
+ /// The index of the reverse edge between Dst and the current node.
+ uint64_t RevEdgeIndex;
+ };
+
+ /// The set of network nodes.
+ std::vector<Node> Nodes;
+ /// The set of network edges.
+ std::vector<std::vector<Edge>> Edges;
+ /// Source node of the flow.
+ uint64_t Source;
+ /// Target (sink) node of the flow.
+ uint64_t Target;
+};
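A hypothetical driver for the class above (not from the patch; profi builds
its real network in initializeNetwork below). Four nodes, two parallel routes
of equal capacity but different cost:

    void demo() {
      MinCostMaxFlow Network;
      Network.initialize(/*NodeCount=*/4, /*SourceNode=*/0, /*SinkNode=*/3);
      Network.addEdge(0, 1, /*Capacity=*/5, /*Cost=*/1);
      Network.addEdge(0, 2, /*Capacity=*/5, /*Cost=*/3);
      Network.addEdge(1, 3, /*Capacity=*/5, /*Cost=*/1);
      Network.addEdge(2, 3, /*Capacity=*/5, /*Cost=*/1);
      // Max flow is 10; the cheaper route 0->1->3 is saturated first.
      int64_t TotalCost = Network.run(); // 5*(1+1) + 5*(3+1) = 30
      (void)TotalCost;
    }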
+
+/// Initialize the flow network for a given function.
+///
+/// Every block is split into three nodes that are responsible for (i) an
+/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
+/// reduction of the block weight.
+void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
+ uint64_t NumBlocks = Func.Blocks.size();
+ assert(NumBlocks > 1 && "Too few blocks in a function");
+ LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");
+
+ // Pre-process data: make sure the entry weight is at least 1
+ if (Func.Blocks[Func.Entry].Weight == 0) {
+ Func.Blocks[Func.Entry].Weight = 1;
+ }
+ // Introduce dummy source/sink pairs to allow flow circulation.
+ // The nodes corresponding to blocks of Func have indices in the range
+ // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
+ uint64_t S = 3 * NumBlocks;
+ uint64_t T = S + 1;
+ uint64_t S1 = S + 2;
+ uint64_t T1 = S + 3;
+
+ Network.initialize(3 * NumBlocks + 4, S1, T1);
+
+ // Create three nodes for every block of the function
+ for (uint64_t B = 0; B < NumBlocks; B++) {
+ auto &Block = Func.Blocks[B];
+ assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
+ "non-zero weight of a block w/o weight except for an entry");
+
+ // Split every block into three nodes
+ uint64_t Bin = 3 * B;
+ uint64_t Bout = 3 * B + 1;
+ uint64_t Baux = 3 * B + 2;
+ if (Block.Weight > 0) {
+ Network.addEdge(S1, Bout, Block.Weight, 0);
+ Network.addEdge(Bin, T1, Block.Weight, 0);
+ }
+
+ // Edges from S and to T
+ assert((!Block.isEntry() || !Block.isExit()) &&
+ "a block cannot be an entry and an exit");
+ if (Block.isEntry()) {
+ Network.addEdge(S, Bin, 0);
+ } else if (Block.isExit()) {
+ Network.addEdge(Bout, T, 0);
+ }
+
+ // An auxiliary node to allow increase/reduction of block counts:
+ // We assume that decreasing block counts is more expensive than increasing,
+ // and thus set separate costs here. In the future we may want to tune
+ // the relative costs so as to maximize the quality of generated profiles.
+ int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
+ int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
+ if (Block.UnknownWeight) {
+ // Do not penalize changing weights of blocks w/o known profile count
+ AuxCostInc = 0;
+ AuxCostDec = 0;
+ } else {
+ // Increasing the count for "cold" blocks with zero initial count is more
+ // expensive than for "hot" ones
+ if (Block.Weight == 0) {
+ AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
+ }
+ // Modifying the count of the entry block is expensive
+ if (Block.isEntry()) {
+ AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
+ AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
+ }
+ }
+ // For blocks with self-edges, do not penalize a reduction of the count,
+ // as all of the increase can be attributed to the self-edge
+ if (Block.HasSelfEdge) {
+ AuxCostDec = 0;
+ }
+
+ Network.addEdge(Bin, Baux, AuxCostInc);
+ Network.addEdge(Baux, Bout, AuxCostInc);
+ if (Block.Weight > 0) {
+ Network.addEdge(Bout, Baux, AuxCostDec);
+ Network.addEdge(Baux, Bin, AuxCostDec);
+ }
+ }
+
+ // Creating edges for every jump
+ for (auto &Jump : Func.Jumps) {
+ uint64_t Src = Jump.Source;
+ uint64_t Dst = Jump.Target;
+ if (Src != Dst) {
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t DstIn = 3 * Dst;
+ uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
+ Network.addEdge(SrcOut, DstIn, Cost);
+ }
+ }
+
+ // Make sure we have a valid flow circulation
+ Network.addEdge(T, S, 0);
+}
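+
+// For a minimal two-block function (entry B0 with a single jump to exit B1,
+// both with positive weights), the construction above yields S->B0in,
+// B0out->B1in, B1out->T, and the circulation edge T->S, plus the S1/T1
+// weight edges and the per-block auxiliary Bin/Baux/Bout edges.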
+
+/// Extract resulting block and edge counts from the flow network.
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
+ uint64_t NumBlocks = Func.Blocks.size();
+
+ // Extract resulting block counts
+ for (uint64_t Src = 0; Src < NumBlocks; Src++) {
+ auto &Block = Func.Blocks[Src];
+ uint64_t SrcOut = 3 * Src + 1;
+ int64_t Flow = 0;
+ for (auto &Adj : Network.getFlow(SrcOut)) {
+ uint64_t DstIn = Adj.first;
+ int64_t DstFlow = Adj.second;
+ bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
+ if (!IsAuxNode || Block.HasSelfEdge) {
+ Flow += DstFlow;
+ }
+ }
+ Block.Flow = Flow;
+ assert(Flow >= 0 && "negative block flow");
+ }
+
+ // Extract resulting jump counts
+ for (auto &Jump : Func.Jumps) {
+ uint64_t Src = Jump.Source;
+ uint64_t Dst = Jump.Target;
+ int64_t Flow = 0;
+ if (Src != Dst) {
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t DstIn = 3 * Dst;
+ Flow = Network.getFlow(SrcOut, DstIn);
+ } else {
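+ // A self-edge has no explicit edge in the network; recover its count from
+ // the flow routed through the block's auxiliary node (initializeNetwork
+ // sets AuxCostDec = 0 for blocks with self-edges for this reason).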
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t SrcAux = 3 * Src + 2;
+ int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
+ if (AuxFlow > 0)
+ Flow = AuxFlow;
+ }
+ Jump.Flow = Flow;
+ assert(Flow >= 0 && "negative jump flow");
+ }
+}
+
+#ifndef NDEBUG
+/// Verify that the computed flow values satisfy flow conservation rules
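+/// Specifically, for every block b that is neither an entry nor an exit,
+///   Flow(b) = sum of Jump.Flow over jumps into b
+///           = sum of Jump.Flow over jumps out of b,
+/// while entry blocks must match their outgoing flow, exit blocks their
+/// incoming flow, and the total entry flow must equal the total exit flow.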
+void verifyWeights(const FlowFunction &Func) {
+ const uint64_t NumBlocks = Func.Blocks.size();
+ auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
+ auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
+ for (auto &Jump : Func.Jumps) {
+ InFlow[Jump.Target] += Jump.Flow;
+ OutFlow[Jump.Source] += Jump.Flow;
+ }
+
+ uint64_t TotalInFlow = 0;
+ uint64_t TotalOutFlow = 0;
+ for (uint64_t I = 0; I < NumBlocks; I++) {
+ auto &Block = Func.Blocks[I];
+ if (Block.isEntry()) {
+ TotalInFlow += Block.Flow;
+ assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+ } else if (Block.isExit()) {
+ TotalOutFlow += Block.Flow;
+ assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+ } else {
+ assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+ assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+ }
+ }
+ assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+}
+#endif
+
+} // end of anonymous namespace
+
+/// Apply the profile inference algorithm for a given flow function
+void llvm::applyFlowInference(FlowFunction &Func) {
+ // Create and apply an inference network model
+ auto InferenceNetwork = MinCostMaxFlow();
+ initializeNetwork(InferenceNetwork, Func);
+ InferenceNetwork.run();
+
+ // Extract flow values for every block and every edge
+ extractWeights(InferenceNetwork, Func);
+
+#ifndef NDEBUG
+ // Verify the result
+ verifyWeights(Func);
+#endif
+}
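+//
+// A hedged usage sketch (the exact FlowFunction, FlowBlock, and FlowJump
+// definitions live in SampleProfileInference.h; the initialization below is
+// simplified):
+//
+//   FlowFunction Func;
+//   Func.Blocks.resize(2);        // entry and exit
+//   Func.Blocks[0].Weight = 100;  // sampled count of the entry block
+//   Func.Blocks[1].Weight = 90;
+//   FlowJump Jump;
+//   Jump.Source = 0;
+//   Jump.Target = 1;
+//   Func.Jumps.push_back(Jump);
+//   Func.Entry = 0;
+//   applyFlowInference(Func);     // fills Block.Flow and Jump.Flow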
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
index 6d995cf4c048..ea0e8343eb88 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
@@ -34,6 +34,10 @@ cl::opt<bool> NoWarnSampleUnused(
cl::desc("Use this option to turn off/on warnings about function with "
"samples but without debug information to use those samples. "));
+cl::opt<bool> SampleProfileUseProfi(
+ "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Use profi to infer block and edge counts."));
+
namespace sampleprofutil {
/// Return true if the given callsite is hot wrt to hot cutoff threshold.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index a042146d7ace..71c15d5c51fc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -1833,22 +1834,6 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
return V;
}
-/// Check whether value has nuw/nsw/exact set but SCEV does not.
-/// TODO: In reality it is better to check the poison recursively
-/// but this is better than nothing.
-static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) {
- if (isa<OverflowingBinaryOperator>(I)) {
- if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) {
- if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap())
- return true;
- if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap())
- return true;
- }
- } else if (isa<PossiblyExactOperator>(I) && I->isExact())
- return true;
- return false;
-}
-
ScalarEvolution::ValueOffsetPair
SCEVExpander::FindValueInExprValueMap(const SCEV *S,
const Instruction *InsertPt) {
@@ -1872,8 +1857,7 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S,
if (S->getType() == V->getType() &&
SE.DT.dominates(EntInst, InsertPt) &&
(SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
- SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) &&
- !SCEVLostPoisonFlags(S, EntInst))
+ SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
return {V, Offset};
}
}
@@ -1952,26 +1936,36 @@ Value *SCEVExpander::expand(const SCEV *S) {
if (!V)
V = visit(S);
- else if (VO.second) {
- if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
- Type *Ety = Vty->getPointerElementType();
- int64_t Offset = VO.second->getSExtValue();
- int64_t ESize = SE.getTypeSizeInBits(Ety);
- if ((Offset * 8) % ESize == 0) {
- ConstantInt *Idx =
+ else {
+ // If we're reusing an existing instruction, we are effectively CSEing two
+ // copies of the instruction (with potentially different flags). As such,
+ // we need to drop any poison generating flags unless we can prove that
+ // said flags must be valid for all new users.
+ if (auto *I = dyn_cast<Instruction>(V))
+ if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))
+ I->dropPoisonGeneratingFlags();
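+ // For example (illustrative IR): reusing
+ //   %v = add nuw nsw i64 %x, 1
+ // at a point where the wrap flags are not known to hold requires
+ // rewriting it to a plain add, unless poison at %v would make the
+ // program undefined anyway.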
+
+ if (VO.second) {
+ if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
+ Type *Ety = Vty->getPointerElementType();
+ int64_t Offset = VO.second->getSExtValue();
+ int64_t ESize = SE.getTypeSizeInBits(Ety);
+ if ((Offset * 8) % ESize == 0) {
+ ConstantInt *Idx =
ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
- V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
- } else {
- ConstantInt *Idx =
+ V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
+ } else {
+ ConstantInt *Idx =
ConstantInt::getSigned(VO.second->getType(), -Offset);
- unsigned AS = Vty->getAddressSpace();
- V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
- V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
- "uglygep");
- V = Builder.CreateBitCast(V, Vty);
+ unsigned AS = Vty->getAddressSpace();
+ V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+ V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+ "uglygep");
+ V = Builder.CreateBitCast(V, Vty);
+ }
+ } else {
+ V = Builder.CreateSub(V, VO.second);
}
- } else {
- V = Builder.CreateSub(V, VO.second);
}
}
// Remember the expanded value for this SCEV at this location.
@@ -2180,7 +2174,9 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
}
// Use expand's logic which is used for reusing a previous Value in
- // ExprValueMap.
+ // ExprValueMap. Note that we don't currently model the cost of
+ // needing to drop poison generating flags on the instruction if we
+ // want to reuse it. We effectively assume that has zero cost.
ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
if (VO.first)
return VO;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f467de5f924e..afa3ecde77f9 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3936,7 +3936,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
BasicBlock *KeepEdge1 = TrueBB;
BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
- SmallPtrSet<BasicBlock *, 2> RemovedSuccessors;
+ SmallSetVector<BasicBlock *, 2> RemovedSuccessors;
// Then remove the rest.
for (BasicBlock *Succ : successors(OldTerm)) {
@@ -4782,6 +4782,26 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
return true;
}
+static void createUnreachableSwitchDefault(SwitchInst *Switch,
+ DomTreeUpdater *DTU) {
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ auto *BB = Switch->getParent();
+ auto *OrigDefaultBlock = Switch->getDefaultDest();
+ OrigDefaultBlock->removePredecessor(BB);
+ BasicBlock *NewDefaultBlock = BasicBlock::Create(
+ BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
+ OrigDefaultBlock);
+ new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+ Switch->setDefaultDest(&*NewDefaultBlock);
+ if (DTU) {
+ SmallVector<DominatorTree::UpdateType, 2> Updates;
+ Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
+ if (!is_contained(successors(BB), OrigDefaultBlock))
+ Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
+ DTU->applyUpdates(Updates);
+ }
+}
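+
+// For example (illustrative): when a switch's default destination is proven
+// dead, the default edge is retargeted to a fresh block holding only an
+// 'unreachable' terminator; the DomTree updates record the inserted edge and,
+// if no case still targets the old default, the deleted one.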
+
/// Turn a switch with two reachable destinations into an integer range
/// comparison and branch.
bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
@@ -4927,10 +4947,14 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
// Gather dead cases.
SmallVector<ConstantInt *, 8> DeadCases;
SmallDenseMap<BasicBlock *, int, 8> NumPerSuccessorCases;
+ SmallVector<BasicBlock *, 8> UniqueSuccessors;
for (auto &Case : SI->cases()) {
auto *Successor = Case.getCaseSuccessor();
- if (DTU)
+ if (DTU) {
+ if (!NumPerSuccessorCases.count(Successor))
+ UniqueSuccessors.push_back(Successor);
++NumPerSuccessorCases[Successor];
+ }
const APInt &CaseVal = Case.getCaseValue()->getValue();
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
(CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
@@ -4973,9 +4997,9 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
- for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
- if (I.second == 0)
- Updates.push_back({DominatorTree::Delete, SI->getParent(), I.first});
+ for (auto *Successor : UniqueSuccessors)
+ if (NumPerSuccessorCases[Successor] == 0)
+ Updates.push_back({DominatorTree::Delete, SI->getParent(), Successor});
DTU->applyUpdates(Updates);
}
@@ -6040,15 +6064,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
if (Succ == SI->getDefaultDest())
continue;
Succ->removePredecessor(BB);
- RemovedSuccessors.insert(Succ);
+ if (DTU && RemovedSuccessors.insert(Succ).second)
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
}
SI->eraseFromParent();
- if (DTU) {
- for (BasicBlock *RemovedSuccessor : RemovedSuccessors)
- Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
+ if (DTU)
DTU->applyUpdates(Updates);
- }
++NumLookupTables;
if (NeedMask)
@@ -6215,7 +6237,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
// Eliminate redundant destinations.
SmallPtrSet<Value *, 8> Succs;
- SmallPtrSet<BasicBlock *, 8> RemovedSuccs;
+ SmallSetVector<BasicBlock *, 8> RemovedSuccs;
for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
BasicBlock *Dest = IBI->getDestination(i);
if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
@@ -6305,8 +6327,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
// We've found an identical block. Update our predecessors to take that
// path instead and make ourselves dead.
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
- for (BasicBlock *Pred : Preds) {
+ SmallSetVector<BasicBlock *, 16> UniquePreds(pred_begin(BB), pred_end(BB));
+ for (BasicBlock *Pred : UniquePreds) {
InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
"unexpected successor");
@@ -6323,8 +6345,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
if (isa<DbgInfoIntrinsic>(Inst))
Inst.eraseFromParent();
- SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB));
- for (BasicBlock *Succ : Succs) {
+ SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB));
+ for (BasicBlock *Succ : UniqueSuccs) {
Succ->removePredecessor(BB);
if (DTU)
Updates.push_back({DominatorTree::Delete, BB, Succ});
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23bb6f0860c9..5ca0adb4242c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -473,18 +473,10 @@ public:
/// handle the more complex control flow around the loops.
virtual BasicBlock *createVectorizedLoopSkeleton();
- /// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
- VPTransformState &State);
-
/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
- /// Widen a single select instruction within the innermost loop.
- void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
- bool InvariantCond, VPTransformState &State);
-
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State);
@@ -496,12 +488,6 @@ public:
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
- /// Vectorize a single GetElementPtrInst based on information gathered and
- /// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
- unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
-
/// Vectorize a single first-order recurrence or pointer induction PHINode in
/// a block. This method handles the induction variable canonicalization. It
/// supports both VF = 1 for unrolled loops and arbitrary length vectors.
@@ -511,9 +497,9 @@ public:
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+ void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
@@ -538,15 +524,6 @@ public:
ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
- /// Vectorize Load and Store instructions with the base address given in \p
- /// Addr, optionally masking the vector operations if \p BlockInMask is
- /// non-null. Use \p State to translate given VPValues to IR values in the
- /// vectorized loop.
- void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask,
- bool ConsecutiveStride, bool Reverse);
-
/// Set the debug location in the builder \p Ptr using the debug location in
/// \p V. If \p Ptr is None then it uses the class member's Builder.
void setDebugLocFromInst(const Value *V,
@@ -566,6 +543,17 @@ public:
/// element.
virtual Value *getBroadcastInstrs(Value *V);
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
protected:
friend class LoopVectorizationPlanner;
@@ -741,16 +729,16 @@ protected:
/// vector loop.
void addNewMetadata(Instruction *To, const Instruction *Orig);
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// Collect poison-generating recipes that may generate a poison value that is
+ /// used after vectorization, even when their operands are not poison. Those
+ /// recipes meet the following conditions:
+ /// * Contribute to the address computation of a recipe generating a widen
+ /// memory load/store (VPWidenMemoryInstructionRecipe or
+ /// VPInterleaveRecipe).
+ /// * Such a widen memory load/store has at least one underlying Instruction
+ /// that is in a basic block that needs predication, and after vectorization
+ /// the generated instruction won't be predicated.
+ void collectPoisonGeneratingRecipes(VPTransformState &State);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To,
LVer->annotateInstWithNoAlias(To, Orig);
}
+void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
+ VPTransformState &State) {
+
+ // Collect recipes in the backward slice of `Root` that may generate a poison
+ // value that is used after vectorization.
+ SmallPtrSet<VPRecipeBase *, 16> Visited;
+ auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
+ SmallVector<VPRecipeBase *, 16> Worklist;
+ Worklist.push_back(Root);
+
+ // Traverse the backward slice of Root through its use-def chain.
+ while (!Worklist.empty()) {
+ VPRecipeBase *CurRec = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Visited.insert(CurRec).second)
+ continue;
+
+ // Prune search if we find another recipe generating a widen memory
+ // instruction. Widen memory instructions involved in address computation
+ // will lead to gather/scatter instructions, which don't need to be
+ // handled.
+ if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
+ isa<VPInterleaveRecipe>(CurRec))
+ continue;
+
+ // This recipe contributes to the address computation of a widen
+ // load/store. Collect recipe if its underlying instruction has
+ // poison-generating flags.
+ Instruction *Instr = CurRec->getUnderlyingInstr();
+ if (Instr && Instr->hasPoisonGeneratingFlags())
+ State.MayGeneratePoisonRecipes.insert(CurRec);
+
+ // Add new definitions to the worklist.
+ for (VPValue *operand : CurRec->operands())
+ if (VPDef *OpDef = operand->getDef())
+ Worklist.push_back(cast<VPRecipeBase>(OpDef));
+ }
+ });
+
+ // Traverse all the recipes in the VPlan and collect the poison-generating
+ // recipes in the backward slice starting at the address of a
+ // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+ for (VPRecipeBase &Recipe : *VPBB) {
+ if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
+ Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
+ VPDef *AddrDef = WidenRec->getAddr()->getDef();
+ if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
+ Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
+ collectPoisonGeneratingInstrsInBackwardSlice(
+ cast<VPRecipeBase>(AddrDef));
+ } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
+ VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
+ if (AddrDef) {
+ // Check if any member of the interleave group needs predication.
+ const InterleaveGroup<Instruction> *InterGroup =
+ InterleaveRec->getInterleaveGroup();
+ bool NeedPredication = false;
+ for (int I = 0, NumMembers = InterGroup->getNumMembers();
+ I < NumMembers; ++I) {
+ Instruction *Member = InterGroup->getMember(I);
+ if (Member)
+ NeedPredication |=
+ Legal->blockNeedsPredication(Member->getParent());
+ }
+
+ if (NeedPredication)
+ collectPoisonGeneratingInstrsInBackwardSlice(
+ cast<VPRecipeBase>(AddrDef));
+ }
+ }
+ }
+ }
+}
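+
+// For example (illustrative): a conditional access such as
+//   if (c) { p = getelementptr inbounds ...; v = load p; }
+// may be vectorized into an unconditional (masked) load; the recipes feeding
+// the address are collected here so that, at execution, their
+// poison-generating flags (e.g., 'inbounds') can be dropped.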
+
void InnerLoopVectorizer::addMetadata(Instruction *To,
Instruction *From) {
propagateMetadata(To, From);
@@ -1541,7 +1607,16 @@ public:
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) {
+ bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ // When we know the load is uniform and the original scalar loop was not
+ // predicated, we don't need to mark it as a predicated instruction. Any
+ // vectorized blocks created when tail-folding are artificial blocks we
+ // have introduced, and we know there is always at least one active lane.
+ // That is why we call Legal->blockNeedsPredication here: it does not
+ // query tail-folding.
+ if (IsKnownUniform && isa<LoadInst>(I) &&
+ !Legal->blockNeedsPredication(I->getParent()))
+ return false;
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
// Loads and stores that need some form of masked operation are predicated
@@ -1816,9 +1891,11 @@ private:
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
+ /// during vectorization. collectLoopScalars should only add non-uniform nodes
+ /// to the list if they are used by a load/store instruction that is marked as
+ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+ /// VF values in the vectorized loop, each corresponding to an iteration of
+ /// the original scalar loop.
void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
@@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(
- Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
- bool Reverse) {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
-
- Type *ScalarDataTy = getLoadStoreType(Instr);
-
- auto *DataTy = VectorType::get(ScalarDataTy, VF);
- const Align Alignment = getLoadStoreAlignment(Instr);
- bool CreateGatherScatter = !ConsecutiveStride;
-
- VectorParts BlockInMaskParts(UF);
- bool isMaskRequired = BlockInMask;
- if (isMaskRequired)
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockInMaskParts[Part] = State.get(BlockInMask, Part);
-
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- // RunTimeVF = VScale * VF.getKnownMinValue()
- // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
- // NumElt = -Part * RunTimeVF
- Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
- PartPtr =
- cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
- PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
- PartPtr->setIsInBounds(InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
- } else {
- Value *Increment =
- createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
- }
-
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- };
-
- // Handle Stores:
- if (SI) {
- setDebugLocFromInst(SI);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (Reverse) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = reverseVector(StoredVal);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- addMetadata(NewSI, SI);
- }
- return;
- }
-
- // Handle loads.
- assert(LI && "Must have a load instruction");
- setDebugLocFromInst(LI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
- DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
- PoisonValue::get(DataTy), "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = reverseVector(NewLI);
- }
-
- State.set(Def, NewLI, Part);
- }
-}
-
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
- VPUser &User,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
bool IfPredicateInstr,
VPTransformState &State) {
@@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
+ // If the scalarized instruction contributes to the address computation of a
+ // widen masked load/store which was in a basic block that needed predication
+ // and is not predicated after vectorization, we can't propagate
+ // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
+ // instruction could feed a poison value to the base address of the widen
+ // load/store.
+ if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
+ Cloned->dropPoisonGeneratingFlags();
+
State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
InputInstance.Lane = VPLane::getFirstLane();
- auto *NewOp = State.get(User.getOperand(op), InputInstance);
+ auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- State.set(Def, Cloned, Instance);
+ State.set(RepRecipe, Cloned, Instance);
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
- VPUser &Operands, unsigned UF,
- ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant,
- VPTransformState &State) {
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
-
- if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- State.set(VPDef, EntryPart, Part);
- addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
- ? State.get(Operands.getOperand(0), VPIteration(0, 0))
- : State.get(Operands.getOperand(0), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
- VPValue *Operand = Operands.getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, VPIteration(0, 0)));
- else
- Indices.push_back(State.get(Operand, Part));
- }
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP =
- GEP->isInBounds()
- ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
- Indices)
- : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
- assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
- State.set(VPDef, NewGEP, Part);
- addMetadata(NewGEP, GEP);
- }
- }
-}
-
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
VPWidenPHIRecipe *PhiR,
VPTransformState &State) {
@@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
- unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
-
- bool NeedsVectorIndex = !IsUniform && VF.isScalable();
- Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
- if (NeedsVectorIndex) {
- Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
- UnitStepVec = Builder.CreateStepVector(VecIVTy);
- PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
- }
+ assert((IsUniform || !State.VF.isScalable()) &&
+ "Cannot scalarize a scalable VF");
+ unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart =
createStepForVF(Builder, PtrInd->getType(), VF, Part);
- if (NeedsVectorIndex) {
- // Here we cache the whole vector, which means we can support the
- // extraction of any lane. However, in some cases the extractelement
- // instruction that is generated for scalar uses of this vector (e.g.
- // a load instruction) is not folded away. Therefore we still
- // calculate values for the first n lanes to avoid redundant moves
- // (when extracting the 0th element) and to produce scalar code (i.e.
- // additional add/gep instructions instead of expensive extractelement
- // instructions) when extracting higher-order elements.
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
- Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
- Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
- SclrGep->setName("next.gep");
- State.set(PhiR, SclrGep, Part);
- }
-
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
- VPUser &User,
- VPTransformState &State) {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Br:
- case Instruction::PHI:
- case Instruction::GetElementPtr:
- case Instruction::Select:
- llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::FNeg:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen unops and binops.
- setDebugLocFromInst(&I);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 2> Ops;
- for (VPValue *VPOp : User.operands())
- Ops.push_back(State.get(VPOp, Part));
-
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
- if (auto *VecOp = dyn_cast<Instruction>(V))
- VecOp->copyIRFlags(&I);
-
- // Use this vector value for all users of the original instruction.
- State.set(Def, V, Part);
- addMetadata(V, &I);
- }
-
- break;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- setDebugLocFromInst(Cmp);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *B = State.get(User.getOperand(1), Part);
- Value *C = nullptr;
- if (FCmp) {
- // Propagate fast math flags.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- }
- State.set(Def, C, Part);
- addMetadata(C, &I);
- }
-
- break;
- }
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- setDebugLocFromInst(CI);
-
- /// Vectorize casts.
- Type *DestTy =
- (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- State.set(Def, Cast, Part);
- addMetadata(Cast, &I);
- }
- break;
- }
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
-}
-
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
@@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
}
}
-void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
- VPUser &Operands,
- bool InvariantCond,
- VPTransformState &State) {
- setDebugLocFromInst(&I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- auto *InvarCond = InvariantCond
- ? State.get(Operands.getOperand(0), VPIteration(0, 0))
- : nullptr;
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond =
- InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
- Value *Op0 = State.get(Operands.getOperand(1), Part);
- Value *Op1 = State.get(Operands.getOperand(2), Part);
- Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
- State.set(VPDef, Sel, Part);
- addMetadata(Sel, &I);
- }
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
!TheLoop->isLoopInvariant(V);
};
- auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
- if (!isa<PHINode>(Ptr) ||
- !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
- return false;
- auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
- if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
- return false;
- return isScalarUse(MemAccess, Ptr);
- };
-
- // A helper that evaluates a memory access's use of a pointer. If the
- // pointer is actually the pointer induction of a loop, it is being
- // inserted into Worklist. If the use will be a scalar use, and the
- // pointer is only used by memory accesses, we place the pointer in
- // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
+ // A helper that evaluates a memory access's use of a pointer. If the use will
+ // be a scalar use and the pointer is only used by memory accesses, we place
+ // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
- if (isScalarPtrInduction(MemAccess, Ptr)) {
- Worklist.insert(cast<Instruction>(Ptr));
- LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
- << "\n");
-
- Instruction *Update = cast<Instruction>(
- cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-
- // If there is more than one user of Update (Ptr), we shouldn't assume it
- // will be scalar after vectorisation as other users of the instruction
- // may require widening. Otherwise, add it to ScalarPtrs.
- if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
- ScalarPtrs.insert(Update);
- return;
- }
- }
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;
+ // Returns true if \p Indvar is a pointer induction that is used directly by
+ // load/store instruction \p I.
+ auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
+ Instruction *I) {
+ return Induction.second.getKind() ==
+ InductionDescriptor::IK_PtrInduction &&
+ (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
+ };
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(Ind, I);
});
if (!ScalarInd)
continue;
@@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto ScalarIndUpdate =
llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
});
if (!ScalarIndUpdate)
continue;
@@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
+ // that it is being called from this specific place.
// Figure out whether the access is strided and get the stride value
// if it's known in compile time
@@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost BaseCost = TTI.getArithmeticReductionCost(
RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+ // normal fmul instruction to the cost of the fadd reduction.
+ if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+ BaseCost +=
+ TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+
// If we're using ordered reductions then we can just return the base cost
// here, since getArithmeticReductionCost calculates the full ordered
// reduction cost when FP reassociation is not allowed.
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ return *RedCost;
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
@@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
+ ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
DEBUG_WITH_TYPE(VerboseDebug, {
- dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+ dbgs() << "intermediate fn:\n"
+ << *OrigLoop->getHeader()->getParent() << "\n";
});
}
@@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
DEBUG_WITH_TYPE(VerboseDebug, {
- dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+ dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
});
}
@@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ Range);
// Even if the instruction is not marked as uniform, there are certain
// intrinsic calls that can be effectively treated as such, so we check for
@@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (VPBB)
VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
else {
- Plan->setEntry(FirstVPBBForBB);
+ auto *TopRegion = new VPRegionBlock("vector loop");
+ TopRegion->setEntry(FirstVPBBForBB);
+ Plan->setEntry(TopRegion);
HeaderVPBB = FirstVPBBForBB;
}
VPBB = FirstVPBBForBB;
@@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- assert(isa<VPBasicBlock>(Plan->getEntry()) &&
+ assert(isa<VPRegionBlock>(Plan->getEntry()) &&
!Plan->getEntry()->getEntryBasicBlock()->empty() &&
- "entry block must be set to a non-empty VPBasicBlock");
+ "entry block must be set to a VPRegionBlock having a non-empty entry "
+ "VPBasicBlock");
+ cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
RecipeBuilder.fixHeaderPhis();
// ---------------------------------------------------------------------------
@@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned FirstOpId;
assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
"Only min/max recurrences allowed for inloop reductions");
+ // Recognize a call to the llvm.fmuladd intrinsic.
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
+ assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
+ "Expected instruction to be a call to the llvm.fmuladd intrinsic");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
"Expected to replace a VPWidenSelectSC");
FirstOpId = 1;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
+ (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
"Expected to replace a VPWidenSC");
FirstOpId = 0;
}
@@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
auto *CondOp = CM.foldTailByMasking()
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
- VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+
+ if (IsFMulAdd) {
+ // If the instruction is a call to the llvm.fmuladd intrinsic then we
+ // need to create an fmul recipe to use as the vector operand for the
+ // fadd reduction.
+ VPInstruction *FMulRecipe = new VPInstruction(
+ Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
+ FMulRecipe->setFastMathFlags(R->getFastMathFlags());
+ WidenRecipe->getParent()->insert(FMulRecipe,
+ WidenRecipe->getIterator());
+ VecOp = FMulRecipe;
+ }
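+ // For example (illustrative): an in-loop reduction chain
+ //   %acc = call float @llvm.fmuladd.f32(float %a, float %b, float %acc)
+ // is modeled as the widened FMul created above feeding an FAdd reduction
+ // that accumulates into %acc.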
+ VPReductionRecipe *RedRecipe =
+ new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
@@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
}
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
- this, *this, InvariantCond, State);
+ auto &I = *cast<SelectInst>(getUnderlyingInstr());
+ State.ILV->setDebugLocFromInst(&I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
+ Value *Op0 = State.get(getOperand(1), Part);
+ Value *Op1 = State.get(getOperand(2), Part);
+ Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
+ State.set(this, Sel, Part);
+ State.ILV->addMetadata(Sel, &I);
+ }
}
void VPWidenRecipe::execute(VPTransformState &State) {
- State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
+ auto &I = *cast<Instruction>(getUnderlyingValue());
+ auto &Builder = State.Builder;
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Br:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ State.ILV->setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (VPValue *VPOp : operands())
+ Ops.push_back(State.get(VPOp, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V)) {
+ VecOp->copyIRFlags(&I);
+
+ // If the instruction is vectorized and was in a basic block that needed
+ // predication, we can't propagate poison-generating flags (nuw/nsw,
+ // exact, etc.). The control flow has been linearized and the
+ // instruction is no longer guarded by the predicate, which could cause
+ // the flag properties to no longer hold.
+ if (State.MayGeneratePoisonRecipes.count(this) > 0)
+ VecOp->dropPoisonGeneratingFlags();
+ }
+
+ // Use this vector value for all users of the original instruction.
+ State.set(this, V, Part);
+ State.ILV->addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ State.ILV->setDebugLocFromInst(Cmp);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
+ State.set(this, C, Part);
+ State.ILV->addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ State.ILV->setDebugLocFromInst(CI);
+
+ /// Vectorize casts.
+ Type *DestTy = (State.VF.isScalar())
+ ? CI->getType()
+ : VectorType::get(CI->getType(), State.VF);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ State.set(this, Cast, Part);
+ State.ILV->addMetadata(Cast, &I);
+ }
+ break;
+ }
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
- *this, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant, State);
+ auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
+ if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = State.Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
+ State.set(this, EntryPart, Part);
+ State.ILV->addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant
+ ? State.get(getOperand(0), VPIteration(0, 0))
+ : State.get(getOperand(0), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+ VPValue *Operand = getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, VPIteration(0, 0)));
+ else
+ Indices.push_back(State.get(Operand, Part));
+ }
+
+      // If the GEP instruction is vectorized and was in a basic block that
+      // needed predication, we can't propagate the poison-generating 'inbounds'
+      // flag. The control flow has been linearized and the GEP is no longer
+      // guarded by the predicate, so the 'inbounds' property may no longer
+      // hold.
+ bool IsInBounds =
+ GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = IsInBounds
+ ? State.Builder.CreateInBoundsGEP(
+ GEP->getSourceElementType(), Ptr, Indices)
+ : State.Builder.CreateGEP(GEP->getSourceElementType(),
+ Ptr, Indices);
+ assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ State.set(this, NewGEP, Part);
+ State.ILV->addMetadata(NewGEP, GEP);
+ }
+ }
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
@@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
- *State.Instance, IsPredicated, State);
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+ IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
@@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
"Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
VPIteration(Part, Lane), IsPredicated,
State);
}
@@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
- State.ILV->vectorizeMemoryInstruction(
- &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
- StoredValue, getMask(), Consecutive, Reverse);
+
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
+ StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
+
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ bool CreateGatherScatter = !Consecutive;
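+  // Non-consecutive accesses are widened as gathers/scatters; consecutive
+  // accesses become one wide (optionally masked) load or store per part.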
+
+ auto &Builder = State.Builder;
+ InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
+ bool isMaskRequired = getMask();
+ if (isMaskRequired)
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ BlockInMaskParts[Part] = State.get(getMask(), Part);
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
+ Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
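+      // For example, with a fixed VF of 4 and Part == 1: NumElt == -4 and
+      // LastLane == -3, so PartPtr ends up at Ptr[-7] and the wide access
+      // covers Ptr[-7..-4].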
+ PartPtr =
+ cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ BlockInMaskParts[Part] =
+ Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
+ } else {
+ Value *Increment =
+ createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
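+    // The GEPs above index in units of the scalar element type; cast the
+    // result to a vector pointer for the wide access.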
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
+ // Handle Stores:
+ if (SI) {
+ State.ILV->setDebugLocFromInst(SI);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(getAddr(), Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
+ } else {
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+ }
+ auto *VecPtr =
+ CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ if (isMaskRequired)
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ }
+ State.ILV->addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ State.ILV->setDebugLocFromInst(LI);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(getAddr(), Part);
+ NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ State.ILV->addMetadata(NewLI, LI);
+ } else {
+ auto *VecPtr =
+ CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ if (isMaskRequired)
+ NewLI = Builder.CreateMaskedLoad(
+ DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+ PoisonValue::get(DataTy), "wide.masked.load");
+ else
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+
+      // Attach the metadata to the wide load itself; if the access is
+      // reversed, the reverse shuffle created below becomes the recorded
+      // vector value.
+ State.ILV->addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ }
+
+ State.set(getVPSingleValue(), NewLI, Part);
+ }
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3ef0b794f68..95061e9053fa 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) {
return false;
}
+/// Checks if the given value is actually an undefined constant vector.
+static bool isUndefVector(const Value *V) {
+ if (isa<UndefValue>(V))
+ return true;
+ auto *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+ if (!C->containsUndefOrPoisonElement())
+ return false;
+ auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
+ if (!VecTy)
+ return false;
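+  // For a fixed vector constant, every element that can be extracted must
+  // itself be undef (or poison).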
+ for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+ if (Constant *Elem = C->getAggregateElement(I))
+ if (!isa<UndefValue>(Elem))
+ return false;
+ }
+ return true;
+}
+
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
@@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) {
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
- auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ const auto *It =
+ find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
+ if (It == VL.end())
+ return None;
+ auto *EI0 = cast<ExtractElementInst>(*It);
if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
return None;
unsigned Size =
@@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
+ Mask.assign(VL.size(), UndefMaskElem);
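+  // Lanes left as UndefMaskElem correspond to undef scalars, extracts from
+  // undef vectors, or extracts with undef or out-of-range indices.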
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ // Undef can be represented as an undef element in a vector.
+ if (isa<UndefValue>(VL[I]))
+ continue;
auto *EI = cast<ExtractElementInst>(VL[I]);
+ if (isa<ScalableVectorType>(EI->getVectorOperandType()))
+ return None;
auto *Vec = EI->getVectorOperand();
+    // We can extractelement from an undef or poison vector.
+ if (isUndefVector(Vec))
+ continue;
// All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
return None;
+ if (isa<UndefValue>(EI->getIndexOperand()))
+ continue;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size)) {
- Mask.push_back(UndefMaskElem);
+ if (Idx->getValue().uge(Size))
continue;
- }
unsigned IntIdx = Idx->getValue().getZExtValue();
- Mask.push_back(IntIdx);
- // We can extractelement from undef or poison vector.
- if (isa<UndefValue>(Vec))
- continue;
+ Mask[I] = IntIdx;
// For correct shuffling there must be at most 2 distinct vector operands
// across all extractelement instructions.
- if (!Vec1 || Vec1 == Vec)
+ if (!Vec1 || Vec1 == Vec) {
Vec1 = Vec;
- else if (!Vec2 || Vec2 == Vec)
+ } else if (!Vec2 || Vec2 == Vec) {
Vec2 = Vec;
- else
+ Mask[I] += Size;
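+      // Lanes taken from the second vector are addressed past the first one,
+      // hence the +Size offset in the mask.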
+ } else {
return None;
+ }
if (CommonShuffleMode == Permute)
continue;
// If the extract index is not the same as the operation number, it is a
@@ -1680,6 +1712,28 @@ private:
return IsSame(Scalars, ReuseShuffleIndices);
}
+  /// \returns true if the current entry has the same operands as \p TE.
+ bool hasEqualOperands(const TreeEntry &TE) const {
+ if (TE.getNumOperands() != getNumOperands())
+ return false;
+ SmallBitVector Used(getNumOperands());
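+    // Greedily match each operand of \p TE against a distinct, not yet used
+    // operand of this entry; the operand order is allowed to differ.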
+ for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
+ unsigned PrevCount = Used.count();
+ for (unsigned K = 0; K < E; ++K) {
+ if (Used.test(K))
+ continue;
+ if (getOperand(K) == TE.getOperand(I)) {
+ Used.set(K);
+ break;
+ }
+ }
+ // Check if we actually found the matching operand.
+ if (PrevCount == Used.count())
+ return false;
+ }
+ return true;
+ }
+
/// \return Final vectorization factor for the node. Defined by the total
/// number of vectorized scalars, including those, used several times in the
/// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1773,6 +1827,12 @@ private:
return Operands[OpIdx];
}
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ArrayRef<Value *> getOperand(unsigned OpIdx) const {
+    assert(OpIdx < Operands.size() && "Out of bounds");
+ return Operands[OpIdx];
+ }
+
/// \returns the number of operands.
unsigned getNumOperands() const { return Operands.size(); }
@@ -2078,7 +2138,7 @@ private:
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherSeq;
+ SetVector<Instruction *> GatherShuffleSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
bool IsGather) {
DenseMap<Value *, int> ExtractVectorsTys;
for (auto *V : VL) {
+ if (isa<UndefValue>(V))
+ continue;
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
- if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
- (IsGather && ScalarToTreeEntry.count(V)))
+ if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
continue;
auto *EE = cast<ExtractElementInst>(V);
- unsigned Idx = *getExtractIndex(EE);
+ Optional<unsigned> EEIdx = getExtractIndex(EE);
+ if (!EEIdx)
+ continue;
+ unsigned Idx = *EEIdx;
if (TTIRef.getNumberOfParts(VecTy) !=
TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
auto It =
@@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
for (const auto &Data : ExtractVectorsTys) {
auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
unsigned NumElts = VecTy->getNumElements();
+ if (Data.second % NumElts == 0)
+ continue;
if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
@@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
}
- if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
- allSameBlock(VL) &&
- !isa<ScalableVectorType>(
- cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) {
+ if ((E->getOpcode() == Instruction::ExtractElement ||
+ all_of(E->Scalars,
+ [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V);
+ })) &&
+ allSameType(VL)) {
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
SmallVector<int> Mask;
@@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return !is_contained(E->Scalars,
cast<Instruction>(V)->getOperand(0));
}));
- if (isa<UndefValue>(FirstInsert->getOperand(0))) {
+ if (isUndefVector(FirstInsert->getOperand(0))) {
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
} else {
SmallVector<int> InsertMask(NumElts);
@@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
InstructionCost VecCost = 0;
- if (Instruction::isBinaryOp(E->getOpcode())) {
+ // Try to find the previous shuffle node with the same operands and same
+ // main/alternate ops.
+ auto &&TryFindNodeWithEqualOperands = [this, E]() {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (TE.get() == E)
+ break;
+ if (TE->isAltShuffle() &&
+ ((TE->getOpcode() == E->getOpcode() &&
+ TE->getAltOpcode() == E->getAltOpcode()) ||
+ (TE->getOpcode() == E->getAltOpcode() &&
+ TE->getAltOpcode() == E->getOpcode())) &&
+ TE->hasEqualOperands(*E))
+ return true;
+ }
+ return false;
+ };
+ if (TryFindNodeWithEqualOperands()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: diamond match for alternate node found.\n";
+ E->dump();
+ });
+ // No need to add new vector costs here since we're going to reuse
+ // same main/alternate vector ops, just do different shuffling.
+ } else if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
@@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
[this](Value *V) { return EphValues.contains(V); }) &&
(allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
TE->Scalars.size() < Limit ||
- (TE->getOpcode() == Instruction::ExtractElement &&
+ ((TE->getOpcode() == Instruction::ExtractElement ||
+ all_of(TE->Scalars,
+ [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V);
+ })) &&
isFixedVectorShuffle(TE->Scalars, Mask)) ||
(TE->State == TreeEntry::NeedToGather &&
TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
@@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
+/// Check if two insertelement instructions are from the same buildvector.
+static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
+ InsertElementInst *V) {
+  // Instructions must be from the same basic block.
+ if (VU->getParent() != V->getParent())
+ return false;
+  // The two inserts must build vectors of the same type.
+ if (VU->getType() != V->getType())
+ return false;
+  // Inserts with multiple uses are treated as separate buildvector nodes.
+ if (!VU->hasOneUse() && !V->hasOneUse())
+ return false;
+ auto *IE1 = VU;
+ auto *IE2 = V;
+  // Walk the vector operands of both insertelement chains, trying to find
+  // either VU as the original vector of IE2 or V as the original vector of
+  // IE1.
+ do {
+ if (IE2 == VU || IE1 == V)
+ return true;
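+    // An insertelement with more than one use (other than the chain heads VU
+    // and V) starts a separate buildvector, so stop walking through it.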
+ if (IE1) {
+ if (IE1 != VU && !IE1->hasOneUse())
+ IE1 = nullptr;
+ else
+ IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+ }
+ if (IE2) {
+ if (IE2 != V && !IE2->hasOneUse())
+ IE2 = nullptr;
+ else
+ IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+ }
+ } while (IE1 || IE2);
+ return false;
+}
+
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
+ !ExtractCostCalculated.insert(EU.Scalar).second)
continue;
// Uses by ephemeral values are free (because the ephemeral value will be
@@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
- if (isa_and_nonnull<InsertElementInst>(EU.User)) {
- if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
- Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+ if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+ Optional<int> InsertIdx = getInsertIndex(VU, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- Value *VU = EU.User;
auto *It = find_if(FirstUsers, [VU](Value *V) {
- // Checks if 2 insertelements are from the same buildvector.
- if (VU->getType() != V->getType())
- return false;
- auto *IE1 = cast<InsertElementInst>(VU);
- auto *IE2 = cast<InsertElementInst>(V);
- // Go through of insertelement instructions trying to find either VU
- // as the original vector for IE2 or V as the original vector for IE1.
- do {
- if (IE1 == VU || IE2 == V)
- return true;
- if (IE1)
- IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
- if (IE2)
- IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
- } while (IE1 || IE2);
- return false;
+ return areTwoInsertFromSameBuildVector(VU,
+ cast<InsertElementInst>(V));
});
int VecId = -1;
if (It == FirstUsers.end()) {
VF.push_back(FTy->getNumElements());
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
- FirstUsers.push_back(EU.User);
+      // Find the insertelement chain vectorized in the tree, if any.
+ Value *Base = VU;
+ while (isa<InsertElementInst>(Base)) {
+ // Build the mask for the vectorized insertelement instructions.
+ if (const TreeEntry *E = getTreeEntry(Base)) {
+ VU = cast<InsertElementInst>(Base);
+ do {
+ int Idx = E->findLaneForValue(Base);
+ ShuffleMask.back()[Idx] = Idx;
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+ } while (E == getTreeEntry(Base));
+ break;
+ }
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+ }
+ FirstUsers.push_back(VU);
DemandedElts.push_back(APInt::getZero(VF.back()));
VecId = FirstUsers.size() - 1;
} else {
@@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
int Idx = *InsertIdx;
ShuffleMask[VecId][Idx] = EU.Lane;
DemandedElts[VecId].setBit(Idx);
+ continue;
}
}
@@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
- // For the very first element - simple shuffle of the source vector.
- int Limit = ShuffleMask[I].size() * 2;
- if (I == 0 &&
- all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
- !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
+ if (FirstUsers.size() == 1) {
+ int Limit = ShuffleMask.front().size() * 2;
+ if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
+ !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
InstructionCost C = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc,
- cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+ cast<FixedVectorType>(FirstUsers.front()->getType()),
+ ShuffleMask.front());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement external users "
<< *VectorizableTree.front()->Scalars.front() << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
Cost += C;
- continue;
}
- // Other elements - permutation of 2 vectors (the initial one and the next
- // Ith incoming vector).
- unsigned VF = ShuffleMask[I].size();
- for (unsigned Idx = 0; Idx < VF; ++Idx) {
- int &Mask = ShuffleMask[I][Idx];
- Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
- }
- InstructionCost C = TTI->getShuffleCost(
- TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
- ShuffleMask[I]);
- LLVM_DEBUG(
- dbgs()
- << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
- /*Insert*/ true,
- /*Extract*/ false);
+ cast<FixedVectorType>(FirstUsers.front()->getType()),
+ DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
+ LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+ << " for insertelements gather.\n"
+ << "SLP: Current total cost = " << Cost << "\n");
Cost -= InsertCost;
+ } else if (FirstUsers.size() >= 2) {
+ unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
+ // Combined masks of the first 2 vectors.
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
+ copy(ShuffleMask.front(), CombinedMask.begin());
+ APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
+ auto *VecTy = FixedVectorType::get(
+ cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
+ MaxVF);
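+    // Overlay the mask of the second vector; its lanes are offset by MaxVF so
+    // that a single two-source shuffle can select from either vector.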
+ for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
+ if (ShuffleMask[1][I] != UndefMaskElem) {
+ CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
+ CombinedDemandedElts.setBit(I);
+ }
+ }
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ InstructionCost InsertCost = TTI->getScalarizationOverhead(
+ VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
<< " for insertelements gather.\n"
<< "SLP: Current total cost = " << Cost << "\n");
+ Cost -= InsertCost;
+ for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
+ // Other elements - permutation of 2 vectors (the initial one and the
+ // next Ith incoming vector).
+ unsigned VF = ShuffleMask[I].size();
+ for (unsigned Idx = 0; Idx < VF; ++Idx) {
+ int Mask = ShuffleMask[I][Idx];
+ if (Mask != UndefMaskElem)
+ CombinedMask[Idx] = MaxVF + Mask;
+ else if (CombinedMask[Idx] != UndefMaskElem)
+ CombinedMask[Idx] = Idx;
+ }
+ for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
+ if (CombinedMask[Idx] != UndefMaskElem)
+ CombinedMask[Idx] = Idx;
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ InstructionCost InsertCost = TTI->getScalarizationOverhead(
+ cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+ /*Insert*/ true, /*Extract*/ false);
+ LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+ << " for insertelements gather.\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost -= InsertCost;
+ }
}
#ifndef NDEBUG
@@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
return Vec;
- GatherSeq.insert(InsElt);
+ GatherShuffleSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder {
const unsigned VF = 0;
bool IsFinalized = false;
SmallVector<int, 4> Mask;
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> &GatherShuffleSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> &CSEBlocks;
public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
- : Builder(Builder), VF(VF) {}
+ ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF,
+ SetVector<Instruction *> &GatherShuffleSeq,
+ SetVector<BasicBlock *> &CSEBlocks)
+ : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq),
+ CSEBlocks(CSEBlocks) {}
/// Adds a mask, inverting it before applying.
void addInversedMask(ArrayRef<unsigned> SubMask) {
@@ -5804,7 +5983,12 @@ public:
if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
return V;
- return Builder.CreateShuffleVector(V, Mask, "shuffle");
+ Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
}
~ShuffleInstructionBuilder() {
@@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
std::iota(UniformMask.begin(), UniformMask.end(), 0);
V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
}
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
return V;
}
@@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
VL = UniqueValues;
}
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
ShuffleBuilder.addMask(ReuseShuffleIndicies);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
return Vec;
}
@@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
if (E->State == TreeEntry::NeedToGather) {
if (E->getMainOp())
setInsertPointAfterBundle(E);
@@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
Entries.back()->VectorizedValue, Mask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
} else {
Vec = gather(E->Scalars);
}
if (NeedToShuffleReuses) {
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = Vec;
return Vec;
@@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IsIdentity &= *InsertIdx - Offset == I;
Mask[*InsertIdx - Offset] = I;
}
- if (!IsIdentity || NumElts != NumScalars)
+ if (!IsIdentity || NumElts != NumScalars) {
V = Builder.CreateShuffleVector(V, Mask);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
if ((!IsIdentity || Offset != 0 ||
- !isa<UndefValue>(FirstInsert->getOperand(0))) &&
+ !isUndefVector(FirstInsert->getOperand(0))) &&
NumElts != NumScalars) {
SmallVector<int> InsertMask(NumElts);
std::iota(InsertMask.begin(), InsertMask.end(), 0);
@@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateShuffleVector(
FirstInsert->getOperand(0), V, InsertMask,
cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
++NumVectorInstructions;
@@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
+ // Add V0 and V1 to later analysis to try to find and remove matching
+ // instruction, if any.
+ for (Value *V : {V0, V1}) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
- if (Instruction *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
V = propagateMetadata(I, E->Scalars);
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *I : GatherSeq) {
+ for (Instruction *I : GatherShuffleSeq) {
if (isDeleted(I))
continue;
@@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (Op0 && L->contains(Op0))
- continue;
- if (Op1 && L->contains(Op1))
+ if (any_of(I->operands(), [L](Value *V) {
+ auto *OpI = dyn_cast<Instruction>(V);
+ return OpI && L->contains(OpI);
+ }))
continue;
// We can hoist this instruction. Move it to the pre-header.
@@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() {
return A->getDFSNumIn() < B->getDFSNumIn();
});
- // Perform O(N^2) search over the gather sequences and merge identical
+  // A less defined shuffle can be replaced by its more defined copy.
+  // Of two shuffles with the same vector operands, one is less defined if each
+  // of its mask indices either matches the other mask or is undef. E.g.
+  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
+  // poison, <0, 0, 0, 0>.
+ auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
+ SmallVectorImpl<int> &NewMask) {
+ if (I1->getType() != I2->getType())
+ return false;
+ auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
+ auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
+ if (!SI1 || !SI2)
+ return I1->isIdenticalTo(I2);
+ if (SI1->isIdenticalTo(SI2))
+ return true;
+ for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
+ if (SI1->getOperand(I) != SI2->getOperand(I))
+ return false;
+ // Check if the second instruction is more defined than the first one.
+ NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
+ ArrayRef<int> SM1 = SI1->getShuffleMask();
+ // Count trailing undefs in the mask to check the final number of used
+ // registers.
+ unsigned LastUndefsCnt = 0;
+ for (int I = 0, E = NewMask.size(); I < E; ++I) {
+ if (SM1[I] == UndefMaskElem)
+ ++LastUndefsCnt;
+ else
+ LastUndefsCnt = 0;
+ if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ NewMask[I] != SM1[I])
+ return false;
+ if (NewMask[I] == UndefMaskElem)
+ NewMask[I] = SM1[I];
+ }
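+    // At this point NewMask is SI2's mask with its undef lanes filled in from
+    // SI1, i.e. the merged, more defined mask of the two shuffles.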
+ // Check if the last undefs actually change the final number of used vector
+ // registers.
+ return SM1.size() - LastUndefsCnt > 1 &&
+ TTI->getNumberOfParts(SI1->getType()) ==
+ TTI->getNumberOfParts(
+ FixedVectorType::get(SI1->getType()->getElementType(),
+ SM1.size() - LastUndefsCnt));
+ };
+ // Perform O(N^2) search over the gather/shuffle sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
@@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() {
if (isDeleted(&In))
continue;
if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
- !isa<ShuffleVectorInst>(&In))
+ !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
bool Replaced = false;
- for (Instruction *v : Visited) {
- if (In.isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In.getParent())) {
- In.replaceAllUsesWith(v);
+ for (Instruction *&V : Visited) {
+ SmallVector<int> NewMask;
+ if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
+ DT->dominates(V->getParent(), In.getParent())) {
+ In.replaceAllUsesWith(V);
eraseInstruction(&In);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ Replaced = true;
+ break;
+ }
+ if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
+ GatherShuffleSeq.contains(V) &&
+ IsIdenticalOrLessDefined(V, &In, NewMask) &&
+ DT->dominates(In.getParent(), V->getParent())) {
+ In.moveAfter(V);
+ V->replaceAllUsesWith(&In);
+ eraseInstruction(V);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ V = &In;
Replaced = true;
break;
}
@@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
CSEBlocks.clear();
- GatherSeq.clear();
+ GatherShuffleSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling entity)
@@ -8791,6 +9057,8 @@ private:
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ assert(RdxKind != RecurKind::FMulAdd &&
+ "A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
@@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<int> Mask;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- (llvm::all_of(BuildVectorOpds,
- [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ (llvm::all_of(
+ BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&
isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
@@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
return tryToVectorizeList(BuildVectorInsts, R);
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
- bool OpsChanged = false;
- SmallVector<Instruction *, 4> PostponedCmps;
- for (auto *I : reverse(Instructions)) {
- if (R.isDeleted(I))
- continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
- OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
- OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (isa<CmpInst>(I))
- PostponedCmps.push_back(I);
- }
- if (AtTerminator) {
- // Try to find reductions first.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- for (Value *Op : I->operands())
- OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
- }
- // Try to vectorize operands as vector bundles.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- OpsChanged |= tryToVectorize(I, R);
- }
- Instructions.clear();
- } else {
- // Insert in reverse order since the PostponedCmps vector was filled in
- // reverse order.
- Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
- }
- return OpsChanged;
-}
-
template <typename T>
static bool
tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
@@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
return Changed;
}
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+ bool AtTerminator) {
+ bool OpsChanged = false;
+ SmallVector<Instruction *, 4> PostponedCmps;
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (isa<CmpInst>(I))
+ PostponedCmps.push_back(I);
+ }
+ if (AtTerminator) {
+ // Try to find reductions first.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ OpsChanged |= tryToVectorize(I, R);
+ }
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ // TODO: Add analysis on the operand opcodes (profitable to vectorize
+ // instructions with same/alternate opcodes/const values).
+ auto &&CompareSorter = [&R](Value *V, Value *V2) {
+ auto *CI1 = cast<CmpInst>(V);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType()->getTypeID() <
+ CI2->getOperand(0)->getType()->getTypeID())
+ return true;
+ if (CI1->getOperand(0)->getType()->getTypeID() >
+ CI2->getOperand(0)->getType()->getTypeID())
+ return false;
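+      // Order by predicate, keeping a predicate and its swapped form close
+      // together so that compatible compares become neighbours.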
+ return CI1->getPredicate() < CI2->getPredicate() ||
+ (CI1->getPredicate() > CI2->getPredicate() &&
+ CI1->getPredicate() <
+ CmpInst::getSwappedPredicate(CI2->getPredicate()));
+ };
+
+ auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ auto *CI1 = cast<CmpInst>(V1);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
+ return false;
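+      // Equal or swapped predicates can share one vector bundle, since a
+      // swapped predicate only requires swapping the compare operands.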
+ return CI1->getPredicate() == CI2->getPredicate() ||
+ CI1->getPredicate() ==
+ CmpInst::getSwappedPredicate(CI2->getPredicate());
+ };
+ auto Limit = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+
+ SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
+ OpsChanged |= tryToVectorizeSequence<Value>(
+ Vals, Limit, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock =
+ any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ return isa<SelectInst>(U) &&
+ cast<SelectInst>(U)->getParent() !=
+ cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ },
+ /*LimitForRegisterSize=*/true);
+ Instructions.clear();
+ } else {
+ // Insert in reverse order since the PostponedCmps vector was filled in
+ // reverse order.
+ Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
+ }
+ return OpsChanged;
+}
+
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 638467f94e1c..44b5e1df0839 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State,
void VPInstruction::execute(VPTransformState &State) {
assert(!State.Instance && "VPInstruction executing an Instance");
+ IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
+ State.Builder.setFastMathFlags(FMF);
for (unsigned Part = 0; Part < State.UF; ++Part)
generateInstruction(State, Part);
}
@@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
O << Instruction::getOpcodeName(getOpcode());
}
+ O << FMF;
+
for (const VPValue *Operand : operands()) {
O << " ";
Operand->printAsOperand(O, SlotTracker);
@@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
+ // Make sure the VPInstruction is a floating-point operation.
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+ Opcode == Instruction::FCmp) &&
+ "this op can't take fast-math flags");
+ FMF = FMFNew;
+}
+
/// Generate the code inside the body of the vectorized loop. Assumes a single
/// LoopVectorBody basic-block was created for this. Introduce additional
/// basic-blocks as needed, and fill them all.
@@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = ";
getChainOp()->printAsOperand(O, SlotTracker);
- O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode())
- << " (";
+ O << " +";
+ if (isa<FPMathOperator>(getUnderlyingInstr()))
+ O << getUnderlyingInstr()->getFastMathFlags();
+ O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
getVecOp()->printAsOperand(O, SlotTracker);
if (getCondOp()) {
O << ", ";
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 00ee31007cb7..810dd5030f95 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -59,6 +59,7 @@ class Value;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
+class VPReplicateRecipe;
class VPlanSlp;
/// Returns a calculation for the total number of elements for a given \p VF.
@@ -346,6 +347,10 @@ struct VPTransformState {
/// Pointer to the VPlan code is generated for.
VPlan *Plan;
+
+ /// Holds recipes that may generate a poison value that is used after
+ /// vectorization, even when their operands are not poison.
+ SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
};
/// VPUsers instance used by VPBlockBase to manage CondBit and the block
@@ -789,6 +794,7 @@ public:
private:
typedef unsigned char OpcodeTy;
OpcodeTy Opcode;
+ FastMathFlags FMF;
/// Utility method serving execute(): generates a single instance of the
/// modeled instruction.
@@ -802,13 +808,6 @@ public:
: VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
- VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands)
- : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}),
- VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {
- for (auto *I : Operands)
- addOperand(I->getVPSingleValue());
- }
-
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
: VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
@@ -870,6 +869,9 @@ public:
return true;
}
}
+
+ /// Set the fast-math flags.
+ void setFastMathFlags(FastMathFlags FMFNew);
};
/// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -1511,7 +1513,7 @@ public:
/// - For store: Address, stored value, optional mask
/// TODO: We currently execute only per-part unless a specific instance is
/// provided.
-class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue {
Instruction &Ingredient;
// Whether the loaded-from / stored-to addresses are consecutive.
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
public:
VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
bool Consecutive, bool Reverse)
- : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
+ : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}),
+ VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load),
Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
- new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
setMask(Mask);
}
@@ -1544,6 +1546,7 @@ public:
VPValue *StoredValue, VPValue *Mask,
bool Consecutive, bool Reverse)
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
+ VPValue(VPValue::VPVMemoryInstructionSC, &Store, this),
Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
setMask(Mask);