| author    | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:49:08 +0000 |
|-----------|----------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:43:49 +0000 |
| commit    | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (patch) | |
| tree      | 5ca6493b1b0bf6a41f257794c0116d5e50fbf37c /contrib/llvm-project/llvm/lib/Transforms | |
| parent    | 5e801ac66d24704442eba426ed13c3effb8a34e7 (diff) | |
| parent    | f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
43 files changed, 2179 insertions, 1178 deletions
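One of the InstCombine changes carried in this merge (see the InstCombineAndOrXor.cpp hunks below) adds a fold of `or` into `mul` when the two operands are known to share no set bits: `or (mul X, Y), X --> mul X, (Y + 1)`. The following standalone C++ sketch is not part of the commit; it only spot-checks the underlying arithmetic identity on sample values chosen so the no-common-bits precondition holds.

```cpp
// Minimal sketch (not from the commit): when (X * Y) and X have no common
// bits set, OR acts like ADD, and X*Y + X factors into X*(Y + 1), which is
// the shape the new InstCombine fold produces.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x10;   // low nibble block
  uint32_t Y = 0x100;  // X * Y == 0x1000, disjoint from X's bits
  assert(((X * Y) & X) == 0);            // precondition: no common bits set
  assert(((X * Y) | X) == X * (Y + 1));  // OR form equals the folded mul form
  return 0;
}
```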
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b2c2efed7db8..ba7589c2bf60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -275,94 +276,64 @@ CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers( - Value *V, Constant *Init, const DataLayout &DL, - function_ref<TargetLibraryInfo &(Function &)> GetTLI) { +static bool CleanupConstantGlobalUsers(GlobalVariable *GV, + const DataLayout &DL) { + Constant *Init = GV->getInitializer(); + SmallVector<User *, 8> WorkList(GV->users()); + SmallPtrSet<User *, 8> Visited; bool Changed = false; - // Note that we need to use a weak value handle for the worklist items. When - // we delete a constant array, we may also be holding pointer to one of its - // elements (or an element of one of its elements if we're dealing with an - // array of arrays) in the worklist. - SmallVector<WeakTrackingVH, 8> WorkList(V->users()); + + SmallVector<WeakTrackingVH> MaybeDeadInsts; + auto EraseFromParent = [&](Instruction *I) { + for (Value *Op : I->operands()) + if (auto *OpI = dyn_cast<Instruction>(Op)) + MaybeDeadInsts.push_back(OpI); + I->eraseFromParent(); + Changed = true; + }; while (!WorkList.empty()) { - Value *UV = WorkList.pop_back_val(); - if (!UV) + User *U = WorkList.pop_back_val(); + if (!Visited.insert(U).second) continue; - User *U = cast<User>(UV); + if (auto *BO = dyn_cast<BitCastOperator>(U)) + append_range(WorkList, BO->users()); + if (auto *ASC = dyn_cast<AddrSpaceCastOperator>(U)) + append_range(WorkList, ASC->users()); + else if (auto *GEP = dyn_cast<GEPOperator>(U)) + append_range(WorkList, GEP->users()); + else if (auto *LI = dyn_cast<LoadInst>(U)) { + // A load from zeroinitializer is always zeroinitializer, regardless of + // any applied offset. + if (Init->isNullValue()) { + LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + EraseFromParent(LI); + continue; + } - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (Init) { - if (auto *Casted = - ConstantFoldLoadThroughBitcast(Init, LI->getType(), DL)) { - // Replace the load with the initializer. - LI->replaceAllUsesWith(Casted); - LI->eraseFromParent(); - Changed = true; + Value *PtrOp = LI->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true); + if (PtrOp == GV) { + if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), + Offset, DL)) { + LI->replaceAllUsesWith(Value); + EraseFromParent(LI); } } } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // Store must be unreachable or storing Init into the global. 
- SI->eraseFromParent(); - Changed = true; - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { - if (CE->getOpcode() == Instruction::GetElementPtr) { - Constant *SubInit = nullptr; - if (Init) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); - } else if ((CE->getOpcode() == Instruction::BitCast && - CE->getType()->isPointerTy()) || - CE->getOpcode() == Instruction::AddrSpaceCast) { - // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); - } - - if (CE->use_empty()) { - CE->destroyConstant(); - Changed = true; - } - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // Do not transform "gepinst (gep constexpr (GV))" here, because forming - // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold - // and will invalidate our notion of what Init is. - Constant *SubInit = nullptr; - if (!isa<ConstantExpr>(GEP->getOperand(0))) { - ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( - ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); - if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - - // If the initializer is an all-null value and we have an inbounds GEP, - // we already know what the result of any load from that GEP is. - // TODO: Handle splats. - if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) - SubInit = Constant::getNullValue(GEP->getResultElementType()); - } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); - - if (GEP->use_empty()) { - GEP->eraseFromParent(); - Changed = true; - } + EraseFromParent(SI); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv - if (MI->getRawDest() == V) { - MI->eraseFromParent(); - Changed = true; - } - - } else if (Constant *C = dyn_cast<Constant>(U)) { - // If we have a chain of dead constantexprs or other things dangling from - // us, and if they are all dead, nuke them without remorse. - if (isSafeToDestroyConstant(C)) { - C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, GetTLI); - return true; - } + if (getUnderlyingObject(MI->getRawDest()) == GV) + EraseFromParent(MI); } } + + Changed |= + RecursivelyDeleteTriviallyDeadInstructionsPermissive(MaybeDeadInsts); + GV->removeDeadConstantUsers(); return Changed; } @@ -889,7 +860,7 @@ static bool OptimizeAwayTrappingUsesOfLoads( Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -1557,8 +1528,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed = CleanupConstantGlobalUsers(GV, DL); } // If the global is dead now, delete it. @@ -1583,7 +1553,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } // Clean up any obviously simplifiable users now. - Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed |= CleanupConstantGlobalUsers(GV, DL); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { @@ -1628,7 +1598,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f342c35fa283..055ee6b50296 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1885,6 +1885,7 @@ private: OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); @@ -3727,12 +3728,37 @@ struct AAKernelInfoFunction : AAKernelInfo { CheckRWInst, *this, UsedAssumedInformationInCheckRWInst)) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + bool UsedAssumedInformationFromReachingKernels = false; if (!IsKernelEntry) { - updateReachingKernelEntries(A); updateParallelLevels(A); + bool AllReachingKernelsKnown = true; + updateReachingKernelEntries(A, AllReachingKernelsKnown); + UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown; + if (!ParallelLevels.isValidState()) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!ReachingKernelEntries.isValidState()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!SPMDCompatibilityTracker.empty()) { + // Check if all reaching kernels agree on the mode as we can otherwise + // not guard instructions. We might not be sure about the mode so we + // we cannot fix the internal spmd-zation state either. + int SPMD = 0, Generic = 0; + for (auto *Kernel : ReachingKernelEntries) { + auto &CBAA = A.getAAFor<AAKernelInfo>( + *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL); + if (CBAA.SPMDCompatibilityTracker.isValidState() && + CBAA.SPMDCompatibilityTracker.isAssumed()) + ++SPMD; + else + ++Generic; + if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint()) + UsedAssumedInformationFromReachingKernels = true; + } + if (SPMD != 0 && Generic != 0) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + } } // Callback to check a call instruction. @@ -3779,7 +3805,8 @@ struct AAKernelInfoFunction : AAKernelInfo { // If we haven't used any assumed information for the SPMD state we can fix // it. if (!UsedAssumedInformationInCheckRWInst && - !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed) + !UsedAssumedInformationInCheckCallInst && + !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); return StateBefore == getState() ? ChangeStatus::UNCHANGED @@ -3788,7 +3815,8 @@ struct AAKernelInfoFunction : AAKernelInfo { private: /// Update info regarding reaching kernels. 
- void updateReachingKernelEntries(Attributor &A) { + void updateReachingKernelEntries(Attributor &A, + bool &AllReachingKernelsKnown) { auto PredCallSite = [&](AbstractCallSite ACS) { Function *Caller = ACS.getInstruction()->getFunction(); @@ -3808,10 +3836,9 @@ private: return true; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(PredCallSite, *this, true /* RequireAllCallSites */, - AllCallSitesKnown)) + AllReachingKernelsKnown)) ReachingKernelEntries.indicatePessimisticFixpoint(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index 7402e399a88a..2d717475ce7f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -641,8 +641,7 @@ PartialInlinerImpl::computeOutliningInfo(Function &F) const { if (!CandidateFound) return std::unique_ptr<FunctionOutliningInfo>(); - // Do sanity check of the entries: threre should not - // be any successors (not in the entry set) other than + // There should not be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} assert(OutliningInfo->Entries[0] == &F.front() && "Function Entry must be the first in Entries vector"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index a961c47a7501..b8fac9d47763 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -84,6 +84,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" #include <algorithm> @@ -173,6 +174,9 @@ static cl::opt<bool> cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on.")); +cl::opt<bool> + SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, + cl::desc("Sort profiled recursion by edge weights.")); static cl::opt<bool> ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), @@ -1648,6 +1652,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { SmallVector<uint32_t, 4> Weights; uint32_t MaxWeight = 0; Instruction *MaxDestInst; + // Since profi treats multiple edges (multiway branches) as a single edge, + // we need to distribute the computed weight among the branches. We do + // this by evenly splitting the edge weight among destinations. 
+ DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; + std::vector<uint64_t> EdgeIndex; + if (SampleProfileUseProfi) { + EdgeIndex.resize(TI->getNumSuccessors()); + for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { + const BasicBlock *Succ = TI->getSuccessor(I); + EdgeIndex[I] = EdgeMultiplicity[Succ]; + EdgeMultiplicity[Succ]++; + } + } for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); @@ -1660,9 +1677,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); Weight = std::numeric_limits<uint32_t>::max(); } - // Weight is added by one to avoid propagation errors introduced by - // 0 weights. - Weights.push_back(static_cast<uint32_t>(Weight + 1)); + if (!SampleProfileUseProfi) { + // Weight is added by one to avoid propagation errors introduced by + // 0 weights. + Weights.push_back(static_cast<uint32_t>(Weight + 1)); + } else { + // Profi creates proper weights that do not require "+1" adjustments but + // we evenly split the weight among branches with the same destination. + uint64_t W = Weight / EdgeMultiplicity[Succ]; + // Rounding up, if needed, so that first branches are hotter. + if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) + W++; + Weights.push_back(static_cast<uint32_t>(W)); + } if (Weight != 0) { if (Weight > MaxWeight) { MaxWeight = Weight; @@ -1853,7 +1880,13 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG); scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get()); while (!CGI.isAtEnd()) { - for (ProfiledCallGraphNode *Node : *CGI) { + auto Range = *CGI; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator<ProfiledCallGraph *> SI(*CGI); + Range = *SI; + } + for (auto *Node : Range) { Function *F = SymbolMap.lookup(Node->Name); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 06c9bf650f37..dc55b5a31596 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1727,16 +1727,18 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - Value *A, *B, *C; + Value *A, *B, *C, *X, *Y; // (~(A | B) & C) | ... --> ... // (~(A & B) | C) & ... --> ... // TODO: One use checks are conservative. We just need to check that a total // number of multiple used values does not exceed reduction // in operations. 
- if (match(Op0, m_c_BinOp(FlippedOpcode, - m_Not(m_BinOp(Opcode, m_Value(A), m_Value(B))), - m_Value(C)))) { + if (match(Op0, + m_c_BinOp(FlippedOpcode, + m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A), + m_Value(B)))), + m_Value(C)))) { // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A) if (match(Op1, @@ -1776,6 +1778,21 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C))))))) return BinaryOperator::CreateNot(Builder.CreateBinOp( Opcode, Builder.CreateBinOp(FlippedOpcode, A, C), B)); + + // (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B))) + // Note, the pattern with swapped and/or is not handled because the + // result is more undefined than a source: + // (~(A & B) | C) & ~(C & (A ^ B)) --> (A ^ B ^ C) | ~(A | C) is invalid. + if (Opcode == Instruction::Or && Op0->hasOneUse() && + match(Op1, m_OneUse(m_Not(m_CombineAnd( + m_Value(Y), + m_c_BinOp(Opcode, m_Specific(C), + m_c_Xor(m_Specific(A), m_Specific(B)))))))) { + // X = ~(A | B) + // Y = (C | (A ^ B) + Value *Or = cast<BinaryOperator>(X)->getOperand(0); + return BinaryOperator::CreateNot(Builder.CreateAnd(Or, Y)); + } } return nullptr; @@ -2061,7 +2078,14 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *CastedAnd = foldCastedBitwiseLogic(I)) return CastedAnd; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? Value *A; if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) @@ -2322,11 +2346,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) { Value *Cond; Value *NotB; if (match(A, m_SExt(m_Value(Cond))) && - Cond->getType()->isIntOrIntVectorTy(1) && - match(B, m_OneUse(m_Not(m_Value(NotB))))) { - NotB = peekThroughBitcast(NotB, true); - if (match(NotB, m_SExt(m_Specific(Cond)))) + Cond->getType()->isIntOrIntVectorTy(1)) { + // A = sext i1 Cond; B = sext (not (i1 Cond)) + if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return Cond; + + // A = sext i1 Cond; B = not ({bitcast} (sext (i1 Cond))) + // TODO: The one-use checks are unnecessary or misplaced. If the caller + // checked for uses on logic ops/casts, that should be enough to + // make this transform worthwhile. + if (match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } } // All scalar (and most vector) possibilities should be handled now. @@ -2569,7 +2602,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { return replaceInstUsesWith(I, V); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (I.getType()->isIntOrIntVectorTy(1)) { + Type *Ty = I.getType(); + if (Ty->isIntOrIntVectorTy(1)) { if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { if (auto *I = foldAndOrOfSelectUsingImpliedCond(Op1, *SI0, /* IsAnd */ false)) @@ -2602,7 +2636,16 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0 // The check for a 'not' op is for efficiency (if Y is known zero --> ~X). 
Value *Or = Builder.CreateOr(X, Y); - return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV)); + return BinaryOperator::CreateXor(Or, ConstantInt::get(Ty, *CV)); + } + + // If the operands have no common bits set: + // or (mul X, Y), X --> add (mul X, Y), X --> mul X, (Y + 1) + if (match(&I, + m_c_Or(m_OneUse(m_Mul(m_Value(X), m_Value(Y))), m_Deferred(X))) && + haveNoCommonBitsSet(Op0, Op1, DL)) { + Value *IncrementY = Builder.CreateAdd(Y, ConstantInt::get(Ty, 1)); + return BinaryOperator::CreateMul(X, IncrementY); } // (A & C) | (B & D) @@ -2635,14 +2678,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // iff (C0 & C1) == 0 and (X & ~C0) == 0 if (match(A, m_c_Or(m_Value(X), m_Specific(B))) && MaskedValueIsZero(X, ~*C0, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(A, C01); } // (A & C0) | ((X | A) & C1) --> (X | A) & (C0 | C1) // iff (C0 & C1) == 0 and (X & ~C1) == 0 if (match(B, m_c_Or(m_Value(X), m_Specific(A))) && MaskedValueIsZero(X, ~*C1, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(B, C01); } // ((X | C2) & C0) | ((X | C3) & C1) --> (X | C2 | C3) & (C0 | C1) @@ -2652,7 +2695,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { match(B, m_Or(m_Specific(X), m_APInt(C3))) && (*C2 & ~*C0).isZero() && (*C3 & ~*C1).isZero()) { Value *Or = Builder.CreateOr(X, *C2 | *C3, "bitfield"); - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(Or, C01); } } @@ -2788,13 +2831,20 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *CastedOr = foldCastedBitwiseLogic(I)) return CastedOr; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op1); if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op0); // Note: If we've gotten to the point of visiting the outer OR, then the // inner one couldn't be simplified. If it was a constant, then it won't @@ -2826,7 +2876,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X. 
{ Value *X, *Y; - Type *Ty = I.getType(); if (match(&I, m_c_Or(m_OneUse(m_AShr( m_NSWSub(m_Value(Y), m_Value(X)), m_SpecificInt(Ty->getScalarSizeInBits() - 1))), @@ -2876,7 +2925,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(&I, m_c_Or(m_Add(m_Shl(m_One(), m_Value(X)), m_AllOnes()), m_Shl(m_One(), m_Deferred(X)))) && match(&I, m_c_Or(m_OneUse(m_Value()), m_Value()))) { - Type *Ty = X->getType(); Value *Sub = Builder.CreateSub( ConstantInt::get(Ty, Ty->getScalarSizeInBits() - 1), X); return BinaryOperator::CreateLShr(Constant::getAllOnesValue(Ty), Sub); @@ -3601,6 +3649,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A)))) return BinaryOperator::CreateOr(A, B); + // (~A | B) ^ A --> ~(A & B) + if (match(Op0, m_OneUse(m_c_Or(m_Not(m_Specific(Op1)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op1, B)); + + // A ^ (~A | B) --> ~(A & B) + if (match(Op1, m_OneUse(m_c_Or(m_Not(m_Specific(Op0)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op0, B)); + // (A | B) ^ (A | C) --> (B ^ C) & ~A -- There are 4 commuted variants. // TODO: Loosen one-use restriction if common operand is a constant. Value *D; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bfa7bfa2290a..7da2669e1d13 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2641,7 +2641,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { ArgNo++; } - assert(ArgNo == Call.arg_size() && "sanity check"); + assert(ArgNo == Call.arg_size() && "Call arguments not processed correctly."); if (!ArgNos.empty()) { AttributeList AS = Call.getAttributes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ca87477c5d81..33f217659c01 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2771,7 +2771,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))), m_Value(Y), m_ConstantInt(IndexC)))) && DestTy->isIntegerTy() && X->getType() == DestTy && - isDesirableIntType(BitWidth)) { + Y->getType()->isIntegerTy() && isDesirableIntType(BitWidth)) { // Adjust for big endian - the LSBs are at the high index. 
if (DL.isBigEndian()) IndexC = SrcVTy->getNumElements() - 1 - IndexC; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7a9e177f19da..ed53b88aed61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1894,23 +1895,6 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1)))); } - // (X & C2) == 0 -> (trunc X) >= 0 - // (X & C2) != 0 -> (trunc X) < 0 - // iff C2 is a power of 2 and it masks the sign bit of a legal integer type. - const APInt *C2; - if (And->hasOneUse() && C.isZero() && match(Y, m_APInt(C2))) { - int32_t ExactLogBase2 = C2->exactLogBase2(); - if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { - Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1); - if (auto *AndVTy = dyn_cast<VectorType>(And->getType())) - NTy = VectorType::get(NTy, AndVTy->getElementCount()); - Value *Trunc = Builder.CreateTrunc(X, NTy); - auto NewPred = - Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT; - return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy)); - } - } - return nullptr; } @@ -2803,7 +2787,8 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; - assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && + "basic correctness failure"); RHS2 = FlippedStrictness->second; // And kind-of perform the result swap. std::swap(Less, Greater); @@ -4614,7 +4599,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, InstCombiner::BuilderTy &Builder) { - const ICmpInst::Predicate Pred = ICmp.getPredicate(); + ICmpInst::Predicate Pred = ICmp.getPredicate(); Value *Op0 = ICmp.getOperand(0), *Op1 = ICmp.getOperand(1); // Try to canonicalize trunc + compare-to-constant into a mask + cmp. @@ -4624,41 +4609,31 @@ static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, if (!match(Op0, m_OneUse(m_Trunc(m_Value(X)))) || !match(Op1, m_APInt(C))) return nullptr; + // This matches patterns corresponding to tests of the signbit as well as: + // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) + // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) + APInt Mask; + if (decomposeBitTestICmp(Op0, Op1, Pred, X, Mask, true /* WithTrunc */)) { + Value *And = Builder.CreateAnd(X, Mask); + Constant *Zero = ConstantInt::getNullValue(X->getType()); + return new ICmpInst(Pred, And, Zero); + } + unsigned SrcBits = X->getType()->getScalarSizeInBits(); - if (Pred == ICmpInst::ICMP_ULT) { - if (C->isPowerOf2()) { - // If C is a power-of-2 (one set bit): - // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) 
- Constant *MaskC = ConstantInt::get(X->getType(), (-*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_EQ, And, Zero); - } + if (Pred == ICmpInst::ICMP_ULT && C->isNegatedPowerOf2()) { // If C is a negative power-of-2 (high-bit mask): // (trunc X) u< C --> (X & C) != C (are any masked-high-bits clear?) - if (C->isNegatedPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); } - if (Pred == ICmpInst::ICMP_UGT) { - // If C is a low-bit-mask (C+1 is a power-of-2): - // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) - if (C->isMask()) { - Constant *MaskC = ConstantInt::get(X->getType(), (~*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } + if (Pred == ICmpInst::ICMP_UGT && (~*C).isPowerOf2()) { // If C is not-of-power-of-2 (one clear bit): // (trunc X) u> C --> (X & (C+1)) == C+1 (are all masked-high-bits set?) - if ((~*C).isPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); } return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 72e1b21e8d49..20c75188ec9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -319,6 +319,7 @@ private: Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt); Instruction *foldCastedBitwiseLogic(BinaryOperator &I); + Instruction *foldBinopOfSextBoolToSelect(BinaryOperator &I); Instruction *narrowBinOp(TruncInst &Trunc); Instruction *narrowMaskedBinOp(BinaryOperator &And); Instruction *narrowMathIfNoOverflow(BinaryOperator &I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp index 7dc516c6fdc3..42ba4a34a5a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -403,7 +403,7 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) { NonNegatedOps.emplace_back(Op); // Just record which operand that was. } assert((NegatedOps.size() + NonNegatedOps.size()) == 2 && - "Internal consistency sanity check."); + "Internal consistency check failed."); // Did we manage to sink negation into both of the operands? if (NegatedOps.size() == 2) // Then we get to keep the `add`! 
return Builder.CreateAdd(NegatedOps[0], NegatedOps[1], diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 4a1e82ae9c1d..518d3952dce5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -246,12 +246,16 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, static unsigned getSelectFoldableOperands(BinaryOperator *I) { switch (I->getOpcode()) { case Instruction::Add: + case Instruction::FAdd: case Instruction::Mul: + case Instruction::FMul: case Instruction::And: case Instruction::Or: case Instruction::Xor: return 3; // Can fold through either operand. case Instruction::Sub: // Can only fold on the amount subtracted. + case Instruction::FSub: + case Instruction::FDiv: // Can only fold on the divisor amount. case Instruction::Shl: // Can only fold on the shift amount. case Instruction::LShr: case Instruction::AShr: diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 47b6dcb67a78..1f81624f79e7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -967,6 +967,29 @@ Value *InstCombinerImpl::dyn_castNegVal(Value *V) const { return nullptr; } +/// A binop with a constant operand and a sign-extended boolean operand may be +/// converted into a select of constants by applying the binary operation to +/// the constant with the two possible values of the extended boolean (0 or -1). +Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { + // TODO: Handle non-commutative binop (constant is operand 0). + // TODO: Handle zext. + // TODO: Peek through 'not' of cast. + Value *BO0 = BO.getOperand(0); + Value *BO1 = BO.getOperand(1); + Value *X; + Constant *C; + if (!match(BO0, m_SExt(m_Value(X))) || !match(BO1, m_ImmConstant(C)) || + !X->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + // bo (sext i1 X), C --> select X, (bo -1, C), (bo 0, C) + Constant *Ones = ConstantInt::getAllOnesValue(BO.getType()); + Constant *Zero = ConstantInt::getNullValue(BO.getType()); + Constant *TVal = ConstantExpr::get(BO.getOpcode(), Ones, C); + Constant *FVal = ConstantExpr::get(BO.getOpcode(), Zero, C); + return SelectInst::Create(X, TVal, FVal); +} + static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, InstCombiner::BuilderTy &Builder) { if (auto *Cast = dyn_cast<CastInst>(&I)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b56329ad76ae..bd2dc8d639fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file is a part of AddressSanitizer, an address sanity checker. +// This file is a part of AddressSanitizer, an address basic correctness +// checker. 
// Details of the algorithm: // https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm // diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 62c265e40dab..8d3bc1383e96 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// // /// \file -/// This file is a part of HWAddressSanitizer, an address sanity checker -/// based on tagged addressing. +/// This file is a part of HWAddressSanitizer, an address basic correctness +/// checker based on tagged addressing. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 36a66e096382..d1d3b8ffdf7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -64,10 +64,10 @@ cl::opt<bool> DoHashBasedCounterSplit( cl::desc("Rename counter variable of a comdat function based on cfg hash"), cl::init(true)); -cl::opt<bool> RuntimeCounterRelocation( - "runtime-counter-relocation", - cl::desc("Enable relocating counters at runtime."), - cl::init(false)); +cl::opt<bool> + RuntimeCounterRelocation("runtime-counter-relocation", + cl::desc("Enable relocating counters at runtime."), + cl::init(false)); cl::opt<bool> ValueProfileStaticAlloc( "vp-static-alloc", @@ -331,8 +331,9 @@ private: // Check whether the loop satisfies the basic conditions needed to perform // Counter Promotions. - bool isPromotionPossible(Loop *LP, - const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { + bool + isPromotionPossible(Loop *LP, + const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { // We can't insert into a catchswitch. 
if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) { return isa<CatchSwitchInst>(Exit->getTerminator()); @@ -421,13 +422,13 @@ PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { } char InstrProfilingLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_BEGIN(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_END(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", false, + false) ModulePass * llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options, @@ -634,13 +635,9 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { GlobalVariable *Name = Ind->getName(); uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); uint64_t Index = Ind->getIndex()->getZExtValue(); - auto It = ProfileDataMap.find(Name); - if (It == ProfileDataMap.end()) { - PerFunctionProfileData PD; - PD.NumValueSites[ValueKind] = Index + 1; - ProfileDataMap[Name] = PD; - } else if (It->second.NumValueSites[ValueKind] <= Index) - It->second.NumValueSites[ValueKind] = Index + 1; + auto &PD = ProfileDataMap[Name]; + PD.NumValueSites[ValueKind] = + std::max(PD.NumValueSites[ValueKind], (uint32_t)(Index + 1)); } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { @@ -703,14 +700,15 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { LoadInst *LI = dyn_cast<LoadInst>(&I); if (!LI) { IRBuilder<> Builder(&I); - GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName()); + GlobalVariable *Bias = + M->getGlobalVariable(getInstrProfCounterBiasVarName()); if (!Bias) { // Compiler must define this variable when runtime counter relocation // is being used. Runtime has a weak external reference that is used // to check whether that's the case or not. - Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, - Constant::getNullValue(Int64Ty), - getInstrProfCounterBiasVarName()); + Bias = new GlobalVariable( + *M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, + Constant::getNullValue(Int64Ty), getInstrProfCounterBiasVarName()); Bias->setVisibility(GlobalVariable::HiddenVisibility); // A definition that's weak (linkonce_odr) without being in a COMDAT // section wouldn't lead to link errors, but it would lead to a dead @@ -839,8 +837,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { return false; // Use linker script magic to get data/cnts/name start/end. 
if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || - TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || - TT.isOSWindows()) + TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || TT.isOSWindows()) return false; return true; @@ -849,13 +846,9 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { GlobalVariable * InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalVariable *NamePtr = Inc->getName(); - auto It = ProfileDataMap.find(NamePtr); - PerFunctionProfileData PD; - if (It != ProfileDataMap.end()) { - if (It->second.RegionCounters) - return It->second.RegionCounters; - PD = It->second; - } + auto &PD = ProfileDataMap[NamePtr]; + if (PD.RegionCounters) + return PD.RegionCounters; // Match the linkage and visibility of the name global. Function *Fn = Inc->getParent()->getParent(); @@ -922,6 +915,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setAlignment(Align(8)); MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); + PD.RegionCounters = CounterPtr; auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -1000,9 +994,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(Data); Data->setLinkage(Linkage); - PD.RegionCounters = CounterPtr; PD.DataVar = Data; - ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. CompilerUsedVars.push_back(Data); @@ -1013,7 +1005,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // Collect the referenced names to be used by emitNameData. ReferencedNames.push_back(NamePtr); - return CounterPtr; + return PD.RegionCounters; } void InstrProfiling::emitVNodes() { @@ -1078,8 +1070,8 @@ void InstrProfiling::emitNameData() { } auto &Ctx = M->getContext(); - auto *NamesVal = ConstantDataArray::getString( - Ctx, StringRef(CompressedNameStr), false); + auto *NamesVal = + ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); NamesVar = new GlobalVariable(*M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index f98e39d751f4..180012198c42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -110,7 +110,7 @@ namespace { /// the module. struct ThreadSanitizer { ThreadSanitizer() { - // Sanity check options and warn user. + // Check options and warn user. if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) { errs() << "warning: Option -tsan-compound-read-before-write has no effect " diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 74e4eb07b219..4921209f041b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -94,11 +94,9 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, return false; } else if (const auto *CS = dyn_cast<CallBase>(Inst)) { // For calls, just check the arguments (and not the callee operand). 
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) { - const Value *Op = *OI; + for (const Value *Op : CS->args()) if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; - } return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // Special-case stores, because we don't care about the stored value, just diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index ca9567dc7ac8..a3fd97079b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -52,6 +52,11 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" +static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned( + "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, + cl::desc("Enables canonicalization of signed relational predicates to " + "unsigned (e.g. sgt => ugt)")); + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -64,7 +69,8 @@ STATISTIC(NumSDivSRemsNarrowed, STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivURemsNarrowed, "Number of udivs/urems whose width was decreased"); -STATISTIC(NumAShrs, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsRemoved, "Number of ashr removed"); STATISTIC(NumSRems, "Number of srem converted to urem"); STATISTIC(NumSExt, "Number of sext converted to zext"); STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned"); @@ -297,6 +303,9 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { + if (!CanonicalizeICmpPredicatesToUnsigned) + return false; + // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -376,13 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); - APInt Low = - APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits()); - APInt High = - APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits()); - - SwitchInst::CaseIt CI = SI->case_begin(); - for (auto CE = SI->case_end(); CI != CE;) { + for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); LazyValueInfo::Tristate State = LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, @@ -415,28 +418,9 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, break; } - // Get Lower/Upper bound from switch cases. - Low = APIntOps::smin(Case->getValue(), Low); - High = APIntOps::smax(Case->getValue(), High); - // Increment the case iterator since we didn't delete it. ++CI; } - - // Try to simplify default case as unreachable - if (CI == SI->case_end() && SI->getNumCases() != 0 && - !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) { - const ConstantRange SIRange = - LVI->getConstantRange(SI->getCondition(), SI); - - // If the numbered switch cases cover the entire range of the condition, - // then the default case is not reachable. 
- if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High && - SI->getNumCases() == High - Low + 1) { - createUnreachableSwitchDefault(SI, &DTU); - Changed = true; - } - } } if (Changed) @@ -688,7 +672,7 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) { ArgNo++; } - assert(ArgNo == CB.arg_size() && "sanity check"); + assert(ArgNo == CB.arg_size() && "Call arguments not processed correctly."); if (ArgNos.empty()) return Changed; @@ -954,10 +938,22 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { if (SDI->getType()->isVectorTy()) return false; + ConstantRange LRange = LVI->getConstantRange(SDI->getOperand(0), SDI); + unsigned OrigWidth = SDI->getType()->getIntegerBitWidth(); + ConstantRange NegOneOrZero = + ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1)); + if (NegOneOrZero.contains(LRange)) { + // ashr of -1 or 0 never changes the value, so drop the whole instruction + ++NumAShrsRemoved; + SDI->replaceAllUsesWith(SDI->getOperand(0)); + SDI->eraseFromParent(); + return true; + } + if (!isNonNegative(SDI->getOperand(0), LVI, SDI)) return false; - ++NumAShrs; + ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); BO->setDebugLoc(SDI->getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a8ec8bb97970..e0d3a6accadd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -159,52 +159,22 @@ static cl::opt<unsigned> MemorySSAPathCheckLimit( cl::desc("The maximum number of blocks to check when trying to prove that " "all paths to an exit go through a killing block (default = 50)")); +// This flags allows or disallows DSE to optimize MemorySSA during its +// traversal. Note that DSE optimizing MemorySSA may impact other passes +// downstream of the DSE invocation and can lead to issues not being +// reproducible in isolation (i.e. when MemorySSA is built from scratch). In +// those cases, the flag can be used to check if DSE's MemorySSA optimizations +// impact follow-up passes. +static cl::opt<bool> + OptimizeMemorySSA("dse-optimize-memoryssa", cl::init(true), cl::Hidden, + cl::desc("Allow DSE to optimize memory accesses.")); + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// Does this instruction write some memory? This only returns true for things -/// that we can analyze with other helpers below. 
-static bool hasAnalyzableMemoryWrite(Instruction *I, - const TargetLibraryInfo &TLI) { - if (isa<StoreInst>(I)) - return true; - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::init_trampoline: - case Intrinsic::lifetime_end: - case Intrinsic::masked_store: - return true; - } - } - if (auto *CB = dyn_cast<CallBase>(I)) { - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strcpy: - case LibFunc_strncpy: - case LibFunc_strcat: - case LibFunc_strncat: - return true; - default: - return false; - } - } - } - return false; -} - /// If the value of this instruction and the memory it writes to is unused, may /// we delete this instruction? static bool isRemovable(Instruction *I) { @@ -214,7 +184,7 @@ static bool isRemovable(Instruction *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate"); + default: llvm_unreachable("Does not have LocForWrite"); case Intrinsic::lifetime_end: // Never remove dead lifetime_end's, e.g. because it is followed by a // free. @@ -296,6 +266,7 @@ enum OverwriteResult { OW_End, OW_PartialEarlierWithFullLater, OW_MaybePartial, + OW_None, OW_Unknown }; @@ -841,7 +812,7 @@ struct DSEState { /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. - DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs; + MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs; // Class contains self-reference, make sure it's not copied/moved. DSEState(const DSEState &) = delete; @@ -889,6 +860,7 @@ struct DSEState { /// Return OW_MaybePartial if \p KillingI does not completely overwrite /// \p DeadI, but they both write to the same underlying object. In that /// case, use isPartialOverwrite to check if \p KillingI partially overwrites + /// \p DeadI. Returns 'OR_None' if \p KillingI is known to not overwrite the /// \p DeadI. Returns 'OW_Unknown' if nothing can be determined. OverwriteResult isOverwrite(const Instruction *KillingI, const Instruction *DeadI, @@ -951,8 +923,16 @@ struct DSEState { // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. - if (DeadUndObj != KillingUndObj) + if (DeadUndObj != KillingUndObj) { + // Non aliasing stores to different objects don't overlap. Note that + // if the killing store is known to overwrite whole object (out of + // bounds access overwrites whole object as well) then it is assumed to + // completely overwrite any store to the same object even if they don't + // actually alias (see next check). + if (AAR == AliasResult::NoAlias) + return OW_None; return OW_Unknown; + } // If the KillingI store is to a recognizable object, get its size. uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); @@ -1006,9 +986,8 @@ struct DSEState { return OW_MaybePartial; } - // Can reach here only if accesses are known not to overlap. There is no - // dedicated code to indicate no overlap so signal "unknown". - return OW_Unknown; + // Can reach here only if accesses are known not to overlap. 
+ return OW_None; } bool isInvisibleToCallerAfterRet(const Value *V) { @@ -1304,6 +1283,15 @@ struct DSEState { Instruction *KillingI = KillingDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " trying to get dominating access\n"); + // Only optimize defining access of KillingDef when directly starting at its + // defining access. The defining access also must only access KillingLoc. At + // the moment we only support instructions with a single write location, so + // it should be sufficient to disable optimizations for instructions that + // also read from memory. + bool CanOptimize = OptimizeMemorySSA && + KillingDef->getDefiningAccess() == StartAccess && + !KillingI->mayReadFromMemory(); + // Find the next clobbering Mod access for DefLoc, starting at StartAccess. Optional<MemoryLocation> CurrentLoc; for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) { @@ -1345,8 +1333,10 @@ struct DSEState { Instruction *CurrentI = CurrentDef->getMemoryInst(); if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), - TLI)) + TLI)) { + CanOptimize = false; continue; + } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing @@ -1380,15 +1370,13 @@ struct DSEState { return None; } - // If Current cannot be analyzed or is not removable, check the next - // candidate. - if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) - continue; - - // If Current does not have an analyzable write location, skip it + // If Current does not have an analyzable write location or is not + // removable, skip it. CurrentLoc = getLocForWriteEx(CurrentI); - if (!CurrentLoc) + if (!CurrentLoc || !isRemovable(CurrentI)) { + CanOptimize = false; continue; + } // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same @@ -1396,6 +1384,7 @@ struct DSEState { if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); WalkerStepLimit -= 1; + CanOptimize = false; continue; } @@ -1403,16 +1392,32 @@ struct DSEState { // If the killing def is a memory terminator (e.g. lifetime.end), check // the next candidate if the current Current does not write the same // underlying object as the terminator. - if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) + if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) { + CanOptimize = false; continue; + } } else { int64_t KillingOffset = 0; int64_t DeadOffset = 0; auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc, KillingOffset, DeadOffset); + if (CanOptimize) { + // CurrentDef is the earliest write clobber of KillingDef. Use it as + // optimized access. Do not optimize if CurrentDef is already the + // defining access of KillingDef. + if (CurrentDef != KillingDef->getDefiningAccess() && + (OR == OW_Complete || OR == OW_MaybePartial)) + KillingDef->setOptimized(CurrentDef); + + // Once a may-aliasing def is encountered do not set an optimized + // access. + if (OR != OW_None) + CanOptimize = false; + } + // If Current does not write to the same object as KillingDef, check // the next candidate. - if (OR == OW_Unknown) + if (OR == OW_Unknown || OR == OW_None) continue; else if (OR == OW_MaybePartial) { // If KillingDef only partially overwrites Current, check the next @@ -1421,6 +1426,7 @@ struct DSEState { // which are less likely to be removable in the end. if (PartialLimit <= 1) { WalkerStepLimit -= 1; + LLVM_DEBUG(dbgs() << " ... 
reached partial limit ... continue with next access\n"); continue; } PartialLimit -= 1; @@ -1922,7 +1928,14 @@ struct DSEState { if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || !isRemovable(Def->getMemoryInst())) continue; - auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); + MemoryDef *UpperDef; + // To conserve compile-time, we avoid walking to the next clobbering def. + // Instead, we just try to get the optimized access, if it exists. DSE + // will try to optimize defs during the earlier traversal. + if (Def->isOptimized()) + UpperDef = dyn_cast<MemoryDef>(Def->getOptimized()); + else + UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index ae2fe2767074..7001d330fce0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1951,7 +1951,6 @@ bool IndVarSimplify::run(Loop *L) { // using it. if (!DisableLFTR) { BasicBlock *PreHeader = L->getLoopPreheader(); - BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1987,7 +1986,7 @@ bool IndVarSimplify::run(Loop *L) { // Avoid high cost expansions. Note: This heuristic is questionable in // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, - TTI, PreHeaderBR)) + TTI, PreHeader->getTerminator())) continue; // Check preconditions for proper SCEVExpander operation. SCEV does not diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index bf714d167670..6f97f3e93123 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -486,7 +486,7 @@ bool LoopInvariantCodeMotion::runOnLoop( // Check that neither this loop nor its parent have had LCSSA broken. LICM is // specifically moving instructions across the loop boundary and so it is - // especially in need of sanity checking here. + // especially in need of basic functional correctness checking here. assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!"); assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) && "Parent loop not left in LCSSA form after LICM!"); @@ -1860,6 +1860,7 @@ class LoopPromoter : public LoadAndStorePromoter { bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; + bool CanInsertStoresInExitBlocks; // We're about to add a use of V in a loop exit block. 
Insert an LCSSA phi // (if legal) if doing so would add an out-of-loop use to an instruction @@ -1886,12 +1887,13 @@ public: SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, - ICFLoopSafetyInfo &SafetyInfo) + ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), - SafetyInfo(SafetyInfo) {} + SafetyInfo(SafetyInfo), + CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1903,7 +1905,7 @@ public: return PointerMustAliases.count(Ptr); } - void doExtraRewritesBeforeFinalDeletion() override { + void insertStoresInLoopExitBlocks() { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -1937,10 +1939,21 @@ public: } } + void doExtraRewritesBeforeFinalDeletion() override { + if (CanInsertStoresInExitBlocks) + insertStoresInLoopExitBlocks(); + } + void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); MSSAU->removeMemoryAccess(I); } + + bool shouldDelete(Instruction *I) const override { + if (isa<StoreInst>(I)) + return CanInsertStoresInExitBlocks; + return true; + } }; bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, @@ -2039,6 +2052,7 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool FoundLoadToPromote = false; SmallVector<Instruction *, 64> LoopUses; @@ -2067,16 +2081,11 @@ bool llvm::promoteLoopAccessesToScalars( IsKnownThreadLocalObject = !isa<AllocaInst>(Object); } - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in + // Check that all accesses to pointers in the aliass set use the same type. + // We cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. + Type *AccessTy = nullptr; for (Value *ASIV : PointerMustAliases) { - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. - if (SomePtr->getType() != ASIV->getType()) - return false; - for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. Instruction *UI = dyn_cast<Instruction>(U); @@ -2091,6 +2100,7 @@ bool llvm::promoteLoopAccessesToScalars( SawUnorderedAtomic |= Load->isAtomic(); SawNotAtomic |= !Load->isAtomic(); + FoundLoadToPromote = true; Align InstAlignment = Load->getAlign(); @@ -2153,6 +2163,11 @@ bool llvm::promoteLoopAccessesToScalars( } else return false; // Not a load or store. + if (!AccessTy) + AccessTy = getLoadStoreType(UI); + else if (AccessTy != getLoadStoreType(UI)) + return false; + // Merge the AA tags. if (LoopUses.empty()) { // On the first load/store, just take its AA tags. @@ -2175,9 +2190,7 @@ bool llvm::promoteLoopAccessesToScalars( // If we're inserting an atomic load in the preheader, we must be able to // lower it. 
We're only guaranteed to be able to lower naturally aligned // atomics. - auto *SomePtrElemType = SomePtr->getType()->getPointerElementType(); - if (SawUnorderedAtomic && - Alignment < MDL.getTypeStoreSize(SomePtrElemType)) + if (SawUnorderedAtomic && Alignment < MDL.getTypeStoreSize(AccessTy)) return false; // If we couldn't prove we can hoist the load, bail. @@ -2199,13 +2212,20 @@ bool llvm::promoteLoopAccessesToScalars( } } - // If we've still failed to prove we can sink the store, give up. - if (!SafeToInsertStore) + // If we've still failed to prove we can sink the store, hoist the load + // only, if possible. + if (!SafeToInsertStore && !FoundLoadToPromote) + // If we cannot hoist the load either, give up. return false; - // Otherwise, this is safe to promote, lets do it! - LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr - << '\n'); + // Lets do the promotion! + if (SafeToInsertStore) + LLVM_DEBUG(dbgs() << "LICM: Promoting load/store of the value: " << *SomePtr + << '\n'); + else + LLVM_DEBUG(dbgs() << "LICM: Promoting load of the value: " << *SomePtr + << '\n'); + ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) @@ -2224,13 +2244,14 @@ bool llvm::promoteLoopAccessesToScalars( SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, - Alignment, SawUnorderedAtomic, AATags, *SafetyInfo); + Alignment, SawUnorderedAtomic, AATags, *SafetyInfo, + SafeToInsertStore); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. LoadInst *PreheaderLoad = new LoadInst( - SomePtr->getType()->getPointerElementType(), SomePtr, - SomePtr->getName() + ".promoted", Preheader->getTerminator()); + AccessTy, SomePtr, SomePtr->getName() + ".promoted", + Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 3df4cfe8e4c1..6c783848432b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -49,9 +49,17 @@ void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, LPMUpdater &>::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { - for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) { - auto *P = LoopPasses[Idx].get(); - P->printPipeline(OS, MapClassName2PassName); + assert(LoopPasses.size() + LoopNestPasses.size() == IsLoopNestPass.size()); + + unsigned IdxLP = 0, IdxLNP = 0; + for (unsigned Idx = 0, Size = IsLoopNestPass.size(); Idx != Size; ++Idx) { + if (IsLoopNestPass[Idx]) { + auto *P = LoopNestPasses[IdxLNP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } else { + auto *P = LoopPasses[IdxLP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } if (Idx + 1 < Size) OS << ","; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index a87843d658a9..728d63fe2847 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ 
-256,8 +256,8 @@ private: } } - // Sanity check: amount of dead and live loop blocks should match the total - // number of blocks in loop. + // Amount of dead and live loop blocks should match the total number of + // blocks in loop. assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && "Malformed block sets?"); @@ -305,7 +305,6 @@ private: BlocksInLoopAfterFolding.insert(BB); } - // Sanity check: header must be in loop. assert(BlocksInLoopAfterFolding.count(L.getHeader()) && "Header not in loop?"); assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() && diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 67702520511b..39c8b65968aa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -806,28 +806,27 @@ static Optional<unsigned> shouldFullUnroll( ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + assert(FullUnrollTripCount && "should be non-zero!"); - if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { - // When computing the unrolled size, note that BEInsns are not replicated - // like the rest of the loop body. - if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { - return FullUnrollTripCount; + if (FullUnrollTripCount > UP.FullUnrollMaxCount) + return None; - } else { - // The loop isn't that small, but we still can fully unroll it if that - // helps to remove a significant number of instructions. - // To check that, run additional analysis on the loop. - if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, EphValues, TTI, - UP.Threshold * UP.MaxPercentThresholdBoost / 100, - UP.MaxIterationsCountToAnalyze)) { - unsigned Boost = - getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); - if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - return FullUnrollTripCount; - } - } - } + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. + if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) + return FullUnrollTripCount; + + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. 
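          // Illustrative arithmetic (hypothetical numbers, not from the patch):
          // with UP.Threshold = 300 and UP.MaxPercentThresholdBoost = 400, the
          // analysis below runs with a budget of 300 * 400 / 100 = 1200. If it
          // then reports a boosting factor of 200, full unrolling is accepted
          // only when Cost->UnrolledCost < 300 * 200 / 100 = 600.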
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( + L, FullUnrollTripCount, DT, SE, EphValues, TTI, + UP.Threshold * UP.MaxPercentThresholdBoost / 100, + UP.MaxIterationsCountToAnalyze)) { + unsigned Boost = + getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); + if (Cost->UnrolledCost < UP.Threshold * Boost / 100) + return FullUnrollTripCount; } return None; } @@ -837,51 +836,48 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + if (!TripCount) + return None; + + if (!UP.Partial) { + LLVM_DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + return 0; + } unsigned count = UP.Count; - if (TripCount) { - if (!UP.Partial) { - LLVM_DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - count = 0; - return count; - } - if (count == 0) - count = TripCount; - if (UP.PartialThreshold != NoThreshold) { - // Reduce unroll count to be modulo of TripCount for partial unrolling. - if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / - (LoopSize - UP.BEInsns); - if (count > UP.MaxCount) - count = UP.MaxCount; - while (count != 0 && TripCount % count != 0) - count--; - if (UP.AllowRemainder && count <= 1) { - // If there is no Count that is modulo of TripCount, set Count to - // largest power-of-two factor that satisfies the threshold limit. - // As we'll create fixup loop, do the type of unrolling only if - // remainder loop is allowed. - count = UP.DefaultUnrollRuntimeCount; - while (count != 0 && - UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count >>= 1; - } - if (count < 2) { - count = 0; - } - } else { - count = TripCount; - } + if (count == 0) + count = TripCount; + if (UP.PartialThreshold != NoThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / + (LoopSize - UP.BEInsns); if (count > UP.MaxCount) count = UP.MaxCount; - - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); - - return count; + while (count != 0 && TripCount % count != 0) + count--; + if (UP.AllowRemainder && count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. + // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + count = UP.DefaultUnrollRuntimeCount; + while (count != 0 && + UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count >>= 1; + } + if (count < 2) { + count = 0; + } + } else { + count = TripCount; } + if (count > UP.MaxCount) + count = UP.MaxCount; - // if didn't return until here, should continue to other priorties - return None; + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); + + return count; } // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. 
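Below is a minimal standalone sketch of the count-selection logic in the new shouldPartialUnroll body above. The helper names, the simplified signature, and the size formula (LoopSize - BEInsns) * count + BEInsns used in place of UCE.getUnrolledLoopSize(UP, count) are illustrative approximations, not part of the patch; it also assumes LoopSize > BEInsns.

#include <algorithm>

// Approximate unrolled size: backedge instructions (BEInsns) are not
// replicated with the rest of the loop body.
static unsigned unrolledSize(unsigned LoopSize, unsigned BEInsns, unsigned C) {
  return (LoopSize - BEInsns) * C + BEInsns;
}

// Returns 0 if partial unrolling is not worthwhile under these parameters.
static unsigned pickPartialUnrollCount(unsigned LoopSize, unsigned TripCount,
                                       unsigned PartialThreshold,
                                       unsigned BEInsns, unsigned MaxCount,
                                       bool AllowRemainder,
                                       unsigned DefaultRuntimeCount) {
  unsigned Count = TripCount;
  // Shrink the count until the unrolled body fits the partial threshold.
  if (unrolledSize(LoopSize, BEInsns, Count) > PartialThreshold)
    Count = (std::max(PartialThreshold, BEInsns + 1) - BEInsns) /
            (LoopSize - BEInsns);
  Count = std::min(Count, MaxCount);
  // Prefer a count that evenly divides the trip count.
  while (Count != 0 && TripCount % Count != 0)
    Count--;
  // Otherwise fall back to the largest power of two under the threshold,
  // which requires a remainder (fixup) loop.
  if (AllowRemainder && Count <= 1) {
    Count = DefaultRuntimeCount;
    while (Count != 0 &&
           unrolledSize(LoopSize, BEInsns, Count) > PartialThreshold)
      Count >>= 1;
  }
  return Count < 2 ? 0 : Count;
}

For example, LoopSize = 12, TripCount = 10, PartialThreshold = 60, BEInsns = 2 gives a first estimate of (60 - 2) / (12 - 2) = 5, which already divides the trip count of 10, so the loop would be partially unrolled by a factor of 5.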
@@ -900,7 +896,6 @@ bool llvm::computeUnrollCount( TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { UnrollCostEstimator UCE(*L, LoopSize); - Optional<unsigned> UnrollFactor; const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; const bool PragmaFullUnroll = hasUnrollFullPragma(L); @@ -926,9 +921,8 @@ bool llvm::computeUnrollCount( // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. // 2nd priority is unroll count set by pragma. - UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, + UCE, UP)) { UP.Count = *UnrollFactor; if (UserUnrollCount || (PragmaCount > 0)) { @@ -948,11 +942,20 @@ bool llvm::computeUnrollCount( } } - // 3rd priority is full unroll count. - // Full unroll makes sense only when TripCount or its upper bound could be - // statically calculated. - // Also we need to check if we exceed FullUnrollMaxCount. + // 3rd priority is exact full unrolling. This will eliminate all copies + // of some exit test. + UP.Count = 0; + if (TripCount) { + UP.Count = TripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + TripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = false; + return ExplicitUnroll; + } + } + // 4th priority is bounded unrolling. // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -961,37 +964,21 @@ bool llvm::computeUnrollCount( // number of loop tests goes up which may end up being worse on targets with // constrained branch predictor resources so is controlled by an option.) // In addition we only unroll small upper bounds. - unsigned FullUnrollMaxTripCount = MaxTripCount; - if (!(UP.UpperBound || MaxOrZero) || - FullUnrollMaxTripCount > UnrollMaxUpperBound) - FullUnrollMaxTripCount = 0; - - // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only - // compute the former when the latter is zero. - unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) && - "ExtractTripCount and UnrollByMaxCount cannot both be non zero."); - - unsigned FullUnrollTripCount = - ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; - UP.Count = FullUnrollTripCount; - - UnrollFactor = - shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP); - - // if shouldFullUnroll can do the unrolling, some side parameteres should be - // set - if (UnrollFactor) { - UP.Count = *UnrollFactor; - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - TripCount = FullUnrollTripCount; - TripMultiple = UP.UpperBound ? 1 : TripMultiple; - return ExplicitUnroll; - } else { - UP.Count = FullUnrollTripCount; + // Note that the cost of bounded unrolling is always strictly greater than + // cost of exact full unrolling. As such, if we have an exact count and + // found it unprofitable, we'll never chose to bounded unroll. + if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && + MaxTripCount <= UnrollMaxUpperBound) { + UP.Count = MaxTripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + MaxTripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = true; + return ExplicitUnroll; + } } - // 4th priority is loop peeling. + // 5th priority is loop peeling. 
computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; @@ -1004,11 +991,9 @@ bool llvm::computeUnrollCount( if (TripCount) UP.Partial |= ExplicitUnroll; - // 5th priority is partial unrolling. + // 6th priority is partial unrolling. // Try partial unroll only when TripCount could be statically calculated. - UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) { UP.Count = *UnrollFactor; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && @@ -1049,7 +1034,7 @@ bool llvm::computeUnrollCount( "because loop has a runtime trip count."; }); - // 6th priority is runtime unrolling. + // 7th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. if (hasRuntimeUnrollDisablePragma(L)) { UP.Count = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index b0fb8daaba8f..c354fa177a60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -494,7 +494,7 @@ static bool LinearizeExprTree(Instruction *I, SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG - SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme. + SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair<Instruction*, APInt> P = Worklist.pop_back_val(); @@ -2313,11 +2313,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; Ops.reserve(Tree.size()); - for (unsigned i = 0, e = Tree.size(); i != e; ++i) { - RepeatedValue E = Tree[i]; - Ops.append(E.second.getZExtValue(), - ValueEntry(getRank(E.first), E.first)); - } + for (const RepeatedValue &E : Tree) + Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 86d3620c312e..3799d2dd1cf2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -227,8 +227,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, unsigned IterCnt = 0; (void)IterCnt; while (LocalChange) { - assert(IterCnt++ < 1000 && - "Sanity: iterative simplification didn't converge!"); + assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!"); LocalChange = false; // Loop over all of the basic blocks and remove them if they are unneeded. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 6469c899feea..d6d6b1a7fa09 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -235,22 +235,26 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // These dominator edges will be redirected from Pred. 
std::vector<DominatorTree::UpdateType> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB)); + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenSuccs; SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB), succ_end(PredBB)); - Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1); + Updates.reserve(Updates.size() + 2 * succ_size(BB) + 1); // Add insert edges first. Experimentally, for the particular case of two // blocks that can be merged, with a single successor and single predecessor // respectively, it is beneficial to have all insert updates first. Deleting // edges first may lead to unreachable blocks, followed by inserting edges // making the blocks reachable again. Such DT updates lead to high compile // times. We add inserts before deletes here to reduce compile time. - for (BasicBlock *SuccOfBB : SuccsOfBB) + for (BasicBlock *SuccOfBB : successors(BB)) // This successor of BB may already be a PredBB's successor. if (!SuccsOfPredBB.contains(SuccOfBB)) - Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); - for (BasicBlock *SuccOfBB : SuccsOfBB) - Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); + SeenSuccs.clear(); + for (BasicBlock *SuccOfBB : successors(BB)) + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); Updates.push_back({DominatorTree::Delete, PredBB, BB}); } @@ -804,14 +808,14 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt, if (DTU) { SmallVector<DominatorTree::UpdateType, 8> Updates; // Old dominates New. New node dominates all other nodes dominated by Old. - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New), - succ_end(New)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld; Updates.push_back({DominatorTree::Insert, Old, New}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfOld.size()); - for (BasicBlock *UniqueSuccessorOfOld : UniqueSuccessorsOfOld) { - Updates.push_back({DominatorTree::Insert, New, UniqueSuccessorOfOld}); - Updates.push_back({DominatorTree::Delete, Old, UniqueSuccessorOfOld}); - } + Updates.reserve(Updates.size() + 2 * succ_size(New)); + for (BasicBlock *SuccessorOfOld : successors(New)) + if (UniqueSuccessorsOfOld.insert(SuccessorOfOld).second) { + Updates.push_back({DominatorTree::Insert, New, SuccessorOfOld}); + Updates.push_back({DominatorTree::Delete, Old, SuccessorOfOld}); + } DTU->applyUpdates(Updates); } else if (DT) @@ -870,14 +874,14 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt, SmallVector<DominatorTree::UpdateType, 8> DTUpdates; // New dominates Old. The predecessor nodes of the Old node dominate // New node. 
- SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld(pred_begin(New), - pred_end(New)); + SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld; DTUpdates.push_back({DominatorTree::Insert, New, Old}); - DTUpdates.reserve(DTUpdates.size() + 2 * UniquePredecessorsOfOld.size()); - for (BasicBlock *UniquePredecessorOfOld : UniquePredecessorsOfOld) { - DTUpdates.push_back({DominatorTree::Insert, UniquePredecessorOfOld, New}); - DTUpdates.push_back({DominatorTree::Delete, UniquePredecessorOfOld, Old}); - } + DTUpdates.reserve(DTUpdates.size() + 2 * pred_size(New)); + for (BasicBlock *PredecessorOfOld : predecessors(New)) + if (UniquePredecessorsOfOld.insert(PredecessorOfOld).second) { + DTUpdates.push_back({DominatorTree::Insert, PredecessorOfOld, New}); + DTUpdates.push_back({DominatorTree::Delete, PredecessorOfOld, Old}); + } DTU->applyUpdates(DTUpdates); @@ -910,13 +914,14 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, } else { // Split block expects NewBB to have a non-empty set of predecessors. SmallVector<DominatorTree::UpdateType, 8> Updates; - SmallPtrSet<BasicBlock *, 8> UniquePreds(Preds.begin(), Preds.end()); + SmallPtrSet<BasicBlock *, 8> UniquePreds; Updates.push_back({DominatorTree::Insert, NewBB, OldBB}); - Updates.reserve(Updates.size() + 2 * UniquePreds.size()); - for (auto *UniquePred : UniquePreds) { - Updates.push_back({DominatorTree::Insert, UniquePred, NewBB}); - Updates.push_back({DominatorTree::Delete, UniquePred, OldBB}); - } + Updates.reserve(Updates.size() + 2 * Preds.size()); + for (auto *Pred : Preds) + if (UniquePreds.insert(Pred).second) { + Updates.push_back({DominatorTree::Insert, Pred, NewBB}); + Updates.push_back({DominatorTree::Delete, Pred, OldBB}); + } DTU->applyUpdates(Updates); } } else if (DT) { @@ -1376,14 +1381,14 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore, BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); if (DTU) { - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail), - succ_end(Tail)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead; Updates.push_back({DominatorTree::Insert, Head, Tail}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfHead.size()); - for (BasicBlock *UniqueSuccessorOfHead : UniqueSuccessorsOfHead) { - Updates.push_back({DominatorTree::Insert, Tail, UniqueSuccessorOfHead}); - Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead}); - } + Updates.reserve(Updates.size() + 2 * succ_size(Tail)); + for (BasicBlock *SuccessorOfHead : successors(Tail)) + if (UniqueSuccessorsOfHead.insert(SuccessorOfHead).second) { + Updates.push_back({DominatorTree::Insert, Tail, SuccessorOfHead}); + Updates.push_back({DominatorTree::Delete, Head, SuccessorOfHead}); + } } Instruction *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 957935398972..580cfd80141e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -452,18 +452,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setWillReturn(F); + LLVM_FALLTHROUGH; + case LibFunc_memcpy_chk: Changed |= setDoesNotThrow(F); Changed |= setOnlyAccessesArgMemory(F); - 
Changed |= setWillReturn(F); Changed |= setDoesNotAlias(F, 0); Changed |= setOnlyWritesMemory(F, 0); Changed |= setDoesNotAlias(F, 1); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_memcpy_chk: - Changed |= setDoesNotThrow(F); - return Changed; case LibFunc_memalign: Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetNoUndef(F); @@ -1018,9 +1017,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); return Changed; - // TODO: add LibFunc entries for: - // case LibFunc_memset_pattern4: - // case LibFunc_memset_pattern8: + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); @@ -1029,10 +1027,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memset: - Changed |= setOnlyAccessesArgMemory(F); Changed |= setWillReturn(F); - Changed |= setDoesNotThrow(F); + LLVM_FALLTHROUGH; + case LibFunc_memset_chk: + Changed |= setOnlyAccessesArgMemory(F); Changed |= setOnlyWritesMemory(F, 0); + Changed |= setDoesNotThrow(F); return Changed; // int __nvvm_reflect(const char *) case LibFunc_nvvm_reflect: diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp index 200deca4b317..57c273a0e3c5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -135,10 +135,18 @@ std::unique_ptr<Module> llvm::CloneModule( // Similarly, copy over function bodies now... // for (const Function &I : M) { - if (I.isDeclaration()) + Function *F = cast<Function>(VMap[&I]); + + if (I.isDeclaration()) { + // Copy over metadata for declarations since we're not doing it below in + // CloneFunctionInto(). + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + I.getAllMetadata(MDs); + for (auto MD : MDs) + F->addMetadata(MD.first, *MapMetadata(MD.second, VMap)); continue; + } - Function *F = cast<Function>(VMap[&I]); if (!ShouldCloneDefinition(&I)) { // Skip after setting the correct linkage for an external reference. 
F->setLinkage(GlobalValue::ExternalLinkage); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 4dbcbf80d3da..7c310f16d46e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -74,7 +74,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, {}, {}, nullptr, "widenable_cond"); CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, "exiplicit_guard_cond")); - assert(isWidenableBranch(CheckBI) && "sanity check"); + assert(isWidenableBranch(CheckBI) && "Branch must be widenable."); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index f4776589910f..997667810580 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1218,10 +1218,9 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { if (!RI || !isa<CallBase>(RI->getOperand(0))) continue; auto *RetVal = cast<CallBase>(RI->getOperand(0)); - // Sanity check that the cloned RetVal exists and is a call, otherwise we - // cannot add the attributes on the cloned RetVal. - // Simplification during inlining could have transformed the cloned - // instruction. + // Check that the cloned RetVal exists and is a call, otherwise we cannot + // add the attributes on the cloned RetVal. Simplification during inlining + // could have transformed the cloned instruction. auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal)); if (!NewRetVal) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 74ab37fadf36..ec926b1f5a94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -529,8 +529,8 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive( std::function<void(Value *)> AboutToDeleteCallback) { unsigned S = 0, E = DeadInsts.size(), Alive = 0; for (; S != E; ++S) { - auto *I = cast<Instruction>(DeadInsts[S]); - if (!isInstructionTriviallyDead(I)) { + auto *I = dyn_cast<Instruction>(DeadInsts[S]); + if (!I || !isInstructionTriviallyDead(I)) { DeadInsts[S] = nullptr; ++Alive; } @@ -760,15 +760,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> PredsOfPredBB(pred_begin(PredBB), - pred_end(PredBB)); - Updates.reserve(Updates.size() + 2 * PredsOfPredBB.size() + 1); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 2> SeenPreds; + Updates.reserve(Updates.size() + 2 * pred_size(PredBB) + 1); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) // This predecessor of PredBB may already have DestBB as a successor. 
if (PredOfPredBB != PredBB) - Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) - Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); + SeenPreds.clear(); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); Updates.push_back({DominatorTree::Delete, PredBB, DestBB}); } @@ -1096,16 +1099,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenPreds; // All predecessors of BB will be moved to Succ. - SmallPtrSet<BasicBlock *, 8> PredsOfBB(pred_begin(BB), pred_end(BB)); SmallPtrSet<BasicBlock *, 8> PredsOfSucc(pred_begin(Succ), pred_end(Succ)); - Updates.reserve(Updates.size() + 2 * PredsOfBB.size() + 1); - for (auto *PredOfBB : PredsOfBB) + Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1); + for (auto *PredOfBB : predecessors(BB)) // This predecessor of BB may already have Succ as a successor. if (!PredsOfSucc.contains(PredOfBB)) - Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); - for (auto *PredOfBB : PredsOfBB) - Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); + SeenPreds.clear(); + for (auto *PredOfBB : predecessors(BB)) + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); Updates.push_back({DominatorTree::Delete, BB, Succ}); } @@ -2190,26 +2197,6 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); } -void llvm::createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - auto *BB = Switch->getParent(); - auto *OrigDefaultBlock = Switch->getDefaultDest(); - OrigDefaultBlock->removePredecessor(BB); - BasicBlock *NewDefaultBlock = BasicBlock::Create( - BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), - OrigDefaultBlock); - new UnreachableInst(Switch->getContext(), NewDefaultBlock); - Switch->setDefaultDest(&*NewDefaultBlock); - if (DTU) { - SmallVector<DominatorTree::UpdateType, 2> Updates; - Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); - if (!is_contained(successors(BB), OrigDefaultBlock)) - Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); - DTU->applyUpdates(Updates); - } -} - BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, BasicBlock *UnwindEdge, DomTreeUpdater *DTU) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a92cb6a313d3..bb719a499a4c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -623,15 +623,13 @@ bool llvm::UnrollRuntimeLoopRemainder( if (!SE) return false; - // Only unroll loops with a computable trip count, and the trip count needs - // to be an int value (allowing a pointer type is a TODO item). + // Only unroll loops with a computable trip count. 
// We calculate the backedge count by using getExitCount on the Latch block, // which is proven to be the only exiting block in this loop. This is same as // calculating getBackedgeTakenCount on the loop (which computes SCEV for all // exiting blocks). const SCEV *BECountSC = SE->getExitCount(L, Latch); - if (isa<SCEVCouldNotCompute>(BECountSC) || - !BECountSC->getType()->isIntegerTy()) { + if (isa<SCEVCouldNotCompute>(BECountSC)) { LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index 68572d479742..c8e42acdffb3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, return Builder.CreateOrReduce(Src); case RecurKind::Xor: return Builder.CreateXorReduce(Src); + case RecurKind::FMulAdd: case RecurKind::FAdd: return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), Src); @@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { - assert(Desc.getRecurrenceKind() == RecurKind::FAdd && + assert((Desc.getRecurrenceKind() == RecurKind::FAdd || + Desc.getRecurrenceKind() == RecurKind::FMulAdd) && "Unexpected reduction kind"); assert(Src->getType()->isVectorTy() && "Expected a vector type"); assert(!Start->getType()->isVectorTy() && "Expected a scalar type"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 5893ce15b129..7d9992176658 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -446,6 +446,9 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (!shouldDelete(User)) + continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp new file mode 100644 index 000000000000..9495e442e0bf --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -0,0 +1,462 @@ +//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a profile inference algorithm. Given an incomplete and +// possibly imprecise block counts, the algorithm reconstructs realistic block +// and edge counts that satisfy flow conservation rules, while minimally modify +// input block counts. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SampleProfileInference.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> + +using namespace llvm; +#define DEBUG_TYPE "sample-profile-inference" + +namespace { + +/// A value indicating an infinite flow/capacity/weight of a block/edge. +/// Not using numeric_limits<int64_t>::max(), as the values can be summed up +/// during the execution. +static constexpr int64_t INF = ((int64_t)1) << 50; + +/// The minimum-cost maximum flow algorithm. +/// +/// The algorithm finds the maximum flow of minimum cost on a given (directed) +/// network using a modified version of the classical Moore-Bellman-Ford +/// approach. The algorithm applies a number of augmentation iterations in which +/// flow is sent along paths of positive capacity from the source to the sink. +/// The worst-case time complexity of the implementation is O(v(f)*m*n), where +/// where m is the number of edges, n is the number of vertices, and v(f) is the +/// value of the maximum flow. However, the observed running time on typical +/// instances is sub-quadratic, that is, o(n^2). +/// +/// The input is a set of edges with specified costs and capacities, and a pair +/// of nodes (source and sink). The output is the flow along each edge of the +/// minimum total cost respecting the given edge capacities. +class MinCostMaxFlow { +public: + // Initialize algorithm's data structures for a network of a given size. + void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) { + Source = SourceNode; + Target = SinkNode; + + Nodes = std::vector<Node>(NodeCount); + Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>()); + } + + // Run the algorithm. + int64_t run() { + // Find an augmenting path and update the flow along the path + size_t AugmentationIters = 0; + while (findAugmentingPath()) { + augmentFlowAlongPath(); + AugmentationIters++; + } + + // Compute the total flow and its cost + int64_t TotalCost = 0; + int64_t TotalFlow = 0; + for (uint64_t Src = 0; Src < Nodes.size(); Src++) { + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) { + TotalCost += Edge.Cost * Edge.Flow; + if (Src == Source) + TotalFlow += Edge.Flow; + } + } + } + LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters + << " iterations with " << TotalFlow << " total flow" + << " of " << TotalCost << " cost\n"); + (void)TotalFlow; + return TotalCost; + } + + /// Adding an edge to the network with a specified capacity and a cost. + /// Multiple edges between a pair of nodes are allowed but self-edges + /// are not supported. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { + assert(Capacity > 0 && "adding an edge of zero capacity"); + assert(Src != Dst && "loop edge are not supported"); + + Edge SrcEdge; + SrcEdge.Dst = Dst; + SrcEdge.Cost = Cost; + SrcEdge.Capacity = Capacity; + SrcEdge.Flow = 0; + SrcEdge.RevEdgeIndex = Edges[Dst].size(); + + Edge DstEdge; + DstEdge.Dst = Src; + DstEdge.Cost = -Cost; + DstEdge.Capacity = 0; + DstEdge.Flow = 0; + DstEdge.RevEdgeIndex = Edges[Src].size(); + + Edges[Src].push_back(SrcEdge); + Edges[Dst].push_back(DstEdge); + } + + /// Adding an edge to the network of infinite capacity and a given cost. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { + addEdge(Src, Dst, INF, Cost); + } + + /// Get the total flow from a given source node. + /// Returns a list of pairs (target node, amount of flow to the target). 
+ const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { + std::vector<std::pair<uint64_t, int64_t>> Flow; + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) + Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); + } + return Flow; + } + + /// Get the total flow between a pair of nodes. + int64_t getFlow(uint64_t Src, uint64_t Dst) const { + int64_t Flow = 0; + for (auto &Edge : Edges[Src]) { + if (Edge.Dst == Dst) { + Flow += Edge.Flow; + } + } + return Flow; + } + + /// A cost of increasing a block's count by one. + static constexpr int64_t AuxCostInc = 10; + /// A cost of decreasing a block's count by one. + static constexpr int64_t AuxCostDec = 20; + /// A cost of increasing a count of zero-weight block by one. + static constexpr int64_t AuxCostIncZero = 11; + /// A cost of increasing the entry block's count by one. + static constexpr int64_t AuxCostIncEntry = 40; + /// A cost of decreasing the entry block's count by one. + static constexpr int64_t AuxCostDecEntry = 10; + /// A cost of taking an unlikely jump. + static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; + +private: + /// Check for existence of an augmenting path with a positive capacity. + bool findAugmentingPath() { + // Initialize data structures + for (auto &Node : Nodes) { + Node.Distance = INF; + Node.ParentNode = uint64_t(-1); + Node.ParentEdgeIndex = uint64_t(-1); + Node.Taken = false; + } + + std::queue<uint64_t> Queue; + Queue.push(Source); + Nodes[Source].Distance = 0; + Nodes[Source].Taken = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + Nodes[Src].Taken = false; + // Although the residual network contains edges with negative costs + // (in particular, backward edges), it can be shown that there are no + // negative-weight cycles and the following two invariants are maintained: + // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V, + // where Dist is the length of the shortest path between two nodes. This + // allows to prune the search-space of the path-finding algorithm using + // the following early-stop criteria: + // -- If we find a path with zero-distance from Source to Target, stop the + // search, as the path is the shortest since Dist[Source, Target] >= 0; + // -- If we have Dist[Source, V] > Dist[Source, Target], then do not + // process node V, as it is guaranteed _not_ to be on a shortest path + // from Source to Target; it follows from inequalities + // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] + // >= Dist[Source, V] + if (Nodes[Target].Distance == 0) + break; + if (Nodes[Src].Distance > Nodes[Target].Distance) + continue; + + // Process adjacent edges + for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) { + auto &Edge = Edges[Src][EdgeIdx]; + if (Edge.Flow < Edge.Capacity) { + uint64_t Dst = Edge.Dst; + int64_t NewDistance = Nodes[Src].Distance + Edge.Cost; + if (Nodes[Dst].Distance > NewDistance) { + // Update the distance and the parent node/edge + Nodes[Dst].Distance = NewDistance; + Nodes[Dst].ParentNode = Src; + Nodes[Dst].ParentEdgeIndex = EdgeIdx; + // Add the node to the queue, if it is not there yet + if (!Nodes[Dst].Taken) { + Queue.push(Dst); + Nodes[Dst].Taken = true; + } + } + } + } + } + + return Nodes[Target].Distance != INF; + } + + /// Update the current flow along the augmenting path. 
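  // (Illustrative note: findAugmentingPath() above leaves a chain of
  // ParentNode/ParentEdgeIndex links from Target back to Source. The
  // augmentation below first walks that chain to find the bottleneck
  // residual capacity min(Capacity - Flow), then pushes that amount forward
  // along every edge of the path and subtracts it from the paired reverse
  // edge, so a later iteration can still cancel the decision if needed.)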
+ void augmentFlowAlongPath() { + // Find path capacity + int64_t PathCapacity = INF; + uint64_t Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow); + Now = Pred; + } + + assert(PathCapacity > 0 && "found incorrect augmenting path"); + + // Update the flow along the path + Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + auto &RevEdge = Edges[Now][Edge.RevEdgeIndex]; + + Edge.Flow += PathCapacity; + RevEdge.Flow -= PathCapacity; + + Now = Pred; + } + } + + /// An node in a flow network. + struct Node { + /// The cost of the cheapest path from the source to the current node. + int64_t Distance; + /// The node preceding the current one in the path. + uint64_t ParentNode; + /// The index of the edge between ParentNode and the current node. + uint64_t ParentEdgeIndex; + /// An indicator of whether the current node is in a queue. + bool Taken; + }; + /// An edge in a flow network. + struct Edge { + /// The cost of the edge. + int64_t Cost; + /// The capacity of the edge. + int64_t Capacity; + /// The current flow on the edge. + int64_t Flow; + /// The destination node of the edge. + uint64_t Dst; + /// The index of the reverse edge between Dst and the current node. + uint64_t RevEdgeIndex; + }; + + /// The set of network nodes. + std::vector<Node> Nodes; + /// The set of network edges. + std::vector<std::vector<Edge>> Edges; + /// Source node of the flow. + uint64_t Source; + /// Target (sink) node of the flow. + uint64_t Target; +}; + +/// Initializing flow network for a given function. +/// +/// Every block is split into three nodes that are responsible for (i) an +/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or +/// reduction of the block weight. +void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + assert(NumBlocks > 1 && "Too few blocks in a function"); + LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n"); + + // Pre-process data: make sure the entry weight is at least 1 + if (Func.Blocks[Func.Entry].Weight == 0) { + Func.Blocks[Func.Entry].Weight = 1; + } + // Introducing dummy source/sink pairs to allow flow circulation. + // The nodes corresponding to blocks of Func have indicies in the range + // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values. 
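  // Worked example of the numbering below (illustrative): for a function with
  // NumBlocks = 4, block B occupies nodes Bin = 3*B, Bout = 3*B + 1, and
  // Baux = 3*B + 2, i.e. nodes 0..11; the dummy nodes are then S = 12,
  // T = 13, S1 = 14, and T1 = 15, for 3 * 4 + 4 = 16 nodes in total.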
+ uint64_t S = 3 * NumBlocks; + uint64_t T = S + 1; + uint64_t S1 = S + 2; + uint64_t T1 = S + 3; + + Network.initialize(3 * NumBlocks + 4, S1, T1); + + // Create three nodes for every block of the function + for (uint64_t B = 0; B < NumBlocks; B++) { + auto &Block = Func.Blocks[B]; + assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) && + "non-zero weight of a block w/o weight except for an entry"); + + // Split every block into two nodes + uint64_t Bin = 3 * B; + uint64_t Bout = 3 * B + 1; + uint64_t Baux = 3 * B + 2; + if (Block.Weight > 0) { + Network.addEdge(S1, Bout, Block.Weight, 0); + Network.addEdge(Bin, T1, Block.Weight, 0); + } + + // Edges from S and to T + assert((!Block.isEntry() || !Block.isExit()) && + "a block cannot be an entry and an exit"); + if (Block.isEntry()) { + Network.addEdge(S, Bin, 0); + } else if (Block.isExit()) { + Network.addEdge(Bout, T, 0); + } + + // An auxiliary node to allow increase/reduction of block counts: + // We assume that decreasing block counts is more expensive than increasing, + // and thus, setting separate costs here. In the future we may want to tune + // the relative costs so as to maximize the quality of generated profiles. + int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc; + int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec; + if (Block.UnknownWeight) { + // Do not penalize changing weights of blocks w/o known profile count + AuxCostInc = 0; + AuxCostDec = 0; + } else { + // Increasing the count for "cold" blocks with zero initial count is more + // expensive than for "hot" ones + if (Block.Weight == 0) { + AuxCostInc = MinCostMaxFlow::AuxCostIncZero; + } + // Modifying the count of the entry block is expensive + if (Block.isEntry()) { + AuxCostInc = MinCostMaxFlow::AuxCostIncEntry; + AuxCostDec = MinCostMaxFlow::AuxCostDecEntry; + } + } + // For blocks with self-edges, do not penalize a reduction of the count, + // as all of the increase can be attributed to the self-edge + if (Block.HasSelfEdge) { + AuxCostDec = 0; + } + + Network.addEdge(Bin, Baux, AuxCostInc); + Network.addEdge(Baux, Bout, AuxCostInc); + if (Block.Weight > 0) { + Network.addEdge(Bout, Baux, AuxCostDec); + Network.addEdge(Baux, Bin, AuxCostDec); + } + } + + // Creating edges for every jump + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0; + Network.addEdge(SrcOut, DstIn, Cost); + } + } + + // Make sure we have a valid flow circulation + Network.addEdge(T, S, 0); +} + +/// Extract resulting block and edge counts from the flow network. 
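// (Illustrative note: with the numbering above, the count of block B is
// recovered by summing the flow leaving its Bout node (3*B + 1), while flow
// into B's auxiliary node (3*B + 2) is skipped unless the block has a
// self-edge, since self-edge flow is routed Bout -> Baux in this model.)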
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + + // Extract resulting block counts + for (uint64_t Src = 0; Src < NumBlocks; Src++) { + auto &Block = Func.Blocks[Src]; + uint64_t SrcOut = 3 * Src + 1; + int64_t Flow = 0; + for (auto &Adj : Network.getFlow(SrcOut)) { + uint64_t DstIn = Adj.first; + int64_t DstFlow = Adj.second; + bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2); + if (!IsAuxNode || Block.HasSelfEdge) { + Flow += DstFlow; + } + } + Block.Flow = Flow; + assert(Flow >= 0 && "negative block flow"); + } + + // Extract resulting jump counts + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + int64_t Flow = 0; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + Flow = Network.getFlow(SrcOut, DstIn); + } else { + uint64_t SrcOut = 3 * Src + 1; + uint64_t SrcAux = 3 * Src + 2; + int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux); + if (AuxFlow > 0) + Flow = AuxFlow; + } + Jump.Flow = Flow; + assert(Flow >= 0 && "negative jump flow"); + } +} + +#ifndef NDEBUG +/// Verify that the computed flow values satisfy flow conservation rules +void verifyWeights(const FlowFunction &Func) { + const uint64_t NumBlocks = Func.Blocks.size(); + auto InFlow = std::vector<uint64_t>(NumBlocks, 0); + auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); + for (auto &Jump : Func.Jumps) { + InFlow[Jump.Target] += Jump.Flow; + OutFlow[Jump.Source] += Jump.Flow; + } + + uint64_t TotalInFlow = 0; + uint64_t TotalOutFlow = 0; + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + if (Block.isEntry()) { + TotalInFlow += Block.Flow; + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + } else if (Block.isExit()) { + TotalOutFlow += Block.Flow; + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } else { + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } + } + assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); +} +#endif + +} // end of anonymous namespace + +/// Apply the profile inference algorithm for a given flow function +void llvm::applyFlowInference(FlowFunction &Func) { + // Create and apply an inference network model + auto InferenceNetwork = MinCostMaxFlow(); + initializeNetwork(InferenceNetwork, Func); + InferenceNetwork.run(); + + // Extract flow values for every block and every edge + extractWeights(InferenceNetwork, Func); + +#ifndef NDEBUG + // Verify the result + verifyWeights(Func); +#endif +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp index 6d995cf4c048..ea0e8343eb88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp @@ -34,6 +34,10 @@ cl::opt<bool> NoWarnSampleUnused( cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. ")); +cl::opt<bool> SampleProfileUseProfi( + "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Use profi to infer block and edge counts.")); + namespace sampleprofutil { /// Return true if the given callsite is hot wrt to hot cutoff threshold. 
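The flow-conservation property asserted by verifyWeights() can be checked on a toy example. The sketch below is standalone, uses made-up counts for a diamond CFG (entry -> {left, right} -> exit), and does not use the LLVM data structures; block indices, the Jump struct, and the numbers are purely illustrative.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  struct Jump { uint64_t Src, Dst; int64_t Flow; };
  // Blocks: 0 = entry, 1 = left, 2 = right, 3 = exit.
  std::vector<int64_t> BlockFlow = {100, 60, 40, 100};
  std::vector<Jump> Jumps = {{0, 1, 60}, {0, 2, 40}, {1, 3, 60}, {2, 3, 40}};

  std::vector<int64_t> In(4, 0), Out(4, 0);
  for (const auto &J : Jumps) {
    Out[J.Src] += J.Flow;
    In[J.Dst] += J.Flow;
  }

  // Entry must match its outgoing flow, exit its incoming flow, and every
  // other block must match both, mirroring the checks in verifyWeights().
  assert(Out[0] == BlockFlow[0]);
  assert(In[3] == BlockFlow[3]);
  for (unsigned B = 1; B <= 2; ++B)
    assert(In[B] == BlockFlow[B] && Out[B] == BlockFlow[B]);
  return 0;
}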
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index a042146d7ace..71c15d5c51fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -1833,22 +1834,6 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { return V; } -/// Check whether value has nuw/nsw/exact set but SCEV does not. -/// TODO: In reality it is better to check the poison recursively -/// but this is better than nothing. -static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) { - if (isa<OverflowingBinaryOperator>(I)) { - if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) { - if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap()) - return true; - if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap()) - return true; - } - } else if (isa<PossiblyExactOperator>(I) && I->isExact()) - return true; - return false; -} - ScalarEvolution::ValueOffsetPair SCEVExpander::FindValueInExprValueMap(const SCEV *S, const Instruction *InsertPt) { @@ -1872,8 +1857,7 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S, if (S->getType() == V->getType() && SE.DT.dominates(EntInst, InsertPt) && (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || - SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) && - !SCEVLostPoisonFlags(S, EntInst)) + SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) return {V, Offset}; } } @@ -1952,26 +1936,36 @@ Value *SCEVExpander::expand(const SCEV *S) { if (!V) V = visit(S); - else if (VO.second) { - if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { - Type *Ety = Vty->getPointerElementType(); - int64_t Offset = VO.second->getSExtValue(); - int64_t ESize = SE.getTypeSizeInBits(Ety); - if ((Offset * 8) % ESize == 0) { - ConstantInt *Idx = + else { + // If we're reusing an existing instruction, we are effectively CSEing two + // copies of the instruction (with potentially different flags). As such, + // we need to drop any poison generating flags unless we can prove that + // said flags must be valid for all new users. 
+ if (auto *I = dyn_cast<Instruction>(V)) + if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)) + I->dropPoisonGeneratingFlags(); + + if (VO.second) { + if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { + Type *Ety = Vty->getPointerElementType(); + int64_t Offset = VO.second->getSExtValue(); + int64_t ESize = SE.getTypeSizeInBits(Ety); + if ((Offset * 8) % ESize == 0) { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); - V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); - } else { - ConstantInt *Idx = + V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); + } else { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); + } + } else { + V = Builder.CreateSub(V, VO.second); } - } else { - V = Builder.CreateSub(V, VO.second); } } // Remember the expanded value for this SCEV at this location. @@ -2180,7 +2174,9 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, } // Use expand's logic which is used for reusing a previous Value in - // ExprValueMap. + // ExprValueMap. Note that we don't currently model the cost of + // needing to drop poison generating flags on the instruction if we + // want to reuse it. We effectively assume that has zero cost. ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At); if (VO.first) return VO; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f467de5f924e..afa3ecde77f9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3936,7 +3936,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, BasicBlock *KeepEdge1 = TrueBB; BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; - SmallPtrSet<BasicBlock *, 2> RemovedSuccessors; + SmallSetVector<BasicBlock *, 2> RemovedSuccessors; // Then remove the rest. 
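Returning to the SCEVExpander::expand hunk above: when an existing value is reused for a SCEV that differs from it by a constant byte offset, the expander emits a typed GEP if the offset is a multiple of the pointee size, and otherwise falls back to the i8 "uglygep". A minimal sketch of just that index arithmetic, where typedGEPIndex is a hypothetical helper written for this sketch and not an LLVM function:

#include <cassert>
#include <cstdint>
#include <optional>

// Given the byte offset from the value/offset pair and the pointee size in
// bits, return the signed element index for a typed GEP when the offset is
// element-aligned; otherwise the caller would use the byte-wise GEP over
// -Offset bytes instead.
static std::optional<int64_t> typedGEPIndex(int64_t OffsetBytes,
                                            int64_t EltSizeBits) {
  if ((OffsetBytes * 8) % EltSizeBits != 0)
    return std::nullopt;
  return -(OffsetBytes * 8) / EltSizeBits;
}

int main() {
  // A 16-byte offset over 32-bit elements steps back four elements.
  assert(typedGEPIndex(16, 32) == -4);
  // A 6-byte offset is not 32-bit aligned, so the byte-wise GEP is used.
  assert(!typedGEPIndex(6, 32).has_value());
  return 0;
}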
for (BasicBlock *Succ : successors(OldTerm)) { @@ -4782,6 +4782,26 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch, + DomTreeUpdater *DTU) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + auto *BB = Switch->getParent(); + auto *OrigDefaultBlock = Switch->getDefaultDest(); + OrigDefaultBlock->removePredecessor(BB); + BasicBlock *NewDefaultBlock = BasicBlock::Create( + BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), + OrigDefaultBlock); + new UnreachableInst(Switch->getContext(), NewDefaultBlock); + Switch->setDefaultDest(&*NewDefaultBlock); + if (DTU) { + SmallVector<DominatorTree::UpdateType, 2> Updates; + Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); + if (!is_contained(successors(BB), OrigDefaultBlock)) + Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); + DTU->applyUpdates(Updates); + } +} + /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, @@ -4927,10 +4947,14 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, // Gather dead cases. SmallVector<ConstantInt *, 8> DeadCases; SmallDenseMap<BasicBlock *, int, 8> NumPerSuccessorCases; + SmallVector<BasicBlock *, 8> UniqueSuccessors; for (auto &Case : SI->cases()) { auto *Successor = Case.getCaseSuccessor(); - if (DTU) + if (DTU) { + if (!NumPerSuccessorCases.count(Successor)) + UniqueSuccessors.push_back(Successor); ++NumPerSuccessorCases[Successor]; + } const APInt &CaseVal = Case.getCaseValue()->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) { @@ -4973,9 +4997,9 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, if (DTU) { std::vector<DominatorTree::UpdateType> Updates; - for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases) - if (I.second == 0) - Updates.push_back({DominatorTree::Delete, SI->getParent(), I.first}); + for (auto *Successor : UniqueSuccessors) + if (NumPerSuccessorCases[Successor] == 0) + Updates.push_back({DominatorTree::Delete, SI->getParent(), Successor}); DTU->applyUpdates(Updates); } @@ -6040,15 +6064,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (Succ == SI->getDefaultDest()) continue; Succ->removePredecessor(BB); - RemovedSuccessors.insert(Succ); + if (DTU && RemovedSuccessors.insert(Succ).second) + Updates.push_back({DominatorTree::Delete, BB, Succ}); } SI->eraseFromParent(); - if (DTU) { - for (BasicBlock *RemovedSuccessor : RemovedSuccessors) - Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor}); + if (DTU) DTU->applyUpdates(Updates); - } ++NumLookupTables; if (NeedMask) @@ -6215,7 +6237,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; - SmallPtrSet<BasicBlock *, 8> RemovedSuccs; + SmallSetVector<BasicBlock *, 8> RemovedSuccs; for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { BasicBlock *Dest = IBI->getDestination(i); if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { @@ -6305,8 +6327,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, // We've found an identical block. Update our predecessors to take that // path instead and make ourselves dead. 
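The eliminateDeadSwitchCases hunk above pairs the per-successor case counts with a separately maintained UniqueSuccessors vector so that dominator-tree "Delete" updates come out in first-seen order rather than hash-map order, the same determinism concern behind the SmallSetVector swaps nearby. A standalone sketch of that bookkeeping, with strings standing in for BasicBlock pointers:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> CaseSuccessors = {"bb1", "bb2", "bb1", "bb3"};
  std::vector<std::string> DeadCaseSuccessors = {"bb1", "bb1", "bb3"};

  // Counts live in a hash map, but successors are also recorded in
  // first-seen order so later updates are emitted deterministically.
  std::unordered_map<std::string, int> NumPerSuccessorCases;
  std::vector<std::string> UniqueSuccessors;
  for (const std::string &Succ : CaseSuccessors) {
    if (!NumPerSuccessorCases.count(Succ))
      UniqueSuccessors.push_back(Succ);
    ++NumPerSuccessorCases[Succ];
  }
  // Removing a dead case decrements its successor's count.
  for (const std::string &Succ : DeadCaseSuccessors)
    --NumPerSuccessorCases[Succ];

  // Successors whose count dropped to zero lose their CFG edge; iterating
  // the vector keeps the update order stable across runs.
  for (const std::string &Succ : UniqueSuccessors)
    if (NumPerSuccessorCases[Succ] == 0)
      std::printf("Delete edge switch -> %s\n", Succ.c_str());
  // Prints: Delete edge switch -> bb1
  //         Delete edge switch -> bb3
  return 0;
}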
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); - for (BasicBlock *Pred : Preds) { + SmallSetVector<BasicBlock *, 16> UniquePreds(pred_begin(BB), pred_end(BB)); + for (BasicBlock *Pred : UniquePreds) { InvokeInst *II = cast<InvokeInst>(Pred->getTerminator()); assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && "unexpected successor"); @@ -6323,8 +6345,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, if (isa<DbgInfoIntrinsic>(Inst)) Inst.eraseFromParent(); - SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB)); - for (BasicBlock *Succ : Succs) { + SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB)); + for (BasicBlock *Succ : UniqueSuccs) { Succ->removePredecessor(BB); if (DTU) Updates.push_back({DominatorTree::Delete, BB, Succ}); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23bb6f0860c9..5ca0adb4242c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -473,18 +473,10 @@ public: /// handle the more complex control flow around the loops. virtual BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, - VPTransformState &State); - /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); - /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, - bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); @@ -496,12 +488,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single GetElementPtrInst based on information gathered and - /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single first-order recurrence or pointer induction PHINode in /// a block. This method handles the induction variable canonicalization. It /// supports both VF = 1 for unrolled loops and arbitrary length vectors. @@ -511,9 +497,9 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. 
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -538,15 +524,6 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Vectorize Load and Store instructions with the base address given in \p - /// Addr, optionally masking the vector operations if \p BlockInMask is - /// non-null. Use \p State to translate given VPValues to IR values in the - /// vectorized loop. - void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, - bool ConsecutiveStride, bool Reverse); - /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. void setDebugLocFromInst(const Value *V, @@ -566,6 +543,17 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + protected: friend class LoopVectorizationPlanner; @@ -741,16 +729,16 @@ protected: /// vector loop. void addNewMetadata(Instruction *To, const Instruction *Orig); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Collect poison-generating recipes that may generate a poison value that is + /// used after vectorization, even when their operands are not poison. Those + /// recipes meet the following conditions: + /// * Contribute to the address computation of a recipe generating a widen + /// memory load/store (VPWidenMemoryInstructionRecipe or + /// VPInterleaveRecipe). + /// * Such a widen memory load/store has at least one underlying Instruction + /// that is in a basic block that needs predication and after vectorization + /// the generated instruction won't be predicated. + void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. @@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To, LVer->annotateInstWithNoAlias(To, Orig); } +void InnerLoopVectorizer::collectPoisonGeneratingRecipes( + VPTransformState &State) { + + // Collect recipes in the backward slice of `Root` that may generate a poison + // value that is used after vectorization. + SmallPtrSet<VPRecipeBase *, 16> Visited; + auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + SmallVector<VPRecipeBase *, 16> Worklist; + Worklist.push_back(Root); + + // Traverse the backward slice of Root through its use-def chain. 
+ while (!Worklist.empty()) { + VPRecipeBase *CurRec = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(CurRec).second) + continue; + + // Prune search if we find another recipe generating a widen memory + // instruction. Widen memory instructions involved in address computation + // will lead to gather/scatter instructions, which don't need to be + // handled. + if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || + isa<VPInterleaveRecipe>(CurRec)) + continue; + + // This recipe contributes to the address computation of a widen + // load/store. Collect recipe if its underlying instruction has + // poison-generating flags. + Instruction *Instr = CurRec->getUnderlyingInstr(); + if (Instr && Instr->hasPoisonGeneratingFlags()) + State.MayGeneratePoisonRecipes.insert(CurRec); + + // Add new definitions to the worklist. + for (VPValue *operand : CurRec->operands()) + if (VPDef *OpDef = operand->getDef()) + Worklist.push_back(cast<VPRecipeBase>(OpDef)); + } + }); + + // Traverse all the recipes in the VPlan and collect the poison-generating + // recipes in the backward slice starting at the address of a VPWidenRecipe or + // VPInterleaveRecipe. + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &Recipe : *VPBB) { + if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { + Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + VPDef *AddrDef = WidenRec->getAddr()->getDef(); + if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && + Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { + VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + if (AddrDef) { + // Check if any member of the interleave group needs predication. + const InterleaveGroup<Instruction> *InterGroup = + InterleaveRec->getInterleaveGroup(); + bool NeedPredication = false; + for (int I = 0, NumMembers = InterGroup->getNumMembers(); + I < NumMembers; ++I) { + Instruction *Member = InterGroup->getMember(I); + if (Member) + NeedPredication |= + Legal->blockNeedsPredication(Member->getParent()); + } + + if (NeedPredication) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } + } + } + } +} + void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) { propagateMetadata(To, From); @@ -1541,7 +1607,16 @@ public: // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { + bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + // When we know the load is uniform and the original scalar loop was not + // predicated we don't need to mark it as a predicated instruction. Any + // vectorised blocks created when tail-folding are something artificial we + // have introduced and we know there is always at least one active lane. + // That's why we call Legal->blockNeedsPredication here because it doesn't + // query tail-folding. 
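The collectPoisonGeneratingRecipes traversal above is a plain worklist walk over the use-def chain, pruning at other widened memory recipes and recording any visited definition whose underlying instruction carries poison-generating flags. A generic sketch of that shape over a toy def graph, with made-up names and encoding (nothing here is a VPlan type):

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Def {
  std::vector<std::string> Operands; // definitions feeding this one
  bool HasPoisonFlags = false;       // e.g. nuw/nsw/inbounds on the instr
  bool IsWidenMemory = false;        // prune: feeds a gather/scatter instead
};

static std::set<std::string>
collectBackwardSlice(const std::map<std::string, Def> &Defs,
                     const std::string &Root) {
  std::set<std::string> Visited, MayGeneratePoison;
  std::vector<std::string> Worklist = {Root};
  while (!Worklist.empty()) {
    std::string Cur = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(Cur).second)
      continue;
    const Def &D = Defs.at(Cur);
    if (D.IsWidenMemory) // stop at other widened loads/stores
      continue;
    if (D.HasPoisonFlags)
      MayGeneratePoison.insert(Cur);
    for (const std::string &Op : D.Operands)
      Worklist.push_back(Op);
  }
  return MayGeneratePoison;
}

int main() {
  // addr = gep inbounds (base, add nuw (i, 1)); base comes from another load.
  std::map<std::string, Def> Defs = {
      {"gep",  {{"base", "add"}, true,  false}},
      {"add",  {{"i"},           true,  false}},
      {"base", {{},              false, true}},
      {"i",    {{},              false, false}},
  };
  for (const std::string &Name : collectBackwardSlice(Defs, "gep"))
    std::printf("may generate poison: %s\n", Name.c_str());
  // Prints "add" and "gep" (set order), but not "base" or "i".
  return 0;
}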
+ if (IsKnownUniform && isa<LoadInst>(I) && + !Legal->blockNeedsPredication(I->getParent())) + return false; if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated @@ -1816,9 +1891,11 @@ private: /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. + /// during vectorization. collectLoopScalars should only add non-uniform nodes + /// to the list if they are used by a load/store instruction that is marked as + /// CM_Scalarize. Non-uniform scalarized instructions will be represented by + /// VF values in the vectorized loop, each corresponding to an iteration of + /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. @@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, - bool Reverse) { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); - - Type *ScalarDataTy = getLoadStoreType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); - const Align Alignment = getLoadStoreAlignment(Instr); - bool CreateGatherScatter = !ConsecutiveStride; - - VectorParts BlockInMaskParts(UF); - bool isMaskRequired = BlockInMask; - if (isMaskRequired) - for (unsigned Part = 0; Part < UF; ++Part) - BlockInMaskParts[Part] = State.get(BlockInMask, Part); - - const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { - // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); - PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); - } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); - } - - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - }; - - // Handle Stores: - if (SI) { - setDebugLocFromInst(SI); - - for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = reverseVector(StoredVal); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). - } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - setDebugLocFromInst(LI); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - addMetadata(NewLI, LI); - } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); - - // Add metadata to the load, but setVectorValue to the reverse shuffle. - addMetadata(NewLI, LI); - if (Reverse) - NewLI = reverseVector(NewLI); - } - - State.set(Def, NewLI, Part); - } -} - -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + // If the scalarized instruction contributes to the address computation of a + // widen masked load/store which was in a basic block that needed predication + // and is not predicated after vectorization, we can't propagate + // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized + // instruction could feed a poison value to the base address of the widen + // load/store. 
+ if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + Cloned->dropPoisonGeneratingFlags(); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - State.set(Def, Cloned, Instance); + State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<AssumeInst>(Cloned)) @@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, - VPTransformState &State) { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, EntryPart, Part); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. 
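The operand loop in scalarizeInstruction above redirects any requested lane to lane 0 for operands that are uniform after vectorization, since only that one scalar copy exists. A toy model of that lookup, mirroring the intent but not LLVM's VPTransformState API:

#include <cassert>
#include <vector>

// Scalar values are kept per (part, lane); uniform definitions only ever
// materialize lane 0, so lookups clamp the lane for them.
struct ScalarDef {
  bool UniformAfterVectorization;
  std::vector<std::vector<int>> PerPartLanes; // [part][lane] -> value
};

static int getScalar(const ScalarDef &D, unsigned Part, unsigned Lane) {
  unsigned EffectiveLane = D.UniformAfterVectorization ? 0 : Lane;
  return D.PerPartLanes[Part][EffectiveLane];
}

int main() {
  // VF = 4, UF = 1: a varying induction value and a uniform base address.
  ScalarDef Induction{false, {{0, 1, 2, 3}}};
  ScalarDef UniformBase{true, {{100}}}; // only lane 0 exists
  assert(getScalar(Induction, 0, 2) == 2);
  assert(getScalar(UniformBase, 0, 2) == 100); // lane clamped to 0
  return 0;
}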
- auto *Ptr = IsPtrLoopInvariant - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : State.get(Operands.getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { - VPValue *Operand = Operands.getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(VPDef, NewGEP, Part); - addMetadata(NewGEP, GEP); - } - } -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, VPTransformState &State) { @@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); - - bool NeedsVectorIndex = !IsUniform && VF.isScalable(); - Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; - if (NeedsVectorIndex) { - Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); - UnitStepVec = Builder.CreateStepVector(VecIVTy); - PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); - } + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part); - if (NeedsVectorIndex) { - // Here we cache the whole vector, which means we can support the - // extraction of any lane. However, in some cases the extractelement - // instruction that is generated for scalar uses of this vector (e.g. - // a load instruction) is not folded away. Therefore we still - // calculate values for the first n lanes to avoid redundant moves - // (when extracting the 0th element) and to produce scalar code (i.e. - // additional add/gep instructions instead of expensive extractelement - // instructions) when extracting higher-order elements. 
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); - Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); - Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, Part); - } - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); @@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, - VPTransformState &State) { - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : User.operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - State.set(Def, V, Part); - addMetadata(V, &I); - } - - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *B = State.get(User.getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(Def, C, Part); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, Cast, Part); - addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. 
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State) { @@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, - VPUser &Operands, - bool InvariantCond, - VPTransformState &State) { - setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = InvariantCond - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : nullptr; - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = - InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); - Value *Op0 = State.get(Operands.getOperand(1), Part); - Value *Op1 = State.get(Operands.getOperand(2), Part); - Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, Sel, Part); - addMetadata(Sel, &I); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + // A helper that evaluates a memory access's use of a pointer. If the use will + // be a scalar use and the pointer is only used by memory accesses, we place + // the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - - // If there is more than one user of Update (Ptr), we shouldn't assume it - // will be scalar after vectorisation as other users of the instruction - // may require widening. Otherwise, add it to ScalarPtrs. - if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { - ScalarPtrs.insert(Update); - return; - } - } // We only care about bitcast and getelementptr instructions contained in // the loop. 
if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; + // Returns true if \p Indvar is a pointer induction that is used directly by + // load/store instruction \p I. + auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, + Instruction *I) { + return Induction.second.getKind() == + InductionDescriptor::IK_PtrInduction && + (isa<LoadInst>(I) || isa<StoreInst>(I)) && + Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + }; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; @@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; @@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` + // that it is being called from this specific place. // Figure out whether the access is strided and get the stride value // if it's known in compile time @@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For a call to the llvm.fmuladd intrinsic we need to add the cost of a + // normal fmul instruction to the cost of the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += + TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); + // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. 
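The cost-model change just above adds an FMul cost on top of the fadd-reduction cost for llvm.fmuladd reductions; further down in this diff, adjustRecipesForReductions splits such a reduction into an explicit FMul recipe feeding an fadd reduction. A scalar sketch of that decomposition in plain C++ (nothing LLVM-specific): llvm.fmuladd permits either a fused or an unfused lowering, so both loops below are acceptable expansions, even though they can differ in the last bits when the target actually fuses.

#include <cassert>
#include <cmath>
#include <vector>

// Fused form: the whole reduction step stays a single fmuladd.
static double reduceWithFMulAdd(const std::vector<double> &A,
                                const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I)
    Acc = std::fma(A[I], B[I], Acc);
  return Acc;
}

// Split form: a separate multiply feeding an add reduction, which is the
// shape the vectorizer produces with the new FMul recipe.
static double reduceWithSplitMulAdd(const std::vector<double> &A,
                                    const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I) {
    double Prod = A[I] * B[I];
    Acc = Acc + Prod;
  }
  return Acc;
}

int main() {
  std::vector<double> A = {1.0, 2.0, 3.0}, B = {4.0, 5.0, 6.0};
  // 1*4 + 2*5 + 3*6 = 32; exact here because every product is representable.
  assert(reduceWithFMulAdd(A, B) == 32.0);
  assert(reduceWithSplitMulAdd(A, B) == 32.0);
  return 0;
}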
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { + if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); @@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "intermediate fn:\n" + << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (VPBB) VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); else { - Plan->setEntry(FirstVPBBForBB); + auto *TopRegion = new VPRegionBlock("vector loop"); + TopRegion->setEntry(FirstVPBBForBB); + Plan->setEntry(TopRegion); HeaderVPBB = FirstVPBBForBB; } VPBB = FirstVPBBForBB; @@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - assert(isa<VPBasicBlock>(Plan->getEntry()) && + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && - "entry block must be set to a non-empty VPBasicBlock"); + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || + (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + + if (IsFMulAdd) { + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe to use as the vector operand for the + // fadd reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); + FMulRecipe->setFastMathFlags(R->getFastMathFlags()); + WidenRecipe->getParent()->insert(FMulRecipe, + WidenRecipe->getIterator()); + VecOp = FMulRecipe; + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.ILV->setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.ILV->addMetadata(Sel, &I); + } } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.ILV->setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.count(this) > 0) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.ILV->addMetadata(V, &I); + } + + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.ILV->setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.ILV->addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.ILV->setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.ILV->addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant, State); + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.ILV->addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. 
The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = IsInBounds + ? State.Builder.CreateInBoundsGEP( + GEP->getSourceElementType(), Ptr, Indices) + : State.Builder.CreateGEP(GEP->getSourceElementType(), + Ptr, Indices); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.ILV->addMetadata(NewGEP, GEP); + } + } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, VPIteration(Part, Lane), IsPredicated, State); } @@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction( - &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask(), Consecutive, Reverse); + + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); + StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); + + assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGatherScatter = !Consecutive; + + auto &Builder = State.Builder; + InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); + bool isMaskRequired = getMask(); + if (isMaskRequired) + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = State.get(getMask(), Part); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + BlockInMaskParts[Part] = + Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); + } else { + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + PartPtr->setIsInBounds(InBounds); + } + + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + + // Handle Stores: + if (SI) { + State.ILV->setDebugLocFromInst(SI); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, Part); + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } else { + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.ILV->addMetadata(NewSI, SI); + } + return; + } + + // Handle loads. + assert(LI && "Must have a load instruction"); + State.ILV->setDebugLocFromInst(LI); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + State.ILV->addMetadata(NewLI, LI); + } else { + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewLI = Builder.CreateMaskedLoad( + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); + else + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + + // Add metadata to the load, but setVectorValue to the reverse shuffle. 
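The reverse-access pointer math in the CreateVecPtr lambda above can be written out for a fixed VF (so the runtime VF is simply VF and VScale is 1): each part's wide load starts -Part*VF + (1 - VF) elements from the scalar pointer, and the loaded vector is then reversed, so lane L of part P reads element -(P*VF) - L, i.e. the scalar iterations in descending order. The helpers below are written for this sketch only.

#include <cassert>
#include <cstdint>

// Element offset of the start of part Part's wide access, assuming fixed VF.
static int64_t partStartOffset(int64_t Part, int64_t VF) {
  int64_t NumElt = -Part * VF; // -Part * RunTimeVF
  int64_t LastLane = 1 - VF;   // 1 - RunTimeVF
  return NumElt + LastLane;    // the two chained GEPs in the real code
}

int main() {
  const int64_t VF = 4;
  // After the vector reverse, lane L of part P maps to this element offset.
  auto elementForLane = [&](int64_t Part, int64_t Lane) {
    return partStartOffset(Part, VF) + (VF - 1 - Lane);
  };
  assert(partStartOffset(0, VF) == -3); // part 0 loads elements -3..0
  assert(partStartOffset(1, VF) == -7); // part 1 loads elements -7..-4
  // Together the two parts cover elements 0, -1, -2, ..., -7 in order.
  for (int64_t Part = 0; Part < 2; ++Part)
    for (int64_t Lane = 0; Lane < VF; ++Lane)
      assert(elementForLane(Part, Lane) == -(Part * VF) - Lane);
  return 0;
}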
+ State.ILV->addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + } + + State.set(getVPSingleValue(), NewLI, Part); + } } // Determine how to lower the scalar epilogue, which depends on 1) optimising diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. 
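// E.g. extracts from %a and %b can still be expressed as one two-source
// shufflevector, but a third distinct source vector could not.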
- if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
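// A single SK_Broadcast shuffle materializes the reused scalar in every
// lane, so no per-element insertion cost is needed for this gather.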
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. 
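+ // If both inserts have extra users they start independent insert chains
+ // and cannot belong to the same buildvector sequence.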
+ if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. 
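+ // Lanes already produced by that vectorized tree entry keep an identity
+ // mapping, so the final shuffle leaves them in place.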
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. 
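+ // E.g. with MaxVF == 4, masks <0,1,u,u> and <u,u,0,1> combine into
+ // <0,1,4,5>, i.e. a single two-source permute of the two vectors.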
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. 
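/// E.g. the sub-mask <2,0,1> is applied as its inverse <1,2,0>.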
void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. 
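+ // Registering them lets optimizeGatherSequence() CSE these vector ops
+ // against identical ones emitted for other nodes.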
+ for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. 
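+ // E.g. it is not profitable to replace a shuffle whose trailing half is
+ // undef with a fully defined one if the shorter form would fit in fewer
+ // vector registers.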
+ return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, 
BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). 
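+ // The comparator below groups compares by operand type and predicate (a
+ // predicate and its swapped form count as compatible) so that vectorizable
+ // candidates end up adjacent to each other.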
+ auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 638467f94e1c..44b5e1df0839 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } @@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << Instruction::getOpcodeName(getOpcode()); } + O << FMF; + for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); @@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
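+ // Fast-math flags are only meaningful on floating-point operations;
+ // setting them on any other opcode is a programming error.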
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 00ee31007cb7..810dd5030f95 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,7 @@ class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPReplicateRecipe; class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. @@ -346,6 +347,10 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Holds recipes that may generate a poison value that is used after + /// vectorization, even when their operands are not poison. + SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; }; /// VPUsers instance used by VPBlockBase to manage CondBit and the block @@ -789,6 +794,7 @@ public: private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; + FastMathFlags FMF; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -802,13 +808,6 @@ public: : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPSingleValue()); - } - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} @@ -870,6 +869,9 @@ public: return true; } } + + /// Set the fast-math flags. + void setFastMathFlags(FastMathFlags FMFNew); }; /// VPWidenRecipe is a recipe for producing a copy of vector type its @@ -1511,7 +1513,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. 
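// A consecutive access is widened to a single contiguous load/store, a
// non-consecutive one to a gather/scatter; Reverse additionally flips the
// lane order.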
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, bool Consecutive, bool Reverse) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), + VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } @@ -1544,6 +1546,7 @@ public: VPValue *StoredValue, VPValue *Mask, bool Consecutive, bool Reverse) : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), + VPValue(VPValue::VPVMemoryInstructionSC, &Store, this), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); setMask(Mask); |