Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
18 files changed, 6349 insertions, 4296 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 0b7fc853dc1b..260d7889906b 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -37,13 +37,34 @@ // multiple scalar registers, similar to a GPU vectorized load. In theory ARM // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. +// +// Overview of the algorithm and terminology in this pass: +// +// - Break up each basic block into pseudo-BBs, composed of instructions which +// are guaranteed to transfer control to their successors. +// - Within a single pseudo-BB, find all loads, and group them into +// "equivalence classes" according to getUnderlyingObject() and loaded +// element size. Do the same for stores. +// - For each equivalence class, greedily build "chains". Each chain has a +// leader instruction, and every other member of the chain has a known +// constant offset from the first instr in the chain. +// - Break up chains so that they contain only contiguous accesses of legal +// size with no intervening may-alias instrs. +// - Convert each chain to vector instructions. +// +// The O(n^2) behavior of this pass comes from initially building the chains. +// In the worst case we have to compare each new instruction to all of those +// that came before. To limit this, we only calculate the offset to the leaders +// of the N most recently-used chains. #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -57,6 +78,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -67,23 +89,33 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> +#include <cstdint> #include <cstdlib> +#include <iterator> +#include <limits> +#include <numeric> +#include <optional> #include <tuple> +#include <type_traits> #include <utility> +#include <vector> using namespace llvm; @@ -92,21 +124,115 @@ using namespace llvm; STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); +namespace { + +// Equivalence class key, the initial tuple by which we group loads/stores. +// Loads/stores with different EqClassKeys are never merged. +// +// (We could in theory remove element-size from the this tuple. 
We'd just need +// to fix up the vector packing/unpacking code.) +using EqClassKey = + std::tuple<const Value * /* result of getUnderlyingObject() */, + unsigned /* AddrSpace */, + unsigned /* Load/Store element size bits */, + char /* IsLoad; char b/c bool can't be a DenseMap key */ + >; +[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const EqClassKey &K) { + const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K; + return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject + << " of element size " << ElementSize << " bits in addrspace " + << AddrSpace; +} + +// A Chain is a set of instructions such that: +// - All instructions have the same equivalence class, so in particular all are +// loads, or all are stores. +// - We know the address accessed by the i'th chain elem relative to the +// chain's leader instruction, which is the first instr of the chain in BB +// order. +// +// Chains have two canonical orderings: +// - BB order, sorted by Instr->comesBefore. +// - Offset order, sorted by OffsetFromLeader. +// This pass switches back and forth between these orders. +struct ChainElem { + Instruction *Inst; + APInt OffsetFromLeader; +}; +using Chain = SmallVector<ChainElem, 1>; + +void sortChainInBBOrder(Chain &C) { + sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); }); +} + +void sortChainInOffsetOrder(Chain &C) { + sort(C, [](const auto &A, const auto &B) { + if (A.OffsetFromLeader != B.OffsetFromLeader) + return A.OffsetFromLeader.slt(B.OffsetFromLeader); + return A.Inst->comesBefore(B.Inst); // stable tiebreaker + }); +} + +[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) { + for (const auto &E : C) { + dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n"; + } +} + +using EquivalenceClassMap = + MapVector<EqClassKey, SmallVector<Instruction *, 8>>; + // FIXME: Assuming stack alignment of 4 is always good enough -static const unsigned StackAdjustedAlignment = 4; +constexpr unsigned StackAdjustedAlignment = 4; -namespace { +Instruction *propagateMetadata(Instruction *I, const Chain &C) { + SmallVector<Value *, 8> Values; + for (const ChainElem &E : C) + Values.push_back(E.Inst); + return propagateMetadata(I, Values); +} -/// ChainID is an arbitrary token that is allowed to be different only for the -/// accesses that are guaranteed to be considered non-consecutive by -/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions -/// together and reducing the number of instructions the main search operates on -/// at a time, i.e. this is to reduce compile time and nothing else as the main -/// search has O(n^2) time complexity. The underlying type of ChainID should not -/// be relied upon. -using ChainID = const Value *; -using InstrList = SmallVector<Instruction *, 8>; -using InstrListMap = MapVector<ChainID, InstrList>; +bool isInvariantLoad(const Instruction *I) { + const LoadInst *LI = dyn_cast<LoadInst>(I); + return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load); +} + +/// Reorders the instructions that I depends on (the instructions defining its +/// operands), to ensure they dominate I. 
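+/// This is needed because the vectorized load is inserted at the position of
+/// the first scalar load in the chain, so address computations that originally
+/// appeared between the scalar loads may otherwise end up after their new use
+/// (see the worked example in vectorizeChain below).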
+void reorder(Instruction *I) { + SmallPtrSet<Instruction *, 16> InstructionsToMove; + SmallVector<Instruction *, 16> Worklist; + + Worklist.push_back(I); + while (!Worklist.empty()) { + Instruction *IW = Worklist.pop_back_val(); + int NumOperands = IW->getNumOperands(); + for (int i = 0; i < NumOperands; i++) { + Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); + if (!IM || IM->getOpcode() == Instruction::PHI) + continue; + + // If IM is in another BB, no need to move it, because this pass only + // vectorizes instructions within one BB. + if (IM->getParent() != I->getParent()) + continue; + + if (!IM->comesBefore(I)) { + InstructionsToMove.insert(IM); + Worklist.push_back(IM); + } + } + } + + // All instructions to move should follow I. Start from I, not from begin(). + for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) { + Instruction *IM = &*(BBI++); + if (!InstructionsToMove.count(IM)) + continue; + IM->moveBefore(I); + } +} class Vectorizer { Function &F; @@ -118,6 +244,12 @@ class Vectorizer { const DataLayout &DL; IRBuilder<> Builder; + // We could erase instrs right after vectorizing them, but that can mess up + // our BB iterators, and also can make the equivalence class keys point to + // freed memory. This is fixable, but it's simpler just to wait until we're + // done with the BB and erase all at once. + SmallVector<Instruction *, 128> ToErase; + public: Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) @@ -127,70 +259,83 @@ public: bool run(); private: - unsigned getPointerAddressSpace(Value *I); - static const unsigned MaxDepth = 3; - bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth = 0) const; - bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth) const; - bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, - unsigned Depth) const; - - /// After vectorization, reorder the instructions that I depends on - /// (the instructions defining its operands), to ensure they dominate I. - void reorder(Instruction *I); - - /// Returns the first and the last instructions in Chain. - std::pair<BasicBlock::iterator, BasicBlock::iterator> - getBoundaryInstrs(ArrayRef<Instruction *> Chain); - - /// Erases the original instructions after vectorizing. - void eraseInstructions(ArrayRef<Instruction *> Chain); - - /// "Legalize" the vector type that would be produced by combining \p - /// ElementSizeBits elements in \p Chain. Break into two pieces such that the - /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is - /// expected to have more than 4 elements. - std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> - splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); - - /// Finds the largest prefix of Chain that's vectorizable, checking for - /// intervening instructions which may affect the memory accessed by the - /// instructions within Chain. + /// Runs the vectorizer on a "pseudo basic block", which is a range of + /// instructions [Begin, End) within one BB all of which have + /// isGuaranteedToTransferExecutionToSuccessor(I) == true. + bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End); + + /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores + /// in the same BB with the same value for getUnderlyingObject() etc. 
+ bool runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass); + + /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class + /// where all instructions access a known, constant offset from the first + /// instruction. + bool runOnChain(Chain &C); + + /// Splits the chain into subchains of instructions which read/write a + /// contiguous block of memory. Discards any length-1 subchains (because + /// there's nothing to vectorize in there). + std::vector<Chain> splitChainByContiguity(Chain &C); + + /// Splits the chain into subchains where it's safe to hoist loads up to the + /// beginning of the sub-chain and it's safe to sink loads up to the end of + /// the sub-chain. Discards any length-1 subchains. + std::vector<Chain> splitChainByMayAliasInstrs(Chain &C); + + /// Splits the chain into subchains that make legal, aligned accesses. + /// Discards any length-1 subchains. + std::vector<Chain> splitChainByAlignment(Chain &C); + + /// Converts the instrs in the chain into a single vectorized load or store. + /// Adds the old scalar loads/stores to ToErase. + bool vectorizeChain(Chain &C); + + /// Tries to compute the offset in bytes PtrB - PtrA. + std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth = 0); + std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth); + std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth); + + /// Gets the element type of the vector that the chain will load or store. + /// This is nontrivial because the chain may contain elements of different + /// types; e.g. it's legal to have a chain that contains both i32 and float. + Type *getChainElemTy(const Chain &C); + + /// Determines whether ChainElem can be moved up (if IsLoad) or down (if + /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias + /// instructions. + /// + /// The map ChainElemOffsets must contain all of the elements in + /// [ChainBegin, ChainElem] and their offsets from some arbitrary base + /// address. It's ok if it contains additional entries. + template <bool IsLoadChain> + bool isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); + + /// Collects loads and stores grouped by "equivalence class", where: + /// - all elements in an eq class are a load or all are a store, + /// - they all load/store the same element size (it's OK to have e.g. i8 and + /// <4 x i8> in the same class, but not i32 and <4 x i8>), and + /// - they all have the same value for getUnderlyingObject(). + EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End); + + /// Partitions Instrs into "chains" where every instruction has a known + /// constant offset from the first instr in the chain. /// - /// The elements of \p Chain must be all loads or all stores and must be in - /// address order. - ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); - - /// Collects load and store instructions to vectorize. - std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); - - /// Processes the collected instructions, the \p Map. The values of \p Map - /// should be all loads or all stores. - bool vectorizeChains(InstrListMap &Map); - - /// Finds the load/stores to consecutive memory addresses and vectorizes them. 
- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); - - /// Vectorizes the load instructions in Chain. - bool - vectorizeLoadChain(ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed); - - /// Vectorizes the store instructions in Chain. - bool - vectorizeStoreChain(ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed); - - /// Check if this load/store access is misaligned accesses. - /// Returns a \p RelativeSpeed of an operation if allowed suitable to - /// compare to another result for the same \p AddressSpace and potentially - /// different \p Alignment and \p SzInBytes. - bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed); + /// Postcondition: For all i, ret[i][0].second == 0, because the first instr + /// in the chain is the leader, and an instr touches distance 0 from itself. + std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -198,7 +343,8 @@ public: static char ID; LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { - initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); + initializeLoadStoreVectorizerLegacyPassPass( + *PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -250,11 +396,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { AssumptionCache &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - return V.run(); + return Vectorizer(F, AA, AC, DT, SE, TTI).run(); } -PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { +PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, + FunctionAnalysisManager &AM) { // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return PreservedAnalyses::all(); @@ -265,125 +411,681 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - bool Changed = V.run(); + bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); return Changed ? PA : PreservedAnalyses::all(); } -// The real propagateMetadata expects a SmallVector<Value*>, but we deal in -// vectors of Instructions. -static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { - SmallVector<Value *, 8> VL(IL.begin(), IL.end()); - propagateMetadata(I, VL); -} - -// Vectorizer Implementation bool Vectorizer::run() { bool Changed = false; - - // Scan the blocks in the function in post order. + // Break up the BB if there are any instrs which aren't guaranteed to transfer + // execution to their successor. + // + // Consider, for example: + // + // def assert_arr_len(int n) { if (n < 2) exit(); } + // + // load arr[0] + // call assert_array_len(arr.length) + // load arr[1] + // + // Even though assert_arr_len does not read or write any memory, we can't + // speculate the second load before the call. More info at + // https://github.com/llvm/llvm-project/issues/52950. 
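+ //
+ // In the example above, the call is not guaranteed to return, so it acts as
+ // a barrier: the two loads land in different pseudo-BBs and are therefore
+ // never considered for vectorization with each other.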
for (BasicBlock *BB : post_order(&F)) { - InstrListMap LoadRefs, StoreRefs; - std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); - Changed |= vectorizeChains(LoadRefs); - Changed |= vectorizeChains(StoreRefs); + // BB must at least have a terminator. + assert(!BB->empty()); + + SmallVector<BasicBlock::iterator, 8> Barriers; + Barriers.push_back(BB->begin()); + for (Instruction &I : *BB) + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + Barriers.push_back(I.getIterator()); + Barriers.push_back(BB->end()); + + for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End; + ++It) + Changed |= runOnPseudoBB(*It, *std::next(It)); + + for (Instruction *I : ToErase) { + auto *PtrOperand = getLoadStorePointerOperand(I); + if (I->use_empty()) + I->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + ToErase.clear(); } return Changed; } -unsigned Vectorizer::getPointerAddressSpace(Value *I) { - if (LoadInst *L = dyn_cast<LoadInst>(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast<StoreInst>(I)) - return S->getPointerAddressSpace(); - return -1; +bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... "; + if (End != Begin->getParent()->end()) + dbgs() << *End; + else + dbgs() << "<BB end>"; + dbgs() << ")\n"; + }); + + bool Changed = false; + for (const auto &[EqClassKey, EqClass] : + collectEquivalenceClasses(Begin, End)) + Changed |= runOnEquivalenceClass(EqClassKey, EqClass); + + return Changed; } -// FIXME: Merge with llvm::isConsecutiveAccess -bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getLoadStorePointerOperand(A); - Value *PtrB = getLoadStorePointerOperand(B); - unsigned ASA = getPointerAddressSpace(A); - unsigned ASB = getPointerAddressSpace(B); +bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass) { + bool Changed = false; - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; + LLVM_DEBUG({ + dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() + << " keyed on " << EqClassKey << ":\n"; + for (Instruction *I : EqClass) + dbgs() << " " << *I << "\n"; + }); - // Make sure that A and B are different pointers of the same size type. - Type *PtrATy = getLoadStoreType(A); - Type *PtrBTy = getLoadStoreType(B); - if (PtrA == PtrB || - PtrATy->isVectorTy() != PtrBTy->isVectorTy() || - DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || - DL.getTypeStoreSize(PtrATy->getScalarType()) != - DL.getTypeStoreSize(PtrBTy->getScalarType())) - return false; + std::vector<Chain> Chains = gatherChains(EqClass); + LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() + << " nontrivial chains.\n";); + for (Chain &C : Chains) + Changed |= runOnChain(C); + return Changed; +} - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); +bool Vectorizer::runOnChain(Chain &C) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; + dumpChain(C); + }); - return areConsecutivePointers(PtrA, PtrB, Size); + // Split up the chain into increasingly smaller chains, until we can finally + // vectorize the chains. + // + // (Don't be scared by the depth of the loop nest here. 
These operations are + // all at worst O(n lg n) in the number of instructions, and splitting chains + // doesn't change the number of instrs. So the whole loop nest is O(n lg n).) + bool Changed = false; + for (auto &C : splitChainByMayAliasInstrs(C)) + for (auto &C : splitChainByContiguity(C)) + for (auto &C : splitChainByAlignment(C)) + Changed |= vectorizeChain(C); + return Changed; } -bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - APInt PtrDelta, unsigned Depth) const { - unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); - APInt OffsetA(PtrBitWidth, 0); - APInt OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); +std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) { + if (C.empty()) + return {}; - unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + sortChainInBBOrder(C); - if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n"; + dumpChain(C); + }); + + // We know that elements in the chain with nonverlapping offsets can't + // alias, but AA may not be smart enough to figure this out. Use a + // hashtable. + DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets; + for (const auto &E : C) + ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader}); + + // Loads get hoisted up to the first load in the chain. Stores get sunk + // down to the last store in the chain. Our algorithm for loads is: + // + // - Take the first element of the chain. This is the start of a new chain. + // - Take the next element of `Chain` and check for may-alias instructions + // up to the start of NewChain. If no may-alias instrs, add it to + // NewChain. Otherwise, start a new NewChain. + // + // For stores it's the same except in the reverse direction. + // + // We expect IsLoad to be an std::bool_constant. + auto Impl = [&](auto IsLoad) { + // MSVC is unhappy if IsLoad is a capture, so pass it as an arg. + auto [ChainBegin, ChainEnd] = [&](auto IsLoad) { + if constexpr (IsLoad()) + return std::make_pair(C.begin(), C.end()); + else + return std::make_pair(C.rbegin(), C.rend()); + }(IsLoad); + assert(ChainBegin != ChainEnd); + + std::vector<Chain> Chains; + SmallVector<ChainElem, 1> NewChain; + NewChain.push_back(*ChainBegin); + for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) { + if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst, + ChainOffsets)) { + LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst + << "\n"); + NewChain.push_back(*ChainIt); + } else { + LLVM_DEBUG( + dbgs() << "LSV: Found intervening may-alias instrs; cannot merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n"); + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + + // Start a new chain. 
+ NewChain = SmallVector<ChainElem, 1>({*ChainIt}); + } + } + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + return Chains; + }; + + if (isa<LoadInst>(C[0].Inst)) + return Impl(/*IsLoad=*/std::bool_constant<true>()); + + assert(isa<StoreInst>(C[0].Inst)); + return Impl(/*IsLoad=*/std::bool_constant<false>()); +} + +std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) { + if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByContiguity considering chain:\n"; + dumpChain(C); + }); + + std::vector<Chain> Ret; + Ret.push_back({C.front()}); + + for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) { + // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd). + auto &CurChain = Ret.back(); + const ChainElem &Prev = CurChain.back(); + unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); + assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " + "collectEquivalenceClass"); + APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; + + // Add this instruction to the end of the current chain, or start a new one. + bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; + LLVM_DEBUG(dbgs() << "LSV: Instructions are " + << (AreContiguous ? "" : "not ") << "contiguous: " + << *Prev.Inst << " (ends at offset " << PrevReadEnd + << ") -> " << *It->Inst << " (starts at offset " + << It->OffsetFromLeader << ")\n"); + if (AreContiguous) + CurChain.push_back(*It); + else + Ret.push_back({*It}); + } + + // Filter out length-1 chains, these are uninteresting. + llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); + return Ret; +} + +Type *Vectorizer::getChainElemTy(const Chain &C) { + assert(!C.empty()); + // The rules are: + // - If there are any pointer types in the chain, use an integer type. + // - Prefer an integer type if it appears in the chain. + // - Otherwise, use the first type in the chain. + // + // The rule about pointer types is a simplification when we merge e.g. a load + // of a ptr and a double. There's no direct conversion from a ptr to a + // double; it requires a ptrtoint followed by a bitcast. + // + // It's unclear to me if the other rules have any practical effect, but we do + // it to match this pass's previous behavior. + if (any_of(C, [](const ChainElem &E) { + return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); + })) { + return Type::getIntNTy( + F.getContext(), + DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); + } + + for (const ChainElem &E : C) + if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy()) + return T; + return getLoadStoreType(C[0].Inst)->getScalarType(); +} + +std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) { + // We use a simple greedy algorithm. + // - Given a chain of length N, find all prefixes that + // (a) are not longer than the max register length, and + // (b) are a power of 2. + // - Starting from the longest prefix, try to create a vector of that length. + // - If one of them works, great. Repeat the algorithm on any remaining + // elements in the chain. + // - If none of them work, discard the first element and repeat on a chain + // of length N-1. 
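+ //
+ // For example, with 16-byte vector registers and a chain of 7 contiguous i32
+ // accesses, we would first try to form a <4 x i32> from elements [0, 4); if
+ // that is legal and fast, we repeat on the remaining elements [4, 7), where
+ // the best we can do is a <2 x i32>, leaving the final element scalar.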
+ if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByAlignment considering chain:\n"; + dumpChain(C); + }); + + bool IsLoadChain = isa<LoadInst>(C[0].Inst); + auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize, + unsigned ChainSizeBytes, VectorType *VecTy) { + return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy) + : TTI.getStoreVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy); + }; + +#ifndef NDEBUG + for (const auto &E : C) { + Type *Ty = getLoadStoreType(E.Inst)->getScalarType(); + assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) && + "Should have filtered out non-power-of-two elements in " + "collectEquivalenceClasses."); + } +#endif + + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + + std::vector<Chain> Ret; + for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { + // Find candidate chains of size not greater than the largest vector reg. + // These chains are over the closed interval [CBegin, CEnd]. + SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8> + CandidateChains; + for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) { + APInt Sz = C[CEnd].OffsetFromLeader + + DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) - + C[CBegin].OffsetFromLeader; + if (Sz.sgt(VecRegBytes)) + break; + CandidateChains.push_back( + {CEnd, static_cast<unsigned>(Sz.getLimitedValue())}); + } + + // Consider the longest chain first. + for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend(); + It != End; ++It) { + auto [CEnd, SizeBytes] = *It; + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment considering candidate chain [" + << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n"); + + Type *VecElemTy = getChainElemTy(C); + // Note, VecElemTy is a power of 2, but might be less than one byte. For + // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case + // VecElemTy would be i4. + unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy); + + // SizeBytes and VecElemBits are powers of 2, so they divide evenly. + assert((8 * SizeBytes) % VecElemBits == 0); + unsigned NumVecElems = 8 * SizeBytes / VecElemBits; + FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems); + unsigned VF = 8 * VecRegBytes / VecElemBits; + + // Check that TTI is happy with this vectorization factor. + unsigned TargetVF = getVectorFactor(VF, VecElemBits, + VecElemBits * NumVecElems / 8, VecTy); + if (TargetVF != VF && TargetVF < NumVecElems) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because TargetVF=" + << TargetVF << " != VF=" << VF + << " and TargetVF < NumVecElems=" << NumVecElems << "\n"); + continue; + } + + // Is a load/store with this alignment allowed by TTI and at least as fast + // as an unvectorized load/store? + // + // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). 
+ auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, + &F = F](Align Alignment) { + if (Alignment.value() % SizeBytes == 0) + return true; + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". Therefore this access won't be vectorized.\n"); + return false; + } + return true; + }; + + // If we're loading/storing from an alloca, align it if possible. + // + // FIXME: We eagerly upgrade the alignment, regardless of whether TTI + // tells us this is beneficial. This feels a bit odd, but it matches + // existing tests. This isn't *so* bad, because at most we align to 4 + // bytes (current value of StackAdjustedAlignment). + // + // FIXME: We will upgrade the alignment of the alloca even if it turns out + // we can't vectorize for some other reason. + Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst); + bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() && + isa<AllocaInst>(PtrOperand->stripPointerCasts()); + Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); + Align PrefAlign = Align(StackAdjustedAlignment); + if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && + IsAllowedAndFast(PrefAlign)) { + Align NewAlign = getOrEnforceKnownAlignment( + PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); + if (NewAlign >= Alignment) { + LLVM_DEBUG(dbgs() + << "LSV: splitByChain upgrading alloca alignment from " + << Alignment.value() << " to " << NewAlign.value() + << "\n"); + Alignment = NewAlign; + } + } + + if (!IsAllowedAndFast(Alignment)) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because its alignment is not AllowedAndFast: " + << Alignment.value() << "\n"); + continue; + } + + if ((IsLoadChain && + !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) || + (!IsLoadChain && + !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because !isLegalToVectorizeLoad/StoreChain."); + continue; + } + + // Hooray, we can vectorize this chain! + Chain &NewChain = Ret.emplace_back(); + for (unsigned I = CBegin; I <= CEnd; ++I) + NewChain.push_back(C[I]); + CBegin = CEnd; // Skip over the instructions we've added to the chain. + break; + } + } + return Ret; +} + +bool Vectorizer::vectorizeChain(Chain &C) { + if (C.size() < 2) return false; - // In case if we have to shrink the pointer - // stripAndAccumulateInBoundsConstantOffsets should properly handle a - // possible overflow and the value should fit into a smallest data type - // used in the cast/gep chain. 
- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && - OffsetB.getMinSignedBits() <= NewPtrBitWidth); + sortChainInOffsetOrder(C); - OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); - OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); - PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + LLVM_DEBUG({ + dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n"; + dumpChain(C); + }); - APInt OffsetDelta = OffsetB - OffsetA; + Type *VecElemTy = getChainElemTy(C); + bool IsLoadChain = isa<LoadInst>(C[0].Inst); + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned ChainBytes = std::accumulate( + C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) { + return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst)); + }); + assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0); + // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller + // than 1 byte (e.g. VecTy == <32 x i1>). + Type *VecTy = FixedVectorType::get( + VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy)); + + Align Alignment = getLoadStoreAlignment(C[0].Inst); + // If this is a load/store of an alloca, we might have upgraded the alloca's + // alignment earlier. Get the new alignment. + if (AS == DL.getAllocaAddrSpace()) { + Alignment = std::max( + Alignment, + getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst), + MaybeAlign(), DL, C[0].Inst, nullptr, &DT)); + } - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == PtrDelta; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the pointer delta requested. - APInt BaseDelta = PtrDelta - OffsetDelta; - - // Compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE.getSCEV(PtrA); - const SCEV *PtrSCEVB = SE.getSCEV(PtrB); - const SCEV *C = SE.getConstant(BaseDelta); - const SCEV *X = SE.getAddExpr(PtrSCEVA, C); - if (X == PtrSCEVB) + // All elements of the chain must have the same scalar-type size. +#ifndef NDEBUG + for (const ChainElem &E : C) + assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) == + DL.getTypeStoreSize(VecElemTy)); +#endif + + Instruction *VecInst; + if (IsLoadChain) { + // Loads get hoisted to the location of the first load in the chain. We may + // also need to hoist the (transitive) operands of the loads. + Builder.SetInsertPoint( + std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. 
+ Value *Bitcast = Builder.CreateBitCast( + getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS)); + VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + + unsigned VecIdx = 0; + for (const ChainElem &E : C) { + Instruction *I = E.Inst; + Value *V; + Type *T = getLoadStoreType(I); + if (auto *VT = dyn_cast<FixedVectorType>(T)) { + auto Mask = llvm::to_vector<8>( + llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements())); + V = Builder.CreateShuffleVector(VecInst, Mask, I->getName()); + VecIdx += VT->getNumElements(); + } else { + V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx), + I->getName()); + ++VecIdx; + } + if (V->getType() != I->getType()) + V = Builder.CreateBitOrPointerCast(V, I->getType()); + I->replaceAllUsesWith(V); + } + + // Finally, we need to reorder the instrs in the BB so that the (transitive) + // operands of VecInst appear before it. To see why, suppose we have + // vectorized the following code: + // + // ptr1 = gep a, 1 + // load1 = load i32 ptr1 + // ptr0 = gep a, 0 + // load0 = load i32 ptr0 + // + // We will put the vectorized load at the location of the earliest load in + // the BB, i.e. load1. We get: + // + // ptr1 = gep a, 1 + // loadv = load <2 x i32> ptr0 + // load0 = extractelement loadv, 0 + // load1 = extractelement loadv, 1 + // ptr0 = gep a, 0 + // + // Notice that loadv uses ptr0, which is defined *after* it! + reorder(VecInst); + } else { + // Stores get sunk to the location of the last store in the chain. + Builder.SetInsertPoint( + std::max_element(C.begin(), C.end(), [](auto &A, auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Build the vector to store. + Value *Vec = PoisonValue::get(VecTy); + unsigned VecIdx = 0; + auto InsertElem = [&](Value *V) { + if (V->getType() != VecElemTy) + V = Builder.CreateBitOrPointerCast(V, VecElemTy); + Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++)); + }; + for (const ChainElem &E : C) { + auto I = cast<StoreInst>(E.Inst); + if (FixedVectorType *VT = + dyn_cast<FixedVectorType>(getLoadStoreType(I))) { + for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { + InsertElem(Builder.CreateExtractElement(I->getValueOperand(), + Builder.getInt32(J))); + } + } else { + InsertElem(I->getValueOperand()); + } + } + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedStore( + Vec, + Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst), + VecTy->getPointerTo(AS)), + Alignment); + } + + propagateMetadata(VecInst, C); + + for (const ChainElem &E : C) + ToErase.push_back(E.Inst); + + ++NumVectorInstructions; + NumScalarsVectorized += C.size(); + return true; +} + +template <bool IsLoadChain> +bool Vectorizer::isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) { + LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> " + << *ChainBegin << ")\n"); + + assert(isa<LoadInst>(ChainElem) == IsLoadChain); + if (ChainElem == ChainBegin) return true; - // The above check will not catch the cases where one of the pointers is - // factorized but the other one is not, such as (C + (S * (A + B))) vs - // (AS + BS). Get the minus scev. That will allow re-combining the expresions - // and getting the simplified difference. 
- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); - if (C == Dist) + // Invariant loads can always be reordered; by definition they are not + // clobbered by stores. + if (isInvariantLoad(ChainElem)) return true; - // Sometimes even this doesn't work, because SCEV can't always see through - // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking - // things the hard way. - return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); + auto BBIt = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainElem); + else + return BasicBlock::iterator(ChainElem); + }()); + auto BBItEnd = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainBegin); + else + return BasicBlock::iterator(ChainBegin); + }()); + + const APInt &ChainElemOffset = ChainOffsets.at(ChainElem); + const unsigned ChainElemSize = + DL.getTypeStoreSize(getLoadStoreType(ChainElem)); + + for (; BBIt != BBItEnd; ++BBIt) { + Instruction *I = &*BBIt; + + if (!I->mayReadOrWriteMemory()) + continue; + + // Loads can be reordered with other loads. + if (IsLoadChain && isa<LoadInst>(I)) + continue; + + // Stores can be sunk below invariant loads. + if (!IsLoadChain && isInvariantLoad(I)) + continue; + + // If I is in the chain, we can tell whether it aliases ChainIt by checking + // what offset ChainIt accesses. This may be better than AA is able to do. + // + // We should really only have duplicate offsets for stores (the duplicate + // loads should be CSE'ed), but in case we have a duplicate load, we'll + // split the chain so we don't have to handle this case specially. + if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) { + // I and ChainElem overlap if: + // - I and ChainElem have the same offset, OR + // - I's offset is less than ChainElem's, but I touches past the + // beginning of ChainElem, OR + // - ChainElem's offset is less than I's, but ChainElem touches past the + // beginning of I. + const APInt &IOffset = OffsetIt->second; + unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I)); + if (IOffset == ChainElemOffset || + (IOffset.sle(ChainElemOffset) && + (IOffset + IElemSize).sgt(ChainElemOffset)) || + (ChainElemOffset.sle(IOffset) && + (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) { + LLVM_DEBUG({ + // Double check that AA also sees this alias. If not, we probably + // have a bug. + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)); + dbgs() << "LSV: Found alias in chain: " << *I << "\n"; + }); + return false; // We found an aliasing instruction; bail. + } + + continue; // We're confident there's no alias. + } + + LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n"); + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + if (IsLoadChain ? 
isModSet(MR) : isModOrRefSet(MR)) { + LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n" + << " Aliasing instruction:\n" + << " " << *I << '\n' + << " Aliased instruction and pointer:\n" + << " " << *ChainElem << '\n' + << " " << *getLoadStorePointerOperand(ChainElem) + << '\n'); + + return false; + } + } + return true; } static bool checkNoWrapFlags(Instruction *I, bool Signed) { @@ -395,10 +1097,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) { static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed) { - // If both OpA and OpB is an add with NSW/NUW and with - // one of the operands being the same, we can guarantee that the - // transformation is safe if we can prove that OpA won't overflow when - // IdxDiff added to the other operand of OpA. + LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff + << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA=" + << MatchingOpIdxA << ", AddOpB=" << *AddOpB + << ", MatchingOpIdxB=" << MatchingOpIdxB + << ", Signed=" << Signed << "\n"); + // If both OpA and OpB are adds with NSW/NUW and with one of the operands + // being the same, we can guarantee that the transformation is safe if we can + // prove that OpA won't overflow when Ret added to the other operand of OpA. // For example: // %tmp7 = add nsw i32 %tmp2, %v0 // %tmp8 = sext i32 %tmp7 to i64 @@ -407,10 +1113,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, // %tmp12 = add nsw i32 %tmp2, %tmp11 // %tmp13 = sext i32 %tmp12 to i64 // - // Both %tmp7 and %tmp2 has the nsw flag and the first operand - // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow - // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the - // nsw flag. + // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2. + // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds + // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag. assert(AddOpA->getOpcode() == Instruction::Add && AddOpB->getOpcode() == Instruction::Add && checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed)); @@ -461,24 +1166,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, return false; } -bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, - APInt PtrDelta, - unsigned Depth) const { +std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs( + Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA + << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst + << " Depth=" << Depth << "\n"); auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); if (!GEPA || !GEPB) - return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); + return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth); // Look through GEPs after checking they're the same except for the last // index. 
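// For example:
//   %gepA = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %i
//   %gepB = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %j
// are identical except for their final index, so PtrB - PtrA is (%j - %i)
// times the 4-byte stride, provided the checks below can show %j - %i is a
// known constant and the extended addition cannot overflow.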
if (GEPA->getNumOperands() != GEPB->getNumOperands() || GEPA->getPointerOperand() != GEPB->getPointerOperand()) - return false; + return std::nullopt; gep_type_iterator GTIA = gep_type_begin(GEPA); gep_type_iterator GTIB = gep_type_begin(GEPB); for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { if (GTIA.getOperand() != GTIB.getOperand()) - return false; + return std::nullopt; ++GTIA; ++GTIB; } @@ -487,23 +1194,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) - return false; + return std::nullopt; - if (PtrDelta.isNegative()) { - if (PtrDelta.isMinSignedValue()) - return false; - PtrDelta.negate(); - std::swap(OpA, OpB); - } uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); - if (PtrDelta.urem(Stride) != 0) - return false; - unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); - APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth); // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) - return false; + return std::nullopt; bool Signed = isa<SExtInst>(OpA); @@ -511,7 +1208,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Value *ValA = OpA->getOperand(0); OpB = dyn_cast<Instruction>(OpB->getOperand(0)); if (!OpB || ValA->getType() != OpB->getType()) - return false; + return std::nullopt; + + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); + const SCEV *OffsetSCEVB = SE.getSCEV(OpB); + const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); + if (IdxDiffSCEV == SE.getCouldNotCompute()) + return std::nullopt; + + ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV); + if (!IdxDiffRange.isSingleElement()) + return std::nullopt; + APInt IdxDiff = *IdxDiffRange.getSingleElement(); + + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff + << "\n"); // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; @@ -530,10 +1241,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) && checkNoWrapFlags(OpB, Signed)) { - // In the checks below a matching operand in OpA and OpB is - // an operand which is the same in those two instructions. - // Below we account for possible orders of the operands of - // these add instructions. + // In the checks below a matching operand in OpA and OpB is an operand which + // is the same in those two instructions. Below we account for possible + // orders of the operands of these add instructions. for (unsigned MatchingOpIdxA : {0, 1}) for (unsigned MatchingOpIdxB : {0, 1}) if (!Safe) @@ -544,802 +1254,267 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Third attempt: - // If all set bits of IdxDiff or any higher order bit other than the sign bit - // are known to be zero in ValA, we can add Diff to it while guaranteeing no - // overflow of any sort. + // + // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher + // order bit other than the sign bit are known to be zero in ValA, we can add + // Diff to it while guaranteeing no overflow of any sort. + // + // If IdxDiff is negative, do the same, but swap ValA and ValB. 
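+ //
+ // For example, if IdxDiff is 3 (0b11) and the two low bits of ValA are known
+ // to be zero, adding 3 just ORs in those bits: no carry can propagate into
+ // the higher bits, so neither signed nor unsigned overflow is possible.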
if (!Safe) { + // When computing known bits, use the GEPs as context instructions, since + // they likely are in the same BB as the load/store. KnownBits Known(BitWidth); - computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT); + computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC, + ContextInst, &DT); APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); if (Signed) BitsAllowedToBeSet.clearBit(BitWidth - 1); - if (BitsAllowedToBeSet.ult(IdxDiff)) - return false; + if (BitsAllowedToBeSet.ult(IdxDiff.abs())) + return std::nullopt; + Safe = true; } - const SCEV *OffsetSCEVA = SE.getSCEV(ValA); - const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); - const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); - return X == OffsetSCEVB; + if (Safe) + return IdxDiff * Stride; + return std::nullopt; } -bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { +std::optional<APInt> Vectorizer::getConstantOffsetSelects( + Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) { if (Depth++ == MaxDepth) - return false; + return std::nullopt; if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { - return SelectA->getCondition() == SelectB->getCondition() && - areConsecutivePointers(SelectA->getTrueValue(), - SelectB->getTrueValue(), PtrDelta, Depth) && - areConsecutivePointers(SelectA->getFalseValue(), - SelectB->getFalseValue(), PtrDelta, Depth); + if (SelectA->getCondition() != SelectB->getCondition()) + return std::nullopt; + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", ContextInst=" + << *ContextInst << ", Depth=" << Depth << "\n"); + std::optional<APInt> TrueDiff = getConstantOffset( + SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth); + if (!TrueDiff.has_value()) + return std::nullopt; + std::optional<APInt> FalseDiff = + getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(), + ContextInst, Depth); + if (TrueDiff == FalseDiff) + return TrueDiff; } } - return false; + return std::nullopt; } -void Vectorizer::reorder(Instruction *I) { - SmallPtrSet<Instruction *, 16> InstructionsToMove; - SmallVector<Instruction *, 16> Worklist; - - Worklist.push_back(I); - while (!Worklist.empty()) { - Instruction *IW = Worklist.pop_back_val(); - int NumOperands = IW->getNumOperands(); - for (int i = 0; i < NumOperands; i++) { - Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); - if (!IM || IM->getOpcode() == Instruction::PHI) - continue; - - // If IM is in another BB, no need to move it, because this pass only - // vectorizes instructions within one BB. - if (IM->getParent() != I->getParent()) - continue; - - if (!IM->comesBefore(I)) { - InstructionsToMove.insert(IM); - Worklist.push_back(IM); - } +EquivalenceClassMap +Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + EquivalenceClassMap Ret; + + auto getUnderlyingObject = [](const Value *Ptr) -> const Value * { + const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); + if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { + // The select's themselves are distinct instructions even if they share + // the same condition and evaluate to consecutive pointers for true and + // false values of the condition. 
Therefore using the select's themselves + // for grouping instructions would put consecutive accesses into different + // lists and they won't be even checked for being consecutive, and won't + // be vectorized. + return Sel->getCondition(); } - } + return ObjPtr; + }; - // All instructions to move should follow I. Start from I, not from begin(). - for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; - ++BBI) { - if (!InstructionsToMove.count(&*BBI)) + for (Instruction &I : make_range(Begin, End)) { + auto *LI = dyn_cast<LoadInst>(&I); + auto *SI = dyn_cast<StoreInst>(&I); + if (!LI && !SI) continue; - Instruction *IM = &*BBI; - --BBI; - IM->removeFromParent(); - IM->insertBefore(I); - } -} - -std::pair<BasicBlock::iterator, BasicBlock::iterator> -Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { - Instruction *C0 = Chain[0]; - BasicBlock::iterator FirstInstr = C0->getIterator(); - BasicBlock::iterator LastInstr = C0->getIterator(); - BasicBlock *BB = C0->getParent(); - unsigned NumFound = 0; - for (Instruction &I : *BB) { - if (!is_contained(Chain, &I)) + if ((LI && !LI->isSimple()) || (SI && !SI->isSimple())) continue; - ++NumFound; - if (NumFound == 1) { - FirstInstr = I.getIterator(); - } - if (NumFound == Chain.size()) { - LastInstr = I.getIterator(); - break; - } - } - - // Range is [first, last). - return std::make_pair(FirstInstr, ++LastInstr); -} - -void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { - SmallVector<Instruction *, 16> Instrs; - for (Instruction *I : Chain) { - Value *PtrOperand = getLoadStorePointerOperand(I); - assert(PtrOperand && "Instruction must have a pointer operand."); - Instrs.push_back(I); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) - Instrs.push_back(GEP); - } - - // Erase instructions. - for (Instruction *I : Instrs) - if (I->use_empty()) - I->eraseFromParent(); -} - -std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> -Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, - unsigned ElementSizeBits) { - unsigned ElementSizeBytes = ElementSizeBits / 8; - unsigned SizeBytes = ElementSizeBytes * Chain.size(); - unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; - if (NumLeft == Chain.size()) { - if ((NumLeft & 1) == 0) - NumLeft /= 2; // Split even in half - else - --NumLeft; // Split off last element - } else if (NumLeft == 0) - NumLeft = 1; - return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); -} - -ArrayRef<Instruction *> -Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { - // These are in BB order, unlike Chain, which is in address order. 
- SmallVector<Instruction *, 16> MemoryInstrs; - SmallVector<Instruction *, 16> ChainInstrs; - - bool IsLoadChain = isa<LoadInst>(Chain[0]); - LLVM_DEBUG({ - for (Instruction *I : Chain) { - if (IsLoadChain) - assert(isa<LoadInst>(I) && - "All elements of Chain must be loads, or all must be stores."); - else - assert(isa<StoreInst>(I) && - "All elements of Chain must be loads, or all must be stores."); - } - }); - - for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { - if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) { - ChainInstrs.push_back(&I); + if ((LI && !TTI.isLegalToVectorizeLoad(LI)) || + (SI && !TTI.isLegalToVectorizeStore(SI))) continue; - } - if (!isGuaranteedToTransferExecutionToSuccessor(&I)) { - LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: " - << I << '\n'); - break; - } - if (I.mayReadOrWriteMemory()) - MemoryInstrs.push_back(&I); - } - - // Loop until we find an instruction in ChainInstrs that we can't vectorize. - unsigned ChainInstrIdx = 0; - Instruction *BarrierMemoryInstr = nullptr; - - for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { - Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; - - // If a barrier memory instruction was found, chain instructions that follow - // will not be added to the valid prefix. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr)) - break; - // Check (in BB order) if any instruction prevents ChainInstr from being - // vectorized. Find and store the first such "conflicting" instruction. - for (Instruction *MemInstr : MemoryInstrs) { - // If a barrier memory instruction was found, do not check past it. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr)) - break; - - auto *MemLoad = dyn_cast<LoadInst>(MemInstr); - auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); - if (MemLoad && ChainLoad) - continue; - - // We can ignore the alias if the we have a load store pair and the load - // is known to be invariant. The load cannot be clobbered by the store. - auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->hasMetadata(LLVMContext::MD_invariant_load); - }; - - if (IsLoadChain) { - // We can ignore the alias as long as the load comes before the store, - // because that means we won't be moving the load past the store to - // vectorize it (the vectorized load is inserted at the location of the - // first load in the chain). - if (ChainInstr->comesBefore(MemInstr) || - (ChainLoad && IsInvariantLoad(ChainLoad))) - continue; - } else { - // Same case, but in reverse. - if (MemInstr->comesBefore(ChainInstr) || - (MemLoad && IsInvariantLoad(MemLoad))) - continue; - } - - ModRefInfo MR = - AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr)); - if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) { - LLVM_DEBUG({ - dbgs() << "LSV: Found alias:\n" - " Aliasing instruction:\n" - << " " << *MemInstr << '\n' - << " Aliased instruction and pointer:\n" - << " " << *ChainInstr << '\n' - << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; - }); - // Save this aliasing memory instruction as a barrier, but allow other - // instructions that precede the barrier to be vectorized with this one. - BarrierMemoryInstr = MemInstr; - break; - } - } - // Continue the search only for store chains, since vectorizing stores that - // precede an aliasing load is valid. Conversely, vectorizing loads is valid - // up to an aliasing store, but should not pull loads from further down in - // the basic block. 
- if (IsLoadChain && BarrierMemoryInstr) { - // The BarrierMemoryInstr is a store that precedes ChainInstr. - assert(BarrierMemoryInstr->comesBefore(ChainInstr)); - break; - } - } - - // Find the largest prefix of Chain whose elements are all in - // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of - // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB - // order.) - SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( - ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); - unsigned ChainIdx = 0; - for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { - if (!VectorizableChainInstrs.count(Chain[ChainIdx])) - break; - } - return Chain.slice(0, ChainIdx); -} - -static ChainID getChainID(const Value *Ptr) { - const Value *ObjPtr = getUnderlyingObject(Ptr); - if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { - // The select's themselves are distinct instructions even if they share the - // same condition and evaluate to consecutive pointers for true and false - // values of the condition. Therefore using the select's themselves for - // grouping instructions would put consecutive accesses into different lists - // and they won't be even checked for being consecutive, and won't be - // vectorized. - return Sel->getCondition(); - } - return ObjPtr; -} - -std::pair<InstrListMap, InstrListMap> -Vectorizer::collectInstructions(BasicBlock *BB) { - InstrListMap LoadRefs; - InstrListMap StoreRefs; - - for (Instruction &I : *BB) { - if (!I.mayReadOrWriteMemory()) + Type *Ty = getLoadStoreType(&I); + if (!VectorType::isValidElementType(Ty->getScalarType())) continue; - if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (!LI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeLoad(LI)) - continue; - - Type *Ty = LI->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. - unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - Value *Ptr = LI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast<VectorType>(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save the load locations. - const ChainID ID = getChainID(Ptr); - LoadRefs[ID].push_back(LI); - } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { - if (!SI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeStore(SI)) - continue; - - Type *Ty = SI->getValueOperand()->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. 
i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. - unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - Value *Ptr = SI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast<VectorType>(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save store location. - const ChainID ID = getChainID(Ptr); - StoreRefs[ID].push_back(SI); - } - } - - return {LoadRefs, StoreRefs}; -} - -bool Vectorizer::vectorizeChains(InstrListMap &Map) { - bool Changed = false; - - for (const std::pair<ChainID, InstrList> &Chain : Map) { - unsigned Size = Chain.second.size(); - if (Size < 2) + // Skip weird non-byte sizes. They probably aren't worth the effort of + // handling correctly. + unsigned TySize = DL.getTypeSizeInBits(Ty); + if ((TySize % 8) != 0) continue; - LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); - - // Process the stores in chunks of 64. - for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { - unsigned Len = std::min<unsigned>(CE - CI, 64); - ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); - Changed |= vectorizeInstructions(Chunk); - } - } - - return Changed; -} - -bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { - LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() - << " instructions.\n"); - SmallVector<int, 16> Heads, Tails; - int ConsecutiveChain[64]; - - // Do a quadratic search on all of the given loads/stores and find all of the - // pairs of loads/stores that follow each other. - for (int i = 0, e = Instrs.size(); i < e; ++i) { - ConsecutiveChain[i] = -1; - for (int j = e - 1; j >= 0; --j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Instrs[i], Instrs[j])) { - if (ConsecutiveChain[i] != -1) { - int CurDistance = std::abs(ConsecutiveChain[i] - i); - int NewDistance = std::abs(ConsecutiveChain[i] - j); - if (j < i || NewDistance > CurDistance) - continue; // Should not insert. - } + // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain + // functions are currently using an integer type for the vectorized + // load/store, and does not support casting between the integer type and a + // vector of pointers (e.g. i64 to <2 x i16*>) + if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) + continue; - Tails.push_back(j); - Heads.push_back(i); - ConsecutiveChain[i] = j; - } - } - } + Value *Ptr = getLoadStorePointerOperand(&I); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - bool Changed = false; - SmallPtrSet<Instruction *, 16> InstructionsProcessed; + unsigned VF = VecRegSize / TySize; + VectorType *VecTy = dyn_cast<VectorType>(Ty); - for (int Head : Heads) { - if (InstructionsProcessed.count(Instrs[Head])) + // Only handle power-of-two sized elements. 
+ if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) || + (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType())))) continue; - bool LongerChainExists = false; - for (unsigned TIt = 0; TIt < Tails.size(); TIt++) - if (Head == Tails[TIt] && - !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - LongerChainExists = true; - break; - } - if (LongerChainExists) - continue; - - // We found an instr that starts a chain. Now follow the chain and try to - // vectorize it. - SmallVector<Instruction *, 16> Operands; - int I = Head; - while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { - if (InstructionsProcessed.count(Instrs[I])) - break; - - Operands.push_back(Instrs[I]); - I = ConsecutiveChain[I]; - } - bool Vectorized = false; - if (isa<LoadInst>(*Operands.begin())) - Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); - else - Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); + // No point in looking at these if they're too big to vectorize. + if (TySize > VecRegSize / 2 || + (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) + continue; - Changed |= Vectorized; + Ret[{getUnderlyingObject(Ptr), AS, + DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), + /*IsLoad=*/LI != nullptr}] + .push_back(&I); } - return Changed; + return Ret; } -bool Vectorizer::vectorizeStoreChain( - ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { - StoreInst *S0 = cast<StoreInst>(Chain[0]); - - // If the vector has an int element, default to int for the whole store. - Type *StoreTy = nullptr; - for (Instruction *I : Chain) { - StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); - if (StoreTy->isIntOrIntVectorTy()) - break; - - if (StoreTy->isPtrOrPtrVectorTy()) { - StoreTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(StoreTy)); - break; - } - } - assert(StoreTy && "Failed to find store type"); +std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) { + if (Instrs.empty()) + return {}; - unsigned Sz = DL.getTypeSizeInBits(StoreTy); - unsigned AS = S0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = S0->getAlign(); + unsigned AS = getLoadStoreAddressSpace(Instrs[0]); + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; +#ifndef NDEBUG + // Check that Instrs is in BB order and all have the same addr space. + for (size_t I = 1; I < Instrs.size(); ++I) { + assert(Instrs[I - 1]->comesBefore(Instrs[I])); + assert(getLoadStoreAddressSpace(Instrs[I]) == AS); } +#endif - ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } - - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. 
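To make the keyed insertion above concrete: two accesses land in the same equivalence class only when they agree on underlying object, address space, scalar element width, and load-vs-store. A minimal standalone predicate capturing that key (the helper name and free-function form are assumptions for illustration, not part of the patch):

// Returns true if A and B would be grouped into the same equivalence class
// by the map insertion shown above (illustrative sketch only).
static bool inSameEqClass(Instruction *A, Instruction *B,
                          const DataLayout &DL) {
  Value *PA = getLoadStorePointerOperand(A);
  Value *PB = getLoadStorePointerOperand(B);
  return getUnderlyingObject(PA) == getUnderlyingObject(PB) &&
         PA->getType()->getPointerAddressSpace() ==
             PB->getType()->getPointerAddressSpace() &&
         DL.getTypeSizeInBits(getLoadStoreType(A)->getScalarType()) ==
             DL.getTypeSizeInBits(getLoadStoreType(B)->getScalarType()) &&
         isa<LoadInst>(A) == isa<LoadInst>(B);
}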
- unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - - FixedVectorType *VecTy; - auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); - if (VecStoreTy) - VecTy = FixedVectorType::get(StoreTy->getScalarType(), - Chain.size() * VecStoreTy->getNumElements()); - else - VecTy = FixedVectorType::get(StoreTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; - } - - LLVM_DEBUG({ - dbgs() << "LSV: Stores to vectorize:\n"; - for (Instruction *I : Chain) - dbgs() << " " << *I << "\n"; - }); - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the store is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; + // Machinery to build an MRU-hashtable of Chains. + // + // (Ideally this could be done with MapVector, but as currently implemented, + // moving an element to the front of a MapVector is O(n).) 
+ struct InstrListElem : ilist_node<InstrListElem>, + std::pair<Instruction *, Chain> { + explicit InstrListElem(Instruction *I) + : std::pair<Instruction *, Chain>(I, {}) {} + }; + struct InstrListElemDenseMapInfo { + using PtrInfo = DenseMapInfo<InstrListElem *>; + using IInfo = DenseMapInfo<Instruction *>; + static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); } + static InstrListElem *getTombstoneKey() { + return PtrInfo::getTombstoneKey(); } - - Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, S0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; - } - - if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; - } - - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*Last); - - Value *Vec = PoisonValue::get(VecTy); - - if (VecStoreTy) { - unsigned VecWidth = VecStoreTy->getNumElements(); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast<StoreInst>(Chain[I]); - for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { - unsigned NewIdx = J + I * VecWidth; - Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), - Builder.getInt32(J)); - if (Extract->getType() != StoreTy->getScalarType()) - Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); - Vec = Insert; - } + static unsigned getHashValue(const InstrListElem *E) { + return IInfo::getHashValue(E->first); } - } else { - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast<StoreInst>(Chain[I]); - Value *Extract = Store->getValueOperand(); - if (Extract->getType() != StoreTy->getScalarType()) - Extract = - Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); - Vec = Insert; + static bool isEqual(const InstrListElem *A, const InstrListElem *B) { + if (A == getEmptyKey() || B == getEmptyKey()) + return A == getEmptyKey() && B == getEmptyKey(); + if (A == getTombstoneKey() || B == getTombstoneKey()) + return A == getTombstoneKey() && B == getTombstoneKey(); + return IInfo::isEqual(A->first, B->first); } - } - - StoreInst *SI = Builder.CreateAlignedStore( - Vec, - Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), - Alignment); - propagateMetadata(SI, Chain); - - eraseInstructions(Chain); - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::vectorizeLoadChain( - ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { - LoadInst *L0 = cast<LoadInst>(Chain[0]); - - // If the vector has an int element, default to int for the whole load. 
- Type *LoadTy = nullptr; - for (const auto &V : Chain) { - LoadTy = cast<LoadInst>(V)->getType(); - if (LoadTy->isIntOrIntVectorTy()) - break; - - if (LoadTy->isPtrOrPtrVectorTy()) { - LoadTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(LoadTy)); - break; + }; + SpecificBumpPtrAllocator<InstrListElem> Allocator; + simple_ilist<InstrListElem> MRU; + DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains; + + // Compare each instruction in `instrs` to leader of the N most recently-used + // chains. This limits the O(n^2) behavior of this pass while also allowing + // us to build arbitrarily long chains. + for (Instruction *I : Instrs) { + constexpr int MaxChainsToTry = 64; + + bool MatchFound = false; + auto ChainIter = MRU.begin(); + for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end(); + ++J, ++ChainIter) { + std::optional<APInt> Offset = getConstantOffset( + getLoadStorePointerOperand(ChainIter->first), + getLoadStorePointerOperand(I), + /*ContextInst=*/ + (ChainIter->first->comesBefore(I) ? I : ChainIter->first)); + if (Offset.has_value()) { + // `Offset` might not have the expected number of bits, if e.g. AS has a + // different number of bits than opaque pointers. + ChainIter->second.push_back(ChainElem{I, Offset.value()}); + // Move ChainIter to the front of the MRU list. + MRU.remove(*ChainIter); + MRU.push_front(*ChainIter); + MatchFound = true; + break; + } } - } - assert(LoadTy && "Can't determine LoadInst type from chain"); - - unsigned Sz = DL.getTypeSizeInBits(LoadTy); - unsigned AS = L0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = L0->getAlign(); - - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - - ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. - unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - VectorType *VecTy; - auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); - if (VecLoadTy) - VecTy = FixedVectorType::get(LoadTy->getScalarType(), - Chain.size() * VecLoadTy->getNumElements()); - else - VecTy = FixedVectorType::get(LoadTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; - } - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. 
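The move-to-front discipline in the gatherChains() probing loop above is what bounds the quadratic behavior: a new instruction is only compared against the leaders of the MaxChainsToTry most recently used chains, and a hit promotes that chain to the head. A minimal sketch of the same discipline using std::list (illustration only; the patch itself uses an intrusive simple_ilist of bump-allocated nodes plus a DenseSet):

#include <list>
#include <utility>

// Generic move-to-front probe, equivalent in spirit to the loop above
// (hypothetical helper). At most MaxToTry chain leaders are examined; a hit
// is spliced to the head in O(1).
template <typename Key, typename Chain, typename MatchFn>
Chain *probeMRU(std::list<std::pair<Key, Chain>> &MRU, unsigned MaxToTry,
                MatchFn Matches) {
  unsigned Tried = 0;
  for (auto It = MRU.begin(); It != MRU.end() && Tried < MaxToTry;
       ++It, ++Tried) {
    if (Matches(It->first)) {
      MRU.splice(MRU.begin(), MRU, It); // Move the matching chain to front.
      return &MRU.front().second;
    }
  }
  return nullptr; // No match: the caller starts a new chain at the front.
}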
- InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the load is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; + if (!MatchFound) { + APInt ZeroOffset(ASPtrBits, 0); + InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I); + E->second.push_back(ChainElem{I, ZeroOffset}); + MRU.push_front(*E); + Chains.insert(E); } - - Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, L0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; } - if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; - } + std::vector<Chain> Ret; + Ret.reserve(Chains.size()); + // Iterate over MRU rather than Chains so the order is deterministic. + for (auto &E : MRU) + if (E.second.size() > 1) + Ret.push_back(std::move(E.second)); + return Ret; +} - LLVM_DEBUG({ - dbgs() << "LSV: Loads to vectorize:\n"; - for (Instruction *I : Chain) - I->dump(); - }); +std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst + << ", Depth=" << Depth << "\n"); + // We'll ultimately return a value of this bit width, even if computations + // happen in a different width. + unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); + APInt OffsetA(OrigBitWidth, 0); + APInt OffsetB(OrigBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return std::nullopt; - // getVectorizablePrefix already computed getBoundaryInstrs. The value of - // Last may have changed since then, but the value of First won't have. If it - // matters, we could compute getBoundaryInstrs only once and reuse it here. - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*First); - - Value *Bitcast = - Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); - LoadInst *LI = - Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); - propagateMetadata(LI, Chain); - - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *CV = Chain[I]; - Value *V; - if (VecLoadTy) { - // Extract a subvector using shufflevector. 
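The fast path of the new getConstantOffset() shown above relies on stripAndAccumulateInBoundsConstantOffsets reducing both pointers to a common base; SCEV and the recursive complex-address handling only run when that fails. A minimal sketch of just that fast path (standalone helper with an assumed name, for illustration only):

// Byte distance from PtrA to PtrB when both strip down to the same base
// through constant in-bounds GEP offsets; std::nullopt otherwise. The real
// getConstantOffset() additionally falls back to SCEV for differing bases.
static std::optional<APInt> constantOffsetFastPath(const DataLayout &DL,
                                                   Value *PtrA, Value *PtrB) {
  unsigned IdxWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffA(IdxWidth, 0), OffB(IdxWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffB);
  if (PtrA != PtrB)
    return std::nullopt;
  return OffB - OffA; // e.g. gep i32 %p, 3 minus gep i32 %p, 1 gives 8 bytes.
}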
- unsigned VecWidth = VecLoadTy->getNumElements(); - auto Mask = - llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth)); - V = Builder.CreateShuffleVector(LI, Mask, CV->getName()); - } else { - V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); - } + // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets + // should properly handle a possible overflow and the value should fit into + // the smallest data type used in the cast/gep chain. + assert(OffsetA.getSignificantBits() <= NewPtrBitWidth && + OffsetB.getSignificantBits() <= NewPtrBitWidth); - if (V->getType() != CV->getType()) { - V = Builder.CreateBitOrPointerCast(V, CV->getType()); + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + if (PtrA == PtrB) + return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth); + + // Try to compute B - A. + const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA)); + if (DistScev != SE.getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n"); + ConstantRange DistRange = SE.getSignedRange(DistScev); + if (DistRange.isSingleElement()) { + // Handle index width (the width of Dist) != pointer width (the width of + // the Offset*s at this point). + APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth); + return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth); } - - // Replace the old instruction. - CV->replaceAllUsesWith(V); } - - // Since we might have opaque pointers we might end up using the pointer - // operand of the first load (wrt. memory loaded) for the vector load. Since - // this first load might not be the first in the block we potentially need to - // reorder the pointer operand (and its operands). If we have a bitcast though - // it might be before the load and should be the reorder start instruction. - // "Might" because for opaque pointers the "bitcast" is just the first loads - // pointer operand, as oppposed to something we inserted at the right position - // ourselves. - Instruction *BCInst = dyn_cast<Instruction>(Bitcast); - reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI); - - eraseInstructions(Chain); - - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed) { - RelativeSpeed = 0; - if (Alignment.value() % SzInBytes == 0) - return false; - - bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), - SzInBytes * 8, AddressSpace, - Alignment, &RelativeSpeed); - LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? 
" << Allows - << " with relative speed = " << RelativeSpeed << '\n';); - return !Allows || !RelativeSpeed; + std::optional<APInt> Diff = + getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth); + if (Diff.has_value()) + return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth())) + .sextOrTrunc(OrigBitWidth); + return std::nullopt; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index cd48c0d57eb3..f923f0be6621 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -37,6 +37,11 @@ static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); +static cl::opt<bool> +AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, + cl::desc("Enable recognition of non-constant strided " + "pointer induction variables.")); + namespace llvm { cl::opt<bool> HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, @@ -447,8 +452,12 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { - const ValueToValueMap &Strides = - getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); + // FIXME: Currently, the set of symbolic strides is sometimes queried before + // it's collected. This happens from canVectorizeWithIfConvert, when the + // pointer is checked to reference consecutive elements suitable for a + // masked access. + const auto &Strides = + LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>(); Function *F = TheLoop->getHeader()->getParent(); bool OptForSize = F->hasOptSize() || @@ -462,11 +471,135 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, return 0; } -bool LoopVectorizationLegality::isUniform(Value *V) const { - return LAI->isUniform(V); +bool LoopVectorizationLegality::isInvariant(Value *V) const { + return LAI->isInvariant(V); +} + +namespace { +/// A rewriter to build the SCEVs for each of the VF lanes in the expected +/// vectorized loop, which can then be compared to detect their uniformity. This +/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop) +/// with new AddRecs where the step is multiplied by StepMultiplier and Offset * +/// Step is added. Also checks if all sub-expressions are analyzable w.r.t. +/// uniformity. +class SCEVAddRecForUniformityRewriter + : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> { + /// Multiplier to be applied to the step of AddRecs in TheLoop. + unsigned StepMultiplier; + + /// Offset to be added to the AddRecs in TheLoop. + unsigned Offset; + + /// Loop for which to rewrite AddRecsFor. + Loop *TheLoop; + + /// Is any sub-expressions not analyzable w.r.t. uniformity? + bool CannotAnalyze = false; + + bool canAnalyze() const { return !CannotAnalyze; } + +public: + SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier, + unsigned Offset, Loop *TheLoop) + : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset), + TheLoop(TheLoop) {} + + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + assert(Expr->getLoop() == TheLoop && + "addrec outside of TheLoop must be invariant and should have been " + "handled earlier"); + // Build a new AddRec by multiplying the step by StepMultiplier and + // incrementing the start by Offset * step. 
+ Type *Ty = Expr->getType(); + auto *Step = Expr->getStepRecurrence(SE); + if (!SE.isLoopInvariant(Step, TheLoop)) { + CannotAnalyze = true; + return Expr; + } + auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier)); + auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset)); + auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset); + return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap); + } + + const SCEV *visit(const SCEV *S) { + if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop)) + return S; + return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S); + } + + const SCEV *visitUnknown(const SCEVUnknown *S) { + if (SE.isLoopInvariant(S, TheLoop)) + return S; + // The value could vary across iterations. + CannotAnalyze = true; + return S; + } + + const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) { + // Could not analyze the expression. + CannotAnalyze = true; + return S; + } + + static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE, + unsigned StepMultiplier, unsigned Offset, + Loop *TheLoop) { + /// Bail out if the expression does not contain an UDiv expression. + /// Uniform values which are not loop invariant require operations to strip + /// out the lowest bits. For now just look for UDivs and use it to avoid + /// re-writing UDIV-free expressions for other lanes to limit compile time. + if (!SCEVExprContains(S, + [](const SCEV *S) { return isa<SCEVUDivExpr>(S); })) + return SE.getCouldNotCompute(); + + SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset, + TheLoop); + const SCEV *Result = Rewriter.visit(S); + + if (Rewriter.canAnalyze()) + return Result; + return SE.getCouldNotCompute(); + } +}; + +} // namespace + +bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const { + if (isInvariant(V)) + return true; + if (VF.isScalable()) + return false; + if (VF.isScalar()) + return true; + + // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is + // never considered uniform. + auto *SE = PSE.getSE(); + if (!SE->isSCEVable(V->getType())) + return false; + const SCEV *S = SE->getSCEV(V); + + // Rewrite AddRecs in TheLoop to step by VF and check if the expression for + // lane 0 matches the expressions for all other lanes. + unsigned FixedVF = VF.getKnownMinValue(); + const SCEV *FirstLaneExpr = + SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop); + if (isa<SCEVCouldNotCompute>(FirstLaneExpr)) + return false; + + // Make sure the expressions for lanes FixedVF-1..1 match the expression for + // lane 0. We check lanes in reverse order for compile-time, as frequently + // checking the last lane is sufficient to rule out uniformity. + return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) { + const SCEV *IthLaneExpr = + SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop); + return FirstLaneExpr == IthLaneExpr; + }); } -bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const { +bool LoopVectorizationLegality::isUniformMemOp(Instruction &I, + ElementCount VF) const { Value *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) return false; @@ -474,7 +607,7 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const { // stores from being uniform. The current lowering simply doesn't handle // it; in particular, the cost model distinguishes scatter/gather from // scalar w/predication, and we currently rely on the scalar path. 
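Putting the rewriter and isUniform() above together: for a fixed VF, lane I's expression is obtained by replacing every {Start,+,Step}<TheLoop> in the original SCEV with {Start + I*Step,+,VF*Step}<TheLoop>, and uniformity is proven by pointer-comparing the uniqued per-lane SCEVs against lane 0. A small usage sketch, where SE, Ptr, L and the VF value of 4 are assumed for illustration:

// Because ScalarEvolution uniques expressions, pointer equality of the
// rewritten SCEVs is a conservative proof that every lane computes the same
// value in each vector iteration (the full check compares all lanes).
const SCEV *S = SE.getSCEV(Ptr);
const SCEV *Lane0 = SCEVAddRecForUniformityRewriter::rewrite(S, SE, 4, 0, L);
const SCEV *Lane3 = SCEVAddRecForUniformityRewriter::rewrite(S, SE, 4, 3, L);
bool LanesMatch = !isa<SCEVCouldNotCompute>(Lane0) && Lane0 == Lane3;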
- return isUniform(Ptr) && !blockNeedsPredication(I.getParent()); + return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent()); } bool LoopVectorizationLegality::canVectorizeOuterLoop() { @@ -700,6 +833,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } + // We prevent matching non-constant strided pointer IVS to preserve + // historical vectorizer behavior after a generalization of the + // IVDescriptor code. The intent is to remove this check, but we + // have to fix issues around code quality for such loops first. + auto isDisallowedStridedPointerInduction = + [](const InductionDescriptor &ID) { + if (AllowStridedPointerIVs) + return false; + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + ID.getConstIntStepValue() == nullptr; + }; + // TODO: Instead of recording the AllowedExit, it would be good to // record the complementary set: NotAllowedExit. These include (but may // not be limited to): @@ -715,14 +860,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // By recording these, we can then reason about ways to vectorize each // of these NotAllowedExit. InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && + !isDisallowedStridedPointerInduction(ID)) { addInductionPhi(Phi, ID, AllowedExit); Requirements->addExactFPMathInst(ID.getExactFPMathInst()); continue; } - if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, - SinkAfter, DT)) { + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { AllowedExit.insert(Phi); FixedOrderRecurrences.insert(Phi); continue; @@ -730,7 +875,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // As a last resort, coerce the PHI to a AddRec expression // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && + !isDisallowedStridedPointerInduction(ID)) { addInductionPhi(Phi, ID, AllowedExit); continue; } @@ -894,18 +1040,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } - // For fixed order recurrences, we use the previous value (incoming value from - // the latch) to check if it dominates all users of the recurrence. Bail out - // if we have to sink such an instruction for another recurrence, as the - // dominance requirement may not hold after sinking. - BasicBlock *LoopLatch = TheLoop->getLoopLatch(); - if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { - Instruction *V = - cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch)); - return SinkAfter.find(V) != SinkAfter.end(); - })) - return false; - // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. @@ -1124,6 +1258,16 @@ bool LoopVectorizationLegality::blockCanBePredicated( if (isa<NoAliasScopeDeclInst>(&I)) continue; + // We can allow masked calls if there's at least one vector variant, even + // if we end up scalarizing due to the cost model calculations. + // TODO: Allow other calls if they have appropriate attributes... readonly + // and argmemonly? + if (CallInst *CI = dyn_cast<CallInst>(&I)) + if (VFDatabase::hasMaskedVariant(*CI)) { + MaskedOp.insert(CI); + continue; + } + // Loads are handled via masking (or speculated if safe to do so.) 
if (auto *LI = dyn_cast<LoadInst>(&I)) { if (!SafePtrs.count(LI->getPointerOperand())) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 8990a65afdb4..13357cb06c55 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -25,6 +25,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H #include "VPlan.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/InstructionCost.h" namespace llvm { @@ -217,6 +218,16 @@ struct VectorizationFactor { } }; +/// ElementCountComparator creates a total ordering for ElementCount +/// for the purposes of using it in a set structure. +struct ElementCountComparator { + bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { + return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < + std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); + } +}; +using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; + /// A class that represents two vectorization factors (initialized with 0 by /// default). One for fixed-width vectorization and one for scalable /// vectorization. This can be used by the vectorizer to choose from a range of @@ -261,7 +272,7 @@ class LoopVectorizationPlanner { const TargetLibraryInfo *TLI; /// Target Transform Info. - const TargetTransformInfo *TTI; + const TargetTransformInfo &TTI; /// The legality analysis. LoopVectorizationLegality *Legal; @@ -280,12 +291,15 @@ class LoopVectorizationPlanner { SmallVector<VPlanPtr, 4> VPlans; + /// Profitable vector factors. + SmallVector<VectorizationFactor, 8> ProfitableVFs; + /// A builder used to construct the current plan. VPBuilder Builder; public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, + const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, @@ -311,16 +325,22 @@ public: /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue /// vectorization re-using plans for both the main and epilogue vector loops. /// It should be removed once the re-use issue has been fixed. - void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, - InnerLoopVectorizer &LB, DominatorTree *DT, - bool IsEpilogueVectorization); + /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop + /// to re-use expansion results generated during main plan execution. Returns + /// a mapping of SCEVs to their expanded IR values. Note that this is a + /// temporary workaround needed due to the current epilogue handling. + DenseMap<const SCEV *, Value *> + executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, + InnerLoopVectorizer &LB, DominatorTree *DT, + bool IsEpilogueVectorization, + DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); #endif - /// Look through the existing plans and return true if we have one with all - /// the vectorization factors in question. + /// Look through the existing plans and return true if we have one with + /// vectorization factor \p VF. 
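The comparator moved into the planner header above defines a strict weak ordering over ElementCount so candidate VFs can be kept in a set: fixed factors order before scalable ones, with the known minimum lane count breaking ties. A brief illustration with assumed values, not taken from the patch:

ElementCountComparator Less;
assert(Less(ElementCount::getFixed(8), ElementCount::getScalable(2)) &&
       "every fixed VF orders before every scalable VF");
assert(Less(ElementCount::getScalable(2), ElementCount::getScalable(4)) &&
       "within a kind, smaller minimum lane counts come first");
assert(!Less(ElementCount::getFixed(4), ElementCount::getFixed(4)) &&
       "irreflexive, as required for a set comparator");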
bool hasPlanWithVF(ElementCount VF) const { return any_of(VPlans, [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); @@ -333,8 +353,11 @@ public: getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, VFRange &Range); - /// Check if the number of runtime checks exceeds the threshold. - bool requiresTooManyRuntimeChecks() const; + /// \return The most profitable vectorization factor and the cost of that VF + /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if + /// epilogue vectorization is not supported for the loop. + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC); protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, @@ -350,9 +373,12 @@ private: /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. - VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, - const MapVector<Instruction *, Instruction *> &SinkAfter); + /// \p Range's largest included VF is restricted to the maximum VF the + /// returned VPlan is valid for. If no VPlan can be built for the input range, + /// set the largest included VF to the maximum VF for which no plan could be + /// built. + std::optional<VPlanPtr> tryToBuildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -367,6 +393,20 @@ private: void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in \p CandidateVFs. + VectorizationFactor + selectVectorizationFactor(const ElementCountSet &CandidateVFs); + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Determines if we have the infrastructure to vectorize the loop and its + /// epilogue, assuming the main loop is vectorized by \p VF. 
+ bool isCandidateForEpilogueVectorization(const ElementCount VF) const; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a28099d8ba7d..d7e40e8ef978 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -98,6 +98,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" @@ -120,8 +121,6 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -231,6 +230,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( + "force-tail-folding-style", cl::desc("Force the tail folding style"), + cl::init(TailFoldingStyle::None), + cl::values( + clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), + clEnumValN( + TailFoldingStyle::Data, "data", + "Create lane mask for data only, using active.lane.mask intrinsic"), + clEnumValN(TailFoldingStyle::DataWithoutLaneMask, + "data-without-lane-mask", + "Create lane mask with compare/stepvector"), + clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", + "Create lane mask using active.lane.mask intrinsic, and use " + "it for both data and control flow"), + clEnumValN( + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, + "data-and-control-without-rt-check", + "Similar to data-and-control, but remove the runtime check"))); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -338,10 +356,12 @@ static cl::opt<bool> PreferPredicatedReductionSelect( cl::desc( "Prefer predicating a reduction operation over an after loop select.")); +namespace llvm { cl::opt<bool> EnableVPlanNativePath( - "enable-vplan-native-path", cl::init(false), cl::Hidden, + "enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); +} // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with @@ -419,9 +439,42 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, return std::nullopt; } +/// Return a vector containing interleaved elements from multiple +/// smaller input vectors. +static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, + const Twine &Name) { + unsigned Factor = Vals.size(); + assert(Factor > 1 && "Tried to interleave invalid number of vectors"); + + VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); +#ifndef NDEBUG + for (Value *Val : Vals) + assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); +#endif + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), so + // must use intrinsics to interleave. 
+ if (VecTy->isScalableTy()) { + VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); + return Builder.CreateIntrinsic( + WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, + /*FMFSource=*/nullptr, Name); + } + + // Fixed length. Start by concatenating all vectors into a wide vector. + Value *WideVec = concatenateVectors(Builder, Vals); + + // Interleave the elements into the wide vector. + const unsigned NumElts = VecTy->getElementCount().getFixedValue(); + return Builder.CreateShuffleVector( + WideVec, createInterleaveMask(NumElts, Factor), Name); +} + namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; + +using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; } // namespace namespace llvm { @@ -477,8 +530,10 @@ public: /// loop and the start value for the canonical induction, if it is != 0. The /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more - /// complex control flow around the loops. - virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); + /// complex control flow around the loops. \p ExpandedSCEVs is used to + /// look up SCEV expansions for expressions needed during skeleton creation. + virtual std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -498,7 +553,7 @@ public: /// Instr's operands. void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, - const VPIteration &Instance, bool IfPredicateInstr, + const VPIteration &Instance, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -513,7 +568,7 @@ public: ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask = nullptr); + VPValue *BlockInMask, bool NeedsMaskForGaps); /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); @@ -522,28 +577,30 @@ public: /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - virtual Value *getBroadcastInstrs(Value *V); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop - /// left off. In cases where the loop skeleton is more complicated (eg. - /// epilogue vectorization) and the resume values can come from an additional - /// bypass block, the \p AdditionalBypass pair provides information about the - /// bypass block and the end value on the edge from bypass to this loop. + /// left off. \p Step is the SCEV-expanded induction step to use. 
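As an aside on the fixed-width branch of interleaveVectors() above: the whole transformation reduces to one shufflevector over the concatenation of the inputs. A worked instance with assumed sizes (two 4-element sources, Factor = 2):

// createInterleaveMask(4, 2) yields <0, 4, 1, 5, 2, 6, 3, 7>, so the single
// shuffle over the 8-element concatenation produces A0,B0,A1,B1,A2,B2,A3,B3.
auto Mask = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
assert(ArrayRef<int>(Mask).equals({0, 4, 1, 5, 2, 6, 3, 7}));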
In cases + /// where the loop skeleton is more complicated (i.e., epilogue vectorization) + /// and the resume values can come from an additional bypass block, the \p + /// AdditionalBypass pair provides information about the bypass block and the + /// end value on the edge from bypass to this loop. PHINode *createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &ID, + PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + /// Returns the original loop trip count. + Value *getTripCount() const { return TripCount; } + + /// Used to set the trip count after ILV's construction and after the + /// preheader block has been executed. Note that this always holds the trip + /// count of the original loop for both main loop and epilogue vectorization. + void setTripCount(Value *TC) { TripCount = TC; } + protected: friend class LoopVectorizationPlanner; @@ -560,7 +617,7 @@ protected: void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, - VPlan &Plan); + VPlan &Plan, VPTransformState &State); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -573,10 +630,6 @@ protected: /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); - /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -585,9 +638,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(BasicBlock *InsertBlock); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -621,6 +671,7 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -758,9 +809,6 @@ public: ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} - -private: - Value *getBroadcastInstrs(Value *V) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -810,15 +858,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. - std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) final { + return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. 
virtual std::pair<BasicBlock *, Value *> - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -846,7 +895,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -876,7 +926,8 @@ public: } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -953,35 +1004,21 @@ namespace llvm { Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); - Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; + return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); } /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { - Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(EC) : EC; + return B.CreateElementCount(Ty, VF); } -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) { +const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, + Loop *OrigLoop) { const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) > - IdxTy->getPrimitiveSizeInBits()) - BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy); - BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); - - // Get the total trip count from the count by adding 1. - return SE.getAddExpr(BackedgeTakenCount, - SE.getOne(BackedgeTakenCount->getType())); + return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, @@ -1062,11 +1099,17 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( continue; // This recipe contributes to the address computation of a widen - // load/store. Collect recipe if its underlying instruction has - // poison-generating flags. - Instruction *Instr = CurRec->getUnderlyingInstr(); - if (Instr && Instr->hasPoisonGeneratingFlags()) - State.MayGeneratePoisonRecipes.insert(CurRec); + // load/store. 
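The simplification of createStepForVF() and getRuntimeVF() above leans on IRBuilderBase::CreateElementCount, which folds the fixed case to a plain constant and emits the vscale multiply for scalable counts. A usage sketch, where the builder, insertion point and i64 type are assumptions for illustration:

IRBuilder<> B(InsertPt);          // InsertPt: some existing instruction.
Type *I64Ty = B.getInt64Ty();
// Fixed VF: 4 lanes * step 2 folds to the constant i64 8.
Value *FixedStep = createStepForVF(B, I64Ty, ElementCount::getFixed(4), 2);
// Scalable VF: 4 lanes * step 2 becomes (vscale * 8) at runtime.
Value *ScalStep = createStepForVF(B, I64Ty, ElementCount::getScalable(4), 2);
// getRuntimeVF is the Step == 1 special case of the same helper.
Value *RuntimeVF = getRuntimeVF(B, I64Ty, ElementCount::getScalable(4));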
If the underlying instruction has poison-generating flags, + // drop them directly. + if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { + RecWithFlags->dropPoisonGeneratingFlags(); + } else { + Instruction *Instr = CurRec->getUnderlyingInstr(); + (void)Instr; + assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && + "found instruction with poison generating flags not covered by " + "VPRecipeWithIRFlags"); + } // Add new definitions to the worklist. for (VPValue *operand : CurRec->operands()) @@ -1143,15 +1186,7 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); - } -}; -using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; +using InstructionVFPair = std::pair<Instruction *, ElementCount>; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1184,17 +1219,6 @@ public: /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1278,11 +1302,17 @@ public: auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); - return Scalars->second.find(I) != Scalars->second.end(); + return Scalars->second.contains(I); } /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + // Pseudo probe needs to be duplicated for each unrolled iteration and + // vector lane so that profiled loop trip count can be accurately + // accumulated instead of being under counted. + if (isa<PseudoProbeInst>(I)) + return false; + if (VF.isScalar()) return true; @@ -1316,7 +1346,7 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1379,7 +1409,7 @@ public: InstructionCost getWideningCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); - assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && + assert(WideningDecisions.contains(InstOnVF) && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; } @@ -1419,7 +1449,7 @@ public: /// that may be vectorized as interleave, gather-scatter or scalarized. 
void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1442,8 +1472,7 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. - bool isLegalGatherOrScatter(Value *V, - ElementCount VF = ElementCount::getFixed(1)) { + bool isLegalGatherOrScatter(Value *V, ElementCount VF) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) @@ -1522,14 +1551,29 @@ public: /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. - bool requiresScalarEpilogue(ElementCount VF) const { + bool requiresScalarEpilogue(bool IsVectorizing) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; - return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); + return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); + } + + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop for all VFs in \p Range. + /// A scalar epilogue must either be required for all VFs in \p Range or for + /// none. + bool requiresScalarEpilogue(VFRange Range) const { + auto RequiresScalarEpilogue = [this](ElementCount VF) { + return requiresScalarEpilogue(VF.isVector()); + }; + bool IsRequired = all_of(Range, RequiresScalarEpilogue); + assert( + (IsRequired || none_of(Range, RequiresScalarEpilogue)) && + "all VFs in range must agree on whether a scalar epilogue is required"); + return IsRequired; } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1538,14 +1582,21 @@ public: return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } - /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns the TailFoldingStyle that is best for the current loop. + TailFoldingStyle + getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { + if (!CanFoldTailByMasking) + return TailFoldingStyle::None; + + if (ForceTailFoldingStyle.getNumOccurrences()) + return ForceTailFoldingStyle; + + return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); + } - /// Returns true if were tail-folding and want to use the active lane mask - /// for vector loop control flow. - bool useActiveLaneMaskForControlFlow() const { - return FoldTailByMasking && - TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + /// Returns true if all loop blocks should be masked to fold tail loop. + bool foldTailByMasking() const { + return getTailFoldingStyle() != TailFoldingStyle::None; } /// Returns true if the instructions in this block requires predication @@ -1582,12 +1633,8 @@ public: /// scalarized - /// i.e. either vector version isn't available, or is too expensive. InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const; - - /// Returns true if the per-lane cost of VectorizationFactor A is lower than - /// that of B. - bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; + Function **Variant, + bool *NeedsMask = nullptr) const; /// Invalidates decisions already taken by the cost model. 
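With foldTailByMasking() reduced to a TailFoldingStyle::None check, callers that previously asked a boolean for active-lane-mask control flow now branch on the style itself. A hedged sketch of such a caller, using only enumerators that appear in this patch (CM stands for an assumed LoopVectorizationCostModel reference):

  switch (CM.getTailFoldingStyle(/*IVUpdateMayOverflow=*/true)) {
  case TailFoldingStyle::None:
    // No masking: keep the scalar epilogue and the minimum-iterations check.
    break;
  case TailFoldingStyle::Data:
    // Mask only the memory operations; the canonical IV still controls exit.
    break;
  case TailFoldingStyle::DataAndControlFlow:
  case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
    // An active.lane.mask also drives the exit condition; the second variant
    // additionally omits the IV overflow runtime check.
    break;
  default:
    break; // other styles not relevant to this sketch
  }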
void invalidateCostModelingDecisions() { @@ -1596,10 +1643,29 @@ public: Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional<unsigned> getVScaleForTuning() const; + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair<InstructionCost, bool>; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); + + bool hasPredStores() const { return NumPredStores > 0; } + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; private: unsigned NumPredStores = 0; @@ -1626,24 +1692,6 @@ private: /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - using InstructionVFPair = std::pair<Instruction *, ElementCount>; - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1715,7 +1763,7 @@ private: ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -1796,8 +1844,7 @@ private: // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. 
- return Scalars.find(VF) == Scalars.end() || - !isScalarAfterVectorization(I, VF); + return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); }; /// Returns a range containing only operands needing to be extracted. @@ -1807,16 +1854,6 @@ private: Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1862,9 +1899,6 @@ public: /// All element types found in the loop. SmallPtrSet<Type *, 16> ElementTypesInLoop; - - /// Profitable vector factors. - SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -2135,6 +2169,17 @@ public: }; } // namespace +static bool useActiveLaneMask(TailFoldingStyle Style) { + return Style == TailFoldingStyle::Data || + Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + +static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { + return Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2202,97 +2247,11 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, collectSupportedLoops(*InnerL, LI, ORE, V); } -namespace { - -/// The LoopVectorize Pass. -struct LoopVectorize : public FunctionPass { - /// Pass identification, replacement for typeid - static char ID; - - LoopVectorizePass Impl; - - explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, - bool VectorizeOnlyWhenForced = false) - : FunctionPass(ID), - Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { - initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; - auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs(); - auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); - auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - - return Impl - .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) - .MadeAnyChange; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<BlockFrequencyInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<LoopAccessLegacyAnalysis>(); - AU.addRequired<DemandedBitsWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addRequired<InjectTLIMappingsLegacy>(); - - // We currently do not preserve loopinfo/dominator analyses with outer loop - // vectorization. Until this is addressed, mark these analyses as preserved - // only for non-VPlan-native path. - // TODO: Preserve Loop and Dominator analyses for VPlan-native path. - if (!EnableVPlanNativePath) { - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - } - - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addRequired<ProfileSummaryInfoWrapperPass>(); - } -}; - -} // end anonymous namespace - //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// -Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // We need to place the broadcast of invariant variables outside the loop, - // but only if it's proven safe to do so. Else, broadcast will be inside - // vector loop body. - Instruction *Instr = dyn_cast<Instruction>(V); - bool SafeToHoist = OrigLoop->isLoopInvariant(V) && - (!Instr || - DT->dominates(Instr->getParent(), LoopVectorPreHeader)); - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - - // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - - return Shuf; -} - /// This function adds /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. @@ -2435,21 +2394,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, } } -// Generate code for the induction step. 
Note that induction steps are -// required to be loop-invariant -static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, - Instruction *InsertBefore, - Loop *OrigLoop = nullptr) { - const DataLayout &DL = SE.getDataLayout(); - assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && - "Induction step should be loop invariant"); - if (auto *E = dyn_cast<SCEVUnknown>(Step)) - return E->getValue(); - - SCEVExpander Exp(SE, DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); -} - /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. @@ -2514,9 +2458,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { - assert(isa<Constant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); } case InductionDescriptor::IK_FpInduction: { assert(!isa<VectorType>(Index->getType()) && @@ -2538,6 +2480,50 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, llvm_unreachable("invalid enum"); } +std::optional<unsigned> getMaxVScale(const Function &F, + const TargetTransformInfo &TTI) { + if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) + return MaxVScale; + + if (F.hasFnAttribute(Attribute::VScaleRange)) + return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); + + return std::nullopt; +} + +/// For the given VF and UF and maximum trip count computed for the loop, return +/// whether the induction variable might overflow in the vectorized loop. If not, +/// then we know a runtime overflow check always evaluates to false and can be +/// removed. +static bool isIndvarOverflowCheckKnownFalse( + const LoopVectorizationCostModel *Cost, + ElementCount VF, std::optional<unsigned> UF = std::nullopt) { + // Always be conservative if we don't know the exact unroll factor. + unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); + + Type *IdxTy = Cost->Legal->getWidestInductionType(); + APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); + + // We know the runtime overflow check is known false iff the (max) trip-count + // is known and (max) trip-count + (VF * UF) does not overflow in the type of + // the vector loop induction variable. + if (unsigned TC = + Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + uint64_t MaxVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + std::optional<unsigned> MaxVScale = + getMaxVScale(*Cost->TheFunction, Cost->TTI); + if (!MaxVScale) + return false; + MaxVF *= *MaxVScale; + } + + return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); + } + + return false; +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2591,14 +2577,13 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { void InnerLoopVectorizer::vectorizeInterleaveGroup( const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask) { + VPValue *BlockInMask, bool NeedsMaskForGaps) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. 
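The new isIndvarOverflowCheckKnownFalse helper lets the vectorizer drop the runtime IV-overflow check whenever the known maximum trip count leaves enough headroom in the induction type. A worked example with assumed numbers:

  // Assumed inputs, for illustration only:
  //   widest induction type i32  -> MaxUIntTripCount = 0xFFFFFFFF
  //   constant max trip count TC = 1000
  //   VF = vscale x 4, vscale_range max = 16 -> MaxVF = 4 * 16 = 64
  //   UF = 2                                 -> MaxVF * MaxUF = 128
  // The test is (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); here
  // 0xFFFFFFFF - 1000 is far larger than 128, so TC + VF * UF cannot wrap
  // an i32 IV and the overflow check is statically known to be false.
  static_assert(0xFFFFFFFFu - 1000u > 64u * 2u, "no i32 wrap for this case");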
Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2609,14 +2594,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( assert((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."); + Value *Idx; // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. - if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + if (Group->isReverse()) { + Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); + Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); + Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); + Idx = Builder.CreateNeg(Idx); + } else + Idx = Builder.getInt32(-Index); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); @@ -2637,8 +2629,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); - cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); // Cast to the vector pointer type. unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); @@ -2649,14 +2640,43 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); - Value *MaskForGaps = nullptr; - if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); - assert(MaskForGaps && "Mask for Gaps is required but it is null"); - } + auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( + unsigned Part, Value *MaskForGaps) -> Value * { + if (VF.isScalable()) { + assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + auto *BlockInMaskPart = State.get(BlockInMask, Part); + SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; + auto *MaskTy = + VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); + return Builder.CreateIntrinsic( + MaskTy, Intrinsic::experimental_vector_interleave2, Ops, + /*FMFSource=*/nullptr, "interleaved.mask"); + } + + if (!BlockInMask) + return MaskForGaps; + + Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *ShuffledMask = Builder.CreateShuffleVector( + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); + return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, + MaskForGaps) + : ShuffledMask; + }; // Vectorize the interleaved load group. 
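The CreateGroupMask lambda above produces the per-member mask two ways: a replicated shuffle mask for fixed VFs and llvm.experimental.vector.interleave2 for scalable VFs (factor 2 only). A small sketch of the fixed-VF shape, assuming VF = 4 and an interleave factor of 2:

  #include "llvm/Analysis/VectorUtils.h"

  // createReplicatedMask(2, 4) yields <0,0,1,1,2,2,3,3>: each per-iteration
  // mask bit is duplicated so it guards both members of its group.
  auto Mask = llvm::createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
  // Scalable vectors cannot use such constant shuffle masks, which is why the
  // patch interleaves two copies of the block mask with
  // llvm.experimental.vector.interleave2 instead.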
if (isa<LoadInst>(Instr)) { + Value *MaskForGaps = nullptr; + if (NeedsMaskForGaps) { + MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert(MaskForGaps && "Mask for Gaps is required but it is null"); + } + // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { @@ -2664,18 +2684,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps - ? Builder.CreateBinOp(Instruction::And, ShuffledMask, - MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); @@ -2687,6 +2696,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( NewLoads.push_back(NewLoad); } + if (VecTy->isScalableTy()) { + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + + for (unsigned Part = 0; Part < UF; ++Part) { + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + Value *DI = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + if (!Member) + continue; + + Value *StridedVec = Builder.CreateExtractValue(DI, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); + } + + if (Group->isReverse()) + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec, Part); + ++J; + } + } + + return; + } + // For each member in the group, shuffle out the appropriate data from the // wide loads. unsigned J = 0; @@ -2724,7 +2768,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + Value *MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && "masked interleaved groups are not allowed."); assert((!MaskForGaps || !VF.isScalable()) && @@ -2759,27 +2804,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( StoredVecs.push_back(StoredVec); } - // Concatenate all vectors into a wide vector. - Value *WideVec = concatenateVectors(Builder, StoredVecs); - - // Interleave the elements in the wide vector. - Value *IVec = Builder.CreateShuffleVector( - WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), - "interleaved.vec"); - + // Interleave all the smaller vectors into one wider vector. 
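For scalable vectors the wide load cannot be split with shufflevector masks, so the hunk above calls llvm.experimental.vector.deinterleave2. A lane-layout sketch for a factor-2 group, assuming a <vscale x 8 x i32> wide load:

  // %wide = { a0, b0, a1, b1, a2, b2, ... }              ; interleaved A/B
  // %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
  //     @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
  // extractvalue %strided.vec, 0  ->  { a0, a1, a2, ... }  ; member A
  // extractvalue %strided.vec, 1  ->  { b0, b1, b2, ... }  ; member B
  // Each half is then bitcast if the member's type differs and reversed for
  // reversed groups, mirroring the fixed-VF shuffle path.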
+ Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask || MaskForGaps) { - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, - ShuffledMask, MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], Group->getAlign(), GroupMask); } else @@ -2793,7 +2822,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, - bool IfPredicateInstr, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); @@ -2810,14 +2838,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // If the scalarized instruction contributes to the address computation of a - // widen masked load/store which was in a basic block that needed predication - // and is not predicated after vectorization, we can't propagate - // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized - // instruction could feed a poison value to the base address of the widen - // load/store. - if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) - Cloned->dropPoisonGeneratingFlags(); + RepRecipe->setFlags(Cloned); if (Instr->getDebugLoc()) State.setDebugLocFromInst(Instr); @@ -2843,45 +2864,17 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, AC->registerAssumption(II); // End if-block. + bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } -Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { - if (TripCount) - return TripCount; - - assert(InsertBlock); - IRBuilder<> Builder(InsertBlock->getTerminator()); - // Find the loop boundaries. - Type *IdxTy = Legal->getWidestInductionType(); - assert(IdxTy && "No type for induction"); - const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); - - const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - - // Count holds the overall loop count (N). - TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - InsertBlock->getTerminator()); - - if (TripCount->getType()->isPointerTy()) - TripCount = - CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - InsertBlock->getTerminator()); - - return TripCount; -} - Value * InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(InsertBlock); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); @@ -2917,7 +2910,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. 
Note that the minimum // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2930,10 +2923,10 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<FixedVectorType>(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast<FixedVectorType>(V->getType()); - assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); + auto *DstFVTy = cast<VectorType>(DstVTy); + auto VF = DstFVTy->getElementCount(); + auto *SrcVecTy = cast<VectorType>(V->getType()); + assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && @@ -2953,13 +2946,13 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - auto *VecIntTy = FixedVectorType::get(IntTy, VF); + auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -2970,8 +2963,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); @@ -2989,10 +2982,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - if (!Cost->foldTailByMasking()) + TailFoldingStyle Style = Cost->getTailFoldingStyle(); + if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); - else if (VF.isScalable()) { + else if (VF.isScalable() && + !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && + Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. @@ -3017,7 +3013,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // Update dominator for Bypass & LoopExit (if needed). 
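getOrCreateVectorTripCount rounds the trip count down to a multiple of Step = VF * UF, and when a scalar epilogue is required it forces at least one scalar iteration. A short worked example with assumed numbers:

  // Step = VF * UF = 8 (assumed).
  //   N = 21: R = 21 % 8 = 5, vector trip count = 16, scalar loop runs 5.
  //   N = 24 with a required scalar epilogue (e.g. an interleave group with
  //   a gap): R would be 0, so it is bumped to Step, giving a vector trip
  //   count of 16 and leaving the last 8 iterations to the scalar epilogue.
  static_assert(21 - 21 % 8 == 16 && 24 - 8 == 16, "worked example");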
DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3044,7 +3040,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, SCEVCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3097,7 +3093,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -3117,17 +3113,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. - BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? - BranchInst::Create(LoopScalarPreHeader) : - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, - Builder.getTrue()); + BranchInst *BrInst = + Cost->requiresScalarEpilogue(VF.isVector()) + ? BranchInst::Create(LoopScalarPreHeader) + : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, + Builder.getTrue()); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Update dominator for loop exit. During skeleton creation, only the vector // pre-header and the middle block are created. The vector loop is entirely // created during VPlan exection. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3135,7 +3132,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } PHINode *InnerLoopVectorizer::createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &II, + PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass) { Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); @@ -3154,8 +3151,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); EndValue->setName("ind.end"); @@ -3163,8 +3158,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( // Compute the end value for the additional bypass (if applicable). 
if (AdditionalBypass.first) { B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValueFromAdditionalBypass = emitTransformedIndex( B, AdditionalBypass.second, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); @@ -3193,7 +3186,22 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( return BCResumeVal; } +/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV +/// expansion results. +static Value *getExpandedStep(const InductionDescriptor &ID, + const SCEV2ValueTy &ExpandedSCEVs) { + const SCEV *Step = ID.getStep(); + if (auto *C = dyn_cast<SCEVConstant>(Step)) + return C->getValue(); + if (auto *U = dyn_cast<SCEVUnknown>(Step)) + return U->getValue(); + auto I = ExpandedSCEVs.find(Step); + assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); + return I->second; +} + void InnerLoopVectorizer::createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && @@ -3209,14 +3217,15 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, + AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3229,7 +3238,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF.isVector()) && + !Cost->foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3250,14 +3260,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { } std::pair<BasicBlock *, Value *> -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- loop iteration number check. - / | + [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's + / | preheader are expanded here. Eventually all required SCEV + / | expansion should happen here. / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | @@ -3304,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. 
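createInductionResumeValue now takes the pre-expanded step and hands it to emitTransformedIndex to compute the value the scalar loop resumes from. A sketch of the shapes it produces when Index is the vector trip count (VTC):

  // integer induction:  ind.end = StartValue + VTC * Step
  // pointer induction:  ind.end = getelementptr i8, ptr StartValue,
  //                               VTC * Step   ; byte-based step, no longer
  //                                            ; required to be a constant
  // fp induction:       ind.end = StartValue fadd/fsub (VTC * Step), carrying
  //                               the recurrence's fast-math flags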
- createInductionResumeValues(); + createInductionResumeValues(ExpandedSCEVs); return {completeLoopSkeleton(), nullptr}; } @@ -3317,7 +3329,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, - BasicBlock *VectorHeader, VPlan &Plan) { + BasicBlock *VectorHeader, VPlan &Plan, + VPTransformState &State) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3345,7 +3358,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { assert(isa<PHINode>(UI) && "Expected LCSSA form"); - IRBuilder<> B(MiddleBlock->getTerminator()); // Fast-math-flags propagate from the original induction instruction. @@ -3355,8 +3367,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *CountMinusOne = B.CreateSub( VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); CountMinusOne->setName("cmo"); - Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), - VectorHeader->getTerminator()); + + VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); + assert(StepVPV && "step must have been expanded during VPlan execution"); + Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() + : State.get(StepVPV, {0, 0}); Value *Escape = emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); Escape->setName("ind.escape"); @@ -3430,12 +3445,12 @@ static void cse(BasicBlock *BB) { } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const { +InstructionCost LoopVectorizationCostModel::getVectorCallCost( + CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; + bool MaskRequired = Legal->isMaskRequired(CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -3464,18 +3479,39 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. - NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + InstructionCost MaskCost = 0; + VFShape Shape = VFShape::get(*CI, VF, MaskRequired); + if (NeedsMask) + *NeedsMask = MaskRequired; Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we want an unmasked vector function but can't find one matching the VF, + // maybe we can find vector function that does use a mask and synthesize + // an all-true mask. + if (!VecFunc && !MaskRequired) { + Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); + VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we found one, add in the cost of creating a mask + if (VecFunc) { + if (NeedsMask) + *NeedsMask = true; + MaskCost = TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get( + IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), + VF)); + } + } + // We don't support masked function calls yet, but we can scalarize a + // masked call with branches (unless VF is scalable). if (!TLI || CI->isNoBuiltin() || !VecFunc) - return Cost; + return VF.isScalable() ? 
InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; if (VectorCallCost < Cost) { - NeedToScalarize = false; + *Variant = VecFunc; Cost = VectorCallCost; } return Cost; @@ -3675,14 +3711,25 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + SmallVector<BasicBlock *> ExitBlocks; + OrigLoop->getExitBlocks(ExitBlocks); + for (BasicBlock *Exit : ExitBlocks) + for (PHINode &PN : Exit->phis()) + PSE.getSE()->forgetValue(&PN); + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. - Plan.clearLiveOuts(); } else { + // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking + // the cost model. + // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. @@ -3692,7 +3739,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + VectorLoop->getHeader(), Plan, State); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3799,31 +3846,53 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Value *Incoming = State.get(PreviousDef, UF - 1); auto *ExtractForScalar = Incoming; auto *IdxTy = Builder.getInt32Ty(); + Value *RuntimeVF = nullptr; if (VF.isVector()) { auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); + RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, - "vector.recur.extract"); - } - // Extract the second last element in the middle block if the - // Phi is used outside the loop. We need to extract the phi itself - // and not the last element (the phi update in the current iteration). This - // will be the value when jumping to the exit block from the LoopMiddleBlock, - // when the scalar loop is not run at all. - Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) { - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); - auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); - ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Idx, "vector.recur.extract.for.phi"); - } else if (UF > 1) - // When loop is unrolled without vectorizing, initialize - // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value - // of `Incoming`. This is analogous to the vectorized case above: extracting - // the second last element when VF > 1. 
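The getVectorCallCost changes above allow an unmasked call site to use a masked vector variant by synthesizing an all-true mask, charged as an SK_Broadcast shuffle. A sketch of the call shape being costed, with an assumed SVE-style mangled variant name:

  // Assumed masked variant _ZGVsMxv_foo taking a <vscale x 4 x i1> mask:
  //   %alltrue = splat of i1 true                     ; costed as SK_Broadcast
  //   %r = call <vscale x 4 x float>
  //            @_ZGVsMxv_foo(<vscale x 4 x float> %x, <vscale x 4 x i1> %alltrue)
  // If no vector variant exists at all, the call is scalarized with branches,
  // which is impossible for scalable VFs - hence the Invalid cost there.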
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + ExtractForScalar = + Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); + } + + auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); + assert(PhiR->getNumUsers() == 1 && + RecurSplice->getOpcode() == + VPInstruction::FirstOrderRecurrenceSplice && + "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); + SmallVector<VPLiveOut *> LiveOuts; + for (VPUser *U : RecurSplice->users()) + if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) + LiveOuts.push_back(LiveOut); + + if (!LiveOuts.empty()) { + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the + // LoopMiddleBlock, when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF.isVector()) { + auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Idx, "vector.recur.extract.for.phi"); + } else { + assert(UF > 1 && "VF and UF cannot both be 1"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled + // value of `Incoming`. This is analogous to the vectorized case above: + // extracting the second last element when VF > 1. + ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + } + + for (VPLiveOut *LiveOut : LiveOuts) { + assert(!Cost->requiresScalarEpilogue(VF.isVector())); + PHINode *LCSSAPhi = LiveOut->getPhi(); + LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(LCSSAPhi); + } + } // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -3837,22 +3906,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); Phi->setName("scalar.recur"); - - // Finally, fix users of the recurrence outside the loop. The users will need - // either the last value of the scalar recurrence or the last value of the - // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find all the phi nodes for the original scalar - // recurrence in the exit block, and then add an edge for the middle block. - // Note that LCSSA does not imply single entry when the original scalar loop - // had multiple exiting edges (as we always run the last iteration in the - // scalar epilogue); in that case, there is no edge from middle to exit and - // and thus no phis which needed updated. - if (!Cost->requiresScalarEpilogue(VF)) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { - LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -3872,9 +3925,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); - // Wrap flags are in general invalid after vectorization, clear them. 
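fixFixedOrderRecurrence extracts two different lanes from the last unrolled part of the recurrence vector in the middle block. A lane-numbering example for an assumed fixed VF of 4:

  // Incoming (last part) = < r1, r2, r3, r4 >
  //   lane VF-1 = r4 -> "vector.recur.extract": resume value for the scalar
  //                     loop's recurrence phi
  //   lane VF-2 = r3 -> "vector.recur.extract.for.phi": value fed to LCSSA
  //                     phis outside the loop (the phi, not its latch update)
  // With VF = 1 and UF > 1, parts UF-1 and UF-2 play the same two roles.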
- clearReductionWrapFlags(PhiR, State); - // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement @@ -4036,7 +4086,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); @@ -4054,38 +4104,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) - return; - - SmallVector<VPValue *, 8> Worklist; - SmallPtrSet<VPValue *, 8> Visited; - Worklist.push_back(PhiR); - Visited.insert(PhiR); - - while (!Worklist.empty()) { - VPValue *Cur = Worklist.pop_back_val(); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = State.get(Cur, Part); - if (!isa<OverflowingBinaryOperator>(V)) - break; - cast<Instruction>(V)->dropPoisonGeneratingFlags(); - } - - for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast<VPRecipeBase>(U); - if (!UserRecipe) - continue; - for (VPValue *V : UserRecipe->definedValues()) - if (Visited.insert(V).second) - Worklist.push_back(V); - } - } -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -4125,10 +4143,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); // We can't sink an instruction if it is a phi node, is not in the loop, - // or may have side effects. + // may have side effects or may read from memory. + // TODO Could dor more granular checking to allow sinking a load past non-store instructions. if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || - I->mayHaveSideEffects()) - continue; + I->mayHaveSideEffects() || I->mayReadFromMemory()) + continue; // If the instruction is already in PredBB, check if we can sink its // operands. In that case, VPlan's sinkScalarOperands() succeeded in @@ -4189,7 +4208,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
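sinkScalarOperands now also refuses to sink instructions that read from memory, not just those with side effects. A minimal illustration of the hazard this avoids (IR shown as comments):

  //   %v = load i32, ptr %p         ; candidate operand to sink
  //   store i32 0, ptr %q           ; may alias %p
  //   br i1 %c, label %pred, label %cont
  // pred:
  //   ... use of %v ...
  // Sinking %v into %pred would move the load past the possibly-aliasing
  // store and could change the loaded value; the TODO in the hunk notes that
  // a finer-grained check could still allow sinking past non-store code.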
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && !Scalars.contains(VF) && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning @@ -4382,6 +4401,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication( switch(I->getOpcode()) { default: return true; + case Instruction::Call: + return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF); case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); @@ -4430,10 +4451,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // both speculation safety (which follows from the same argument as loads), // but also must prove the value being stored is correct. The easiest // form of the later is to require that all values stored are the same. - if (Legal->isUniformMemOp(*I) && - (isa<LoadInst>(I) || - (isa<StoreInst>(I) && - TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && + if (Legal->isInvariant(getLoadStorePointerOperand(I)) && + (isa<LoadInst>(I) || + (isa<StoreInst>(I) && + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && !Legal->blockNeedsPredication(I->getParent())) return false; return true; @@ -4445,6 +4466,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); + case Instruction::Call: + return Legal->isMaskRequired(I); } } @@ -4502,7 +4525,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -4614,7 +4638,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4663,10 +4687,18 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); + auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can // thus chose to execute only one. auto isUniformMemOpUse = [&](Instruction *I) { - if (!Legal->isUniformMemOp(*I)) + // If the value was already known to not be uniform for the previous + // (smaller VF), it cannot be uniform for the larger VF. 
+ if (PrevVF.isVector()) { + auto Iter = Uniforms.find(PrevVF); + if (Iter != Uniforms.end() && !Iter->second.contains(I)) + return false; + } + if (!Legal->isUniformMemOp(*I, VF)) return false; if (isa<LoadInst>(I)) // Loading the same address always produces the same result - at least @@ -4689,11 +4721,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { WideningDecision == CM_Interleave); }; - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. + // I, I is known to not require scalarization, and the pointer is not also + // stored. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) + return false; + return getLoadStorePointerOperand(I) == Ptr && + (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. @@ -4739,10 +4774,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); - if (isUniformDecision(&I, VF)) { - assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + if (isVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); - } } // Add to the worklist any operands which have *only* uniform (e.g. lane 0 @@ -4906,12 +4939,11 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; // Limit MaxScalableVF by the maximum safe dependence distance. - std::optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) - MaxVScale = - TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); - MaxScalableVF = - ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0); + if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) + MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); + else + MaxScalableVF = ElementCount::getScalable(0); + if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -4932,7 +4964,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( // the memory accesses that is most restrictive (involved in the smallest // dependence distance). unsigned MaxSafeElements = - PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); @@ -5105,16 +5137,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + // Avoid tail folding if the trip count is known to be a multiple of any VF - // we chose. - // FIXME: The condition below pessimises the case for fixed-width vectors, - // when scalable VFs are also candidates for vectorization. - if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { - ElementCount MaxFixedVF = MaxFactors.FixedVF; - assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + // we choose. 
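computeFeasibleMaxVF derives the dependence-distance bound in elements before splitting it into the fixed and scalable limits. A worked example with assumed numbers:

  // Assume the widest loop type is i32 and legality reports a maximum safe
  // vector width of 256 bits:
  //   MaxSafeElements   = bit_floor(256 / 32) = 8
  //   MaxSafeFixedVF    = 8 x i32
  //   MaxSafeScalableVF = vscale x (8 / max_vscale), e.g. vscale x 2 when the
  //                       target caps vscale at 4 (from TTI or vscale_range)
  static_assert(256 / 32 == 8, "widest-type arithmetic");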
+ std::optional<unsigned> MaxPowerOf2RuntimeVF = + MaxFactors.FixedVF.getFixedValue(); + if (MaxFactors.ScalableVF) { + std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); + if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { + MaxPowerOf2RuntimeVF = std::max<unsigned>( + *MaxPowerOf2RuntimeVF, + *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); + } else + MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. + } + + if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { + assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC - : MaxFixedVF.getFixedValue(); + unsigned MaxVFtimesIC = + UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); const SCEV *ExitCount = SE->getAddExpr( @@ -5134,7 +5176,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + CanFoldTailByMasking = true; return MaxFactors; } @@ -5187,7 +5229,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorElementCount = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " @@ -5207,6 +5249,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( auto Min = Attr.getVScaleRangeMin(); WidestRegisterMinEC *= Min; } + + // When a scalar epilogue is required, at least one iteration of the scalar + // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // max VF that results in a dead vector loop. + if (ConstTripCount > 0 && requiresScalarEpilogue(true)) + ConstTripCount -= 1; + if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { // If loop trip count (TC) is known at compile time there is no point in @@ -5214,7 +5263,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // power of two which doesn't exceed TC. // If MaxVectorElementCount is scalable, we only fall back on a fixed VF // when the TC is less than or equal to the known number of lanes. 
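Before the diff continues with the PowerOf2Floor -> llvm::bit_floor replacements, a short standalone arithmetic sketch of the clamping they implement, together with the new adjustment above that reserves one iteration for a required scalar epilogue so the chosen max VF cannot leave the vector loop dead. The values are invented and std::bit_floor (C++20) stands in for llvm::bit_floor.

#include <bit>    // std::bit_floor, counterpart of llvm::bit_floor
#include <cstdio>

int main() {
  unsigned ConstTripCount = 16;       // assumed known trip count
  bool RequiresScalarEpilogue = true; // assumed for this example
  // Reserve one iteration for the scalar epilogue; without this, a max VF of
  // bit_floor(16) == 16 would leave nothing for the epilogue to execute.
  if (ConstTripCount > 0 && RequiresScalarEpilogue)
    ConstTripCount -= 1;              // 15
  // Clamp to the largest power of two not exceeding the adjusted trip count.
  unsigned ClampedMaxVF = std::bit_floor(ConstTripCount); // 8
  std::printf("adjusted trip count %u -> clamped max VF %u\n", ConstTripCount,
              ClampedMaxVF);
  return 0;
}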
- auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " << ClampedConstTripCount << "\n"); @@ -5228,7 +5277,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); @@ -5273,9 +5322,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { - if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); +/// Convenience function that returns the value of vscale_range iff +/// vscale_range.min == vscale_range.max or otherwise returns the value +/// returned by the corresponding TTI method. +static std::optional<unsigned> +getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { + const Function *Fn = L->getHeader()->getParent(); + if (Fn->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) @@ -5285,31 +5339,39 @@ std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && - MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // VecCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be + // some extra overheads, but for the purpose of comparing the costs of + // different VFs we can use this to compare the total loop-body cost + // expected after vectorization. 
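The comment ending the hunk above describes how the rewritten isMoreProfitable compares total loop-body costs once a maximum trip count is known: ceil(TC/VF) vector iterations under tail folding, otherwise floor(TC/VF) vector iterations plus TC%VF scalar ones. Here is a hedged standalone sketch of that formula with invented per-iteration costs; costForTC only mirrors the shape of the GetCostForTC lambda the diff introduces next.

#include <cstdio>

// Total loop-body cost for a known trip count TC at a given VF.
static unsigned costForTC(unsigned TC, unsigned VF, unsigned VectorCost,
                          unsigned ScalarCost, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return VectorCost * ((TC + VF - 1) / VF); // ceil(TC / VF) vector iterations
  return VectorCost * (TC / VF)               // whole vector iterations
         + ScalarCost * (TC % VF);            // scalar epilogue iterations
}

int main() {
  unsigned TC = 10;
  // Invented costs: a VF=4 body costs 6, a VF=2 body costs 4, a scalar
  // iteration costs 3.
  std::printf("no tail folding: VF=4 -> %u, VF=2 -> %u\n",
              costForTC(TC, 4, 6, 3, false),  // 6*2 + 3*2 = 18
              costForTC(TC, 2, 4, 3, false)); // 4*5 + 3*0 = 20
  std::printf("tail folding:    VF=4 -> %u, VF=2 -> %u\n",
              costForTC(TC, 4, 6, 3, true),   // 6*3 = 18
              costForTC(TC, 2, 4, 3, true));  // 4*5 = 20
  return 0;
}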
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); + }; + auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional<unsigned> VScale = getVScaleForTuning()) { + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) @@ -5328,9 +5390,74 @@ bool LoopVectorizationCostModel::isMoreProfitable( return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( +static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, + OptimizationRemarkEmitter *ORE, + Loop *TheLoop) { + if (InvalidCosts.empty()) + return; + + // Emit a report of VFs with invalid costs in the loop. + + // Group the remarks per instruction, keeping the instruction order from + // InvalidCosts. + std::map<Instruction *, unsigned> Numbering; + unsigned I = 0; + for (auto &Pair : InvalidCosts) + if (!Numbering.count(Pair.first)) + Numbering[Pair.first] = I++; + + // Sort the list, first on instruction(number) then on VF. + sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { + if (Numbering[A.first] != Numbering[B.first]) + return Numbering[A.first] < Numbering[B.first]; + ElementCountComparator ECC; + return ECC(A.second, B.second); + }); + + // For a list of ordered instruction-vf pairs: + // [(load, vf1), (load, vf2), (store, vf1)] + // Group the instructions together to emit separate remarks for: + // load (vf1, vf2) + // store (vf1) + auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); + auto Subset = ArrayRef<InstructionVFPair>(); + do { + if (Subset.empty()) + Subset = Tail.take_front(1); + + Instruction *I = Subset.front().first; + + // If the next instruction is different, or if there are no other pairs, + // emit a remark for the collated subset. e.g. + // [(load, vf1), (load, vf2))] + // to emit: + // remark: invalid costs for 'load' at VF=(vf, vf2) + if (Subset == Tail || Tail[Subset.size()].first != I) { + std::string OutString; + raw_string_ostream OS(OutString); + assert(!Subset.empty() && "Unexpected empty range"); + OS << "Instruction with invalid costs prevented vectorization at VF=("; + for (const auto &Pair : Subset) + OS << (Pair.second == Subset.front().second ? 
"" : ", ") << Pair.second; + OS << "):"; + if (auto *CI = dyn_cast<CallInst>(I)) + OS << " call to " << CI->getCalledFunction()->getName(); + else + OS << " " << I->getOpcodeName(); + OS.flush(); + reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); + Tail = Tail.drop_front(Subset.size()); + Subset = {}; + } else + // Grow the subset by one element + Subset = Tail.take_front(Subset.size() + 1); + } while (!Tail.empty()); +} + +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + CM.expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); assert(VFCandidates.count(ElementCount::getFixed(1)) && @@ -5340,7 +5467,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost @@ -5354,12 +5481,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( if (i.isScalar()) continue; - VectorizationCostTy C = expectedCost(i, &InvalidCosts); + LoopVectorizationCostModel::VectorizationCostTy C = + CM.expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() @@ -5388,70 +5516,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ChosenFactor = Candidate; } - // Emit a report of VFs with invalid costs in the loop. - if (!InvalidCosts.empty()) { - // Group the remarks per instruction, keeping the instruction order from - // InvalidCosts. - std::map<Instruction *, unsigned> Numbering; - unsigned I = 0; - for (auto &Pair : InvalidCosts) - if (!Numbering.count(Pair.first)) - Numbering[Pair.first] = I++; - - // Sort the list, first on instruction(number) then on VF. - llvm::sort(InvalidCosts, - [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { - if (Numbering[A.first] != Numbering[B.first]) - return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); - }); - - // For a list of ordered instruction-vf pairs: - // [(load, vf1), (load, vf2), (store, vf1)] - // Group the instructions together to emit separate remarks for: - // load (vf1, vf2) - // store (vf1) - auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); - auto Subset = ArrayRef<InstructionVFPair>(); - do { - if (Subset.empty()) - Subset = Tail.take_front(1); - - Instruction *I = Subset.front().first; - - // If the next instruction is different, or if there are no other pairs, - // emit a remark for the collated subset. e.g. 
- // [(load, vf1), (load, vf2))] - // to emit: - // remark: invalid costs for 'load' at VF=(vf, vf2) - if (Subset == Tail || Tail[Subset.size()].first != I) { - std::string OutString; - raw_string_ostream OS(OutString); - assert(!Subset.empty() && "Unexpected empty range"); - OS << "Instruction with invalid costs prevented vectorization at VF=("; - for (const auto &Pair : Subset) - OS << (Pair.second == Subset.front().second ? "" : ", ") - << Pair.second; - OS << "):"; - if (auto *CI = dyn_cast<CallInst>(I)) - OS << " call to " << CI->getCalledFunction()->getName(); - else - OS << " " << I->getOpcodeName(); - OS.flush(); - reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); - Tail = Tail.drop_front(Subset.size()); - Subset = {}; - } else - // Grow the subset by one element - Subset = Tail.take_front(Subset.size() + 1); - } while (!Tail.empty()); - } + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", + if (!EnableCondStoresVectorization && CM.hasPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); + "ConditionalStore", ORE, OrigLoop); ChosenFactor = ScalarCost; } @@ -5463,11 +5534,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( - const Loop &L, ElementCount VF) const { +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( + ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(L.getHeader()->phis(), + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; @@ -5475,20 +5546,21 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( // currently unsupported. for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. - Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + Value *PostInc = + Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (L.getExitingBlock() != L.getLoopLatch()) + if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -5507,62 +5579,59 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // FIXME: We should consider changing the threshold for scalable - // vectors to take VScaleForTuning into account. 
- if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) + + unsigned Multiplier = 1; + if (VF.isScalable()) + Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); + if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, unsigned IC) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (!CM.isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " + "epilogue is allowed.\n"); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); + if (!isCandidateForEpilogueVectorization(MainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " + "is not a supported candidate.\n"); return Result; } if (EpilogueVectorizationForceVF > 1) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization forced factor is not viable.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " + "viable.\n"); return Result; } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5574,21 +5643,48 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) EstimatedRuntimeVF *= *VScale; } - for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + ScalarEvolution &SE = *PSE.getSE(); + 
Type *TCType = Legal->getWidestInductionType(); + const SCEV *RemainingIterations = nullptr; + for (auto &NextVF : ProfitableVFs) { + // Skip candidate VFs without a corresponding VPlan. + if (!hasPlanWithVF(NextVF.Width)) + continue; + + // Skip candidate VFs with widths >= the estimate runtime VF (scalable + // vectors) or the VF of the main loop (fixed vectors). + if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + continue; + + // If NextVF is greater than the number of remaining iterations, the + // epilogue loop would be dead. Skip such factors. + if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { + // TODO: extend to support scalable VFs. + if (!RemainingIterations) { + const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); + RemainingIterations = SE.getURemExpr( + TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + } + if (SE.isKnownPredicate( + CmpInst::ICMP_UGT, + SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), + RemainingIterations)) + continue; + } + + if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) Result = NextVF; + } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Width << "\n"); return Result; } @@ -5688,7 +5784,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return 1; // We used the distance for the interleave count. - if (Legal->getMaxSafeDepDistBytes() != -1U) + if (!Legal->isSafeForAnyVectorWidth()) return 1; auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); @@ -5750,20 +5846,19 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; - unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / + MaxLocalUsers); // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) { - TmpIC = - PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / - std::max(1U, (MaxLocalUsers - 1))); + TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); } IC = std::min(IC, TmpIC); } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the max. if (VF.isScalar()) { @@ -5834,8 +5929,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallIC = std::min( - IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); + unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( + SmallLoopCost / *LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. 
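Before the diff moves on to calculateRegisterUsage, a standalone sketch of the interleave-count arithmetic from selectInterleaveCount above. All register counts and costs are invented and std::bit_floor stands in for llvm::bit_floor; the point is only the shape of the computation: registers left after loop invariants divided by peak per-iteration usage, rounded down to a power of two, then capped so the overhead of a small loop body stays around 5% of its cost.

#include <algorithm>
#include <bit>
#include <cstdio>

int main() {
  unsigned TargetNumRegisters = 32, LoopInvariantRegs = 3, MaxLocalUsers = 5;
  // Registers available per concurrent iteration, rounded down to a power of 2.
  unsigned IC =
      std::bit_floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); // 4
  // For cheap loop bodies, interleave only until the loop overhead is small
  // relative to the body cost.
  unsigned SmallLoopCost = 20, LoopCost = 7;
  unsigned SmallIC = std::min(IC, std::bit_floor(SmallLoopCost / LoopCost)); // min(4, 2)
  std::printf("IC=%u, SmallIC=%u\n", IC, SmallIC);
  return 0;
}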
@@ -5953,7 +6048,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). - SmallPtrSet<Value *, 8> LoopInvariants; + SmallSetVector<Instruction *, 8> LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -6079,11 +6174,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + bool IsScalar = all_of(Inst->users(), [&](User *U) { + auto *I = cast<Instruction>(U); + return TheLoop != LI->getLoopFor(I->getParent()) || + isScalarAfterVectorization(I, VFs[i]); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - Invariant[ClassID] += Usage; + TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); + Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ @@ -6134,8 +6234,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -6224,7 +6323,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Instruction *I = Worklist.pop_back_val(); // If we've already analyzed the instruction, there's nothing to do. - if (ScalarCosts.find(I) != ScalarCosts.end()) + if (ScalarCosts.contains(I)) continue; // Compute the cost of the vector instruction. 
Note that this cost already @@ -6362,11 +6461,6 @@ static const SCEV *getAddressAccessSCEV( return PSE.getSCEV(Ptr); } -static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - return Legal->hasStride(I->getOperand(0)) || - Legal->hasStride(I->getOperand(1)); -} - InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { @@ -6460,7 +6554,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { - assert(Legal->isUniformMemOp(*I)); + assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); @@ -6475,7 +6569,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, } StoreInst *SI = cast<StoreInst>(I); - bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); + bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + @@ -6502,11 +6596,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - // TODO: Once we have support for interleaving with scalable vectors - // we can calculate the cost properly here. - if (VF.isScalable()) - return InstructionCost::getInvalid(); - Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6836,7 +6925,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; - if (Legal->isUniformMemOp(I)) { + if (Legal->isUniformMemOp(I, VF)) { auto isLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". @@ -7134,8 +7223,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - // Since we will replace the stride by 1 the multiplication should go away. - if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) + // If we're speculating on the stride being 1, the multiplication may + // fold away. We can generalize this for all operations using the notion + // of neutral elements. (TODO) + if (I->getOpcode() == Instruction::Mul && + (PSE.getSCEV(I->getOperand(0))->isOne() || + PSE.getSCEV(I->getOperand(1))->isOne())) return 0; // Detect reduction patterns @@ -7146,7 +7239,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // second vector operand. One example of this are shifts on x86. 
Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -7304,7 +7398,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { - SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + // Leave SrcVecTy unchanged - we only shrink the destination element + // type. VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } @@ -7316,9 +7411,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; - bool NeedToScalarize; + Function *Variant; CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7339,37 +7434,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } // end of switch. } -char LoopVectorize::ID = 0; - -static const char lv_name[] = "Loop Vectorization"; - -INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) -INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) -INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) - -namespace llvm { - -Pass *createLoopVectorizePass() { return new LoopVectorize(); } - -Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, - bool VectorizeOnlyWhenForced) { - return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); -} - -} // end namespace llvm - void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); @@ -7462,7 +7526,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // reasonable one. 
if (UserVF.isZero()) { VF = ElementCount::getFixed(determineVPlanVF( - TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); @@ -7497,13 +7561,16 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { std::optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && - !useMaskedInterleavedAccesses(*TTI)) { + !useMaskedInterleavedAccesses(TTI)) { LLVM_DEBUG( dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " @@ -7527,6 +7594,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); + if (!hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF + << ".\n"); + return std::nullopt; + } + LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0, 0}}; } else @@ -7562,8 +7635,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + if (!hasPlanWithVF(VF.Width)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width + << ".\n"); + return std::nullopt; + } return VF; } @@ -7614,43 +7692,51 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, - VPlan &BestVPlan, - InnerLoopVectorizer &ILV, - DominatorTree *DT, - bool IsEpilogueVectorization) { +SCEV2ValueTy LoopVectorizationPlanner::executePlan( + ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, + InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, + DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + assert( + (IsEpilogueVectorization || !ExpandedSCEVs) && + "expanded SCEVs to reuse can only be used during epilogue vectorization"); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. 
- ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. + VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + + // 0. Generate SCEV-dependent code into the preheader, including TripCount, + // before making any changes to the CFG. + if (!BestVPlan.getPreheader()->empty()) { + State.CFG.PrevBB = OrigLoop->getLoopPreheader(); + State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + BestVPlan.getPreheader()->execute(&State); + } + if (!ILV.getTripCount()) + ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); + else + assert(IsEpilogueVectorization && "should only re-use the existing trip " + "count during epilogue vectorization"); // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs + : State.ExpandedSCEVs); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + std::unique_ptr<LoopVersioning> LVer = nullptr; if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && !LAI->getRuntimePointerChecking()->getDiffChecks()) { @@ -7658,9 +7744,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // still use it to add the noalias metadata. // TODO: Find a better way to re-use LoopVersioning functionality to add // metadata. - State.LVer = std::make_unique<LoopVersioning>( + LVer = std::make_unique<LoopVersioning>( *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); + State.LVer = &*LVer; State.LVer->prepareNoAliasMetadata(); } @@ -7677,10 +7764,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, - IsEpilogueVectorization); + BestVPlan.prepareToExecute( + ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); @@ -7706,13 +7792,18 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } - AddRuntimeUnrollDisableMetaData(L); + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. 
ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); + + return State.ExpandedSCEVs; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -7725,8 +7816,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// @@ -7734,7 +7823,8 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair<BasicBlock *, Value *> -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7795,7 +7885,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -7803,8 +7893,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. - auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() + : VF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), @@ -7824,7 +7916,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Update dominator for Bypass & LoopExit. DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // For loops with multiple exits, there's no edge from the middle block // to exit blocks (as the epilogue must run) and thus no need to update // the immediate dominator of the exit blocks. @@ -7852,7 +7944,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
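Both minimum-iteration checks in this area (emitIterationCountCheck in the hunk above and emitMinimumVectorEpilogueIterCountCheck a little further below) now derive their predicate from requiresScalarEpilogue(VF.isVector()): ULE when a scalar epilogue must run, ULT otherwise. A standalone sketch of that choice, with illustrative numbers only:

#include <cstdio>

// Returns true when the vector (or epilogue) loop must be bypassed.
static bool skipVectorLoop(unsigned Count, unsigned VF, unsigned UF,
                           bool RequiresScalarEpilogue) {
  unsigned Step = VF * UF;
  // ULE: keep at least one iteration back for the scalar epilogue.
  // ULT: the vector loop may consume the entire trip count.
  return RequiresScalarEpilogue ? Count <= Step : Count < Step;
}

int main() {
  std::printf("TC=8, VF=4, UF=2, scalar epilogue required: skip=%d\n",
              skipVectorLoop(8, 4, 2, true));  // 1: nothing would be left for it
  std::printf("TC=8, VF=4, UF=2, no epilogue required:     skip=%d\n",
              skipVectorLoop(8, 4, 2, false)); // 0: exactly one vector trip fits
  return 0;
}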
std::pair<BasicBlock *, Value *> -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7891,7 +7984,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -7950,7 +8043,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(ExpandedSCEVs, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); return {completeLoopSkeleton(), EPResumeVal}; @@ -7972,8 +8066,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // Generate code to check if the loop's trip count is less than VF * UF of the // vector epilogue loop. - auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp(P, Count, @@ -8008,8 +8103,7 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (ElementCount TmpVF = Range.Start * 2; - ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -8025,16 +8119,16 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, ElementCount MaxVF) { - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, - VPlanPtr &Plan) { + VPlan &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. @@ -8058,7 +8152,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); + VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) @@ -8069,7 +8163,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 
// The select version does not introduce new UB if SrcMask is false and // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan->getOrAddVPValue( + VPValue *False = Plan.getVPValueOrAddLiveIn( ConstantInt::getFalse(BI->getCondition()->getType())); EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); @@ -8078,7 +8172,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, return EdgeMaskCache[Edge] = EdgeMask; } -VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. @@ -8098,29 +8192,28 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); - if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) - return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) + return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - VPBasicBlock *HeaderVPBB = - Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (EmitGetActiveLaneMask != PredicationStyle::None) { - VPValue *TC = Plan->getOrCreateTripCount(); + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8168,7 +8261,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); + Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -8189,22 +8282,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also /// insert a recipe to expand the step for the induction recipe. 
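Before the diff resumes with createWidenInductionRecipes, one more aside on the header mask built by createBlockInMask above: the early-exit compare is IV <= BTC (backedge-taken count) rather than IV < TC because the trip count TC = BTC + 1 may wrap to zero. A standalone sketch of that corner case with an 8-bit induction variable, values chosen purely for illustration:

#include <cstdint>
#include <cstdio>

int main() {
  // With 8-bit arithmetic a trip count of 256 wraps to 0, but BTC = 255 does not.
  uint8_t BTC = 255;
  uint8_t TC = static_cast<uint8_t>(BTC + 1); // wraps to 0
  for (unsigned Lane = 252; Lane <= 255; ++Lane)
    std::printf("iv=%u: 'iv <= BTC' -> %d, 'iv < TC' -> %d\n", Lane,
                (int)(Lane <= BTC), (int)(Lane < TC));
  // The BTC form keeps every lane active; the wrapped TC form would wrongly
  // disable all of them.
  return 0;
}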
-static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( - PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, - const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { - // Returns true if an instruction \p I should be scalarized instead of - // vectorized for the chosen vectorization factor. - auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); - }; - - bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - return ShouldScalarizeInstruction(PhiOrTrunc, VF); - }, - Range); +static VPWidenIntOrFpInductionRecipe * +createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, + VPValue *Start, const InductionDescriptor &IndDesc, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + VFRange &Range) { assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && @@ -8213,12 +8295,10 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8227,14 +8307,13 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, + return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, *PSE.getSE(), *OrigLoop, Range); // Check if this is pointer induction. If so, build the recipe for it. 
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), *PSE.getSE()); - assert(isa<SCEVConstant>(II->getStep())); return new VPWidenPointerInductionRecipe( Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8267,9 +8346,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, - *PSE.getSE(), *OrigLoop, Range); + VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); + return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), + *OrigLoop, Range); } return nullptr; } @@ -8309,7 +8388,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = - createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); + createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); OperandsWithMask.push_back(Operands[In]); @@ -8321,8 +8400,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range) const { - + VFRange &Range, + VPlanPtr &Plan) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -8339,17 +8418,17 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); + SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { - bool NeedToScalarize = false; + Function *Variant; // Is it beneficial to perform intrinsic call compared to lib // call? InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, NeedToScalarize); + CM.getVectorCallCost(CI, VF, &Variant); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8358,6 +8437,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); + Function *Variant = nullptr; + ElementCount VariantVF; + bool NeedsMask = false; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8365,14 +8447,57 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // The following case may be scalarized depending on the VF. // The flag shows whether we can use a usual Call for vectorized // version of the instruction. - bool NeedToScalarize = false; - CM.getVectorCallCost(CI, VF, NeedToScalarize); - return !NeedToScalarize; + + // If we've found a variant at a previous VF, then stop looking. 
A + // vectorized variant of a function expects input in a certain shape + // -- basically the number of input registers, the number of lanes + // per register, and whether there's a mask required. + // We store a pointer to the variant in the VPWidenCallRecipe, so + // once we have an appropriate variant it's only valid for that VF. + // This will force a different vplan to be generated for each VF that + // finds a valid variant. + if (Variant) + return false; + CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); + // If we found a valid vector variant at this VF, then store the VF + // in case we need to generate a mask. + if (Variant) + VariantVF = VF; + return Variant != nullptr; }, Range); - if (ShouldUseVectorCall) + if (ShouldUseVectorCall) { + if (NeedsMask) { + // We have 2 cases that would require a mask: + // 1) The block needs to be predicated, either due to a conditional + // in the scalar loop or use of an active lane mask with + // tail-folding, and we use the appropriate mask for the block. + // 2) No mask is required for the block, but the only available + // vector variant at this VF requires a mask, so we synthesize an + // all-true mask. + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(CI)) + Mask = createBlockInMask(CI->getParent(), *Plan); + else + Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( + IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); + + VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); + unsigned MaskPos = 0; + + for (const VFInfo &Info : VFDatabase::getMappings(*CI)) + if (Info.Shape == Shape) { + assert(Info.isMasked() && "Vector function info shape mismatch"); + MaskPos = Info.getParamIndexForOptionalMask().value(); + break; + } + + Ops.insert(Ops.begin() + MaskPos, Mask); + } + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), - Intrinsic::not_intrinsic); + Intrinsic::not_intrinsic, Variant); + } return nullptr; } @@ -8405,9 +8530,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, // div/rem operation itself. Otherwise fall through to general handling below. 
if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); - VPValue *Mask = createBlockInMask(I->getParent(), Plan); - VPValue *One = - Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); + VPValue *Mask = createBlockInMask(I->getParent(), *Plan); + VPValue *One = Plan->getVPValueOrAddLiveIn( + ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, I->getDebugLoc()); @@ -8415,38 +8540,26 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, Ops[1] = SafeRHS; return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } - LLVM_FALLTHROUGH; + [[fallthrough]]; } case Instruction::Add: case Instruction::And: case Instruction::AShr: - case Instruction::BitCast: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: case Instruction::FMul: case Instruction::FNeg: - case Instruction::FPExt: - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: case Instruction::ICmp: - case Instruction::IntToPtr: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: - case Instruction::PtrToInt: case Instruction::Select: - case Instruction::SExt: case Instruction::Shl: - case Instruction::SIToFP: case Instruction::Sub: - case Instruction::Trunc: - case Instruction::UIToFP: case Instruction::Xor: - case Instruction::ZExt: case Instruction::Freeze: return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); }; @@ -8462,9 +8575,9 @@ void VPRecipeBuilder::fixHeaderPhis() { } } -VPBasicBlock *VPRecipeBuilder::handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - VPlanPtr &Plan) { +VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, + VFRange &Range, + VPlan &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -8501,83 +8614,22 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( break; } } - - auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), - IsUniform, IsPredicated); - - // Find if I uses a predicated instruction. If so, it will use its scalar - // value. Avoid hoisting the insert-element which packs the scalar value into - // a vector value, as that happens iff all users use the vector value. - for (VPValue *Op : Recipe->operands()) { - auto *PredR = - dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe()); - if (!PredR) - continue; - auto *RepR = cast<VPReplicateRecipe>( - PredR->getOperand(0)->getDefiningRecipe()); - assert(RepR->isPredicated() && - "expected Replicate recipe to be predicated"); - RepR->setAlsoPack(false); - } - - // Finalize the recipe for Instr, first if it is not predicated. + VPValue *BlockInMask = nullptr; if (!IsPredicated) { + // Finalize the recipe for Instr, first if it is not predicated. LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); - setRecipe(I, Recipe); - Plan->addVPValue(I, Recipe); - VPBB->appendRecipe(Recipe); - return VPBB; - } - LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - - VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); - assert(SingleSucc && "VPBB must have a single successor when handling " - "predicated replication."); - VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); - // Record predicated instructions for above packing optimizations. 
- VPBlockBase *Region = createReplicateRegion(Recipe, Plan); - VPBlockUtils::insertBlockAfter(Region, VPBB); - auto *RegSucc = new VPBasicBlock(); - VPBlockUtils::insertBlockAfter(RegSucc, Region); - VPBlockUtils::connectBlocks(RegSucc, SingleSucc); - return RegSucc; -} - -VPRegionBlock * -VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, - VPlanPtr &Plan) { - Instruction *Instr = PredRecipe->getUnderlyingInstr(); - // Instructions marked for predication are replicated and placed under an - // if-then construct to prevent side-effects. - // Generate recipes to compute the block mask for this region. - VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); - - // Build the triangular if-then region. - std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); - assert(Instr->getParent() && "Predicated instruction not in any basic block"); - auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = Instr->getType()->isVoidTy() - ? nullptr - : new VPPredInstPHIRecipe(PredRecipe); - if (PHIRecipe) { - setRecipe(Instr, PHIRecipe); - Plan->addVPValue(Instr, PHIRecipe); } else { - setRecipe(Instr, PredRecipe); - Plan->addVPValue(Instr, PredRecipe); + LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + // Instructions marked for predication are replicated and a mask operand is + // added initially. Masked replicate recipes will later be placed under an + // if-then construct to prevent side-effects. Generate recipes to compute + // the block mask for this region. + BlockInMask = createBlockInMask(I->getParent(), Plan); } - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); - - // Note: first set Entry as region entry and then connect successors starting - // from it in order, to propagate the "parent" of each VPBasicBlock. 
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); - VPBlockUtils::connectBlocks(Pred, Exiting); - - return Region; + auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), + IsUniform, BlockInMask); + return toVPRecipeResult(Recipe); } VPRecipeOrVPValueTy @@ -8643,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return nullptr; if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); @@ -8653,13 +8705,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) return toVPRecipeResult(new VPWidenGEPRecipe( - GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); + GEP, make_range(Operands.begin(), Operands.end()))); if (auto *SI = dyn_cast<SelectInst>(Instr)) { - bool InvariantCond = - PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); return toVPRecipeResult(new VPWidenSelectRecipe( - *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); + *SI, make_range(Operands.begin(), Operands.end()))); + } + + if (auto *CI = dyn_cast<CastInst>(Instr)) { + return toVPRecipeResult( + new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); @@ -8677,34 +8732,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; + if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) + VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } } @@ -8712,10 +8744,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. 
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, - bool UseLaneMaskForLoopControlFlow) { + TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); @@ -8725,6 +8756,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. + bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8732,11 +8764,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - EB->appendRecipe(CanonicalIVIncrement); - - if (UseLaneMaskForLoopControlFlow) { + if (useActiveLaneMaskForControlFlow(Style)) { // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPBasicBlock *VecPreheader = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at @@ -8745,14 +8776,34 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, {StartV}, DL, "index.part.next"); - Preheader->appendRecipe(CanonicalIVIncrementParts); + VecPreheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + auto *TCMinusVF = + new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); + VecPreheader->appendRecipe(TCMinusVF); + IncrementValue = CanonicalIVPHI; + TripCount = TCMinusVF; + } else { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + EB->appendRecipe(CanonicalIVIncrement); + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, {CanonicalIVIncrementParts, TC}, DL, "active.lane.mask.entry"); - Preheader->appendRecipe(EntryALM); + VecPreheader->appendRecipe(EntryALM); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. @@ -8763,15 +8814,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVIncrementParts = new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, - {CanonicalIVIncrement}, DL); + {IncrementValue}, DL); EB->appendRecipe(CanonicalIVIncrementParts); auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, + {CanonicalIVIncrementParts, TripCount}, DL, "active.lane.mask.next"); EB->appendRecipe(ALM); LaneMaskPhi->addOperand(ALM); + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // Do the increment of the canonical IV after the active.lane.mask, because + // that value is still based off %CanonicalIVPHI + EB->appendRecipe(CanonicalIVIncrement); + } + // We have to invert the mask here because a true condition means jumping // to the exit block. auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); @@ -8781,6 +8838,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); EB->appendRecipe(BranchBack); } else { + EB->appendRecipe(CanonicalIVIncrement); + // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction( VPInstruction::BranchOnCount, @@ -8804,14 +8863,13 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); - VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); Plan.addLiveOut(&ExitPhi, V); } } -VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, - const MapVector<Instruction *, Instruction *> &SinkAfter) { +std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8822,12 +8880,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8852,9 +8904,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // single VPInterleaveRecipe. for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + bool Result = (VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + // For scalable vectors, the only interleave factor currently supported + // is 2 since we require the (de)interleave2 intrinsics instead of + // shufflevectors. 
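The DataAndControlFlowWithoutRuntimeCheck style above feeds the active-lane-mask computation the un-incremented IV together with a trip count reduced by VF*UF (saturating at zero), and only increments the canonical IV afterwards; the other styles increment first and compare against the full trip count. Assuming the documented semantics of llvm.get.active.lane.mask (lane i is active iff base + i < n, unsigned), a standalone check that the two formulations produce the same masks when the increment cannot overflow (UF taken as 1 here; hypothetical helper, not LLVM code):

#include <cassert>
#include <cstdint>

// Lane L of get.active.lane.mask(Base, N) is assumed to be Base + L < N.
static bool laneActive(uint64_t Base, uint64_t Lane, uint64_t N) {
  return Base + Lane < N;
}

int main() {
  const uint64_t VF = 4;
  for (uint64_t TC = 1; TC < 64; ++TC) {
    // CalculateTripCountMinusVF: TC - VF, saturated at 0.
    uint64_t TCMinusVF = TC > VF ? TC - VF : 0;
    for (uint64_t IV = 0; IV < TC; IV += VF)
      for (uint64_t L = 0; L < VF; ++L)
        // increment-then-compare  vs  compare-then-increment
        assert(laneActive(IV + VF, L, TC) == laneActive(IV, L, TCMinusVF));
  }
  return 0;
}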
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + "Unsupported interleave factor for scalable vectors"); + return Result; }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,26 +8927,34 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, starting with a block for the pre-header, - // followed by a region for the vector loop, followed by the middle block. The - // skeleton vector loop region contains a header and latch block. - VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique<VPlan>(Preheader); - + // Create initial VPlan skeleton, having a basic block for the pre-header + // which contains SCEV expansions that need to happen before the CFG is + // modified; a basic block for the vector pre-header, followed by a region for + // the vector loop, followed by the middle basic block. The skeleton vector + // loop region contains a header and latch basic blocks. + VPlanPtr Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + // Don't use getDecisionAndClampRange here, because we don't know the UF + // so this function is better to be conservative, rather than to split + // it up into different VPlans. + bool IVUpdateMayOverflow = false; + for (ElementCount VF : Range) + IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); + Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle(IVUpdateMayOverflow)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8896,18 +8962,16 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; if (VPBB != HeaderVPBB) VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. 
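The new assert above encodes that, for scalable vectors, interleave groups are only formed with factor 2, because the lowering relies on the (de)interleave2 intrinsics rather than shufflevector. As a reference for what factor-2 deinterleaving means, a scalar stand-in (hypothetical; the real operation is the llvm.vector.deinterleave2 intrinsic on vectors):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Split an interleaved sequence a0,b0,a1,b1,... into its two members.
static std::pair<std::vector<int>, std::vector<int>>
deinterleave2(const std::vector<int> &V) {
  std::vector<int> Even, Odd;
  for (std::size_t I = 0; I + 1 < V.size(); I += 2) {
    Even.push_back(V[I]);
    Odd.push_back(V[I + 1]);
  }
  return {Even, Odd};
}

int main() {
  auto [A, B] = deinterleave2({1, 10, 2, 20, 3, 30});
  assert((A == std::vector<int>{1, 2, 3}));
  assert((B == std::vector<int>{10, 20, 30}));
  return 0;
}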
- for (Instruction &I : BB->instructionsWithoutDebug()) { + for (Instruction &I : BB->instructionsWithoutDebug(false)) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are @@ -8918,7 +8982,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { - Operands.push_back(Plan->getOrAddVPValue( + Operands.push_back(Plan->getVPValueOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { auto OpRange = Plan->mapToVPValues(Instr->operands()); @@ -8932,50 +8996,36 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan)) { - // If Instr can be simplified to an existing VPValue, use it. - if (RecipeOrValue.is<VPValue *>()) { - auto *VPV = RecipeOrValue.get<VPValue *>(); - Plan->addVPValue(Instr, VPV); - // If the re-used value is a recipe, register the recipe for the - // instruction, in case the recipe for Instr needs to be recorded. - if (VPRecipeBase *R = VPV->getDefiningRecipe()) - RecipeBuilder.setRecipe(Instr, R); - continue; - } - // Otherwise, add the new recipe. - VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } - - if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && - HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section - // of the header block. That can happen for truncates of induction - // variables. Those recipes are moved to the phi section of the header - // block after applying SinkAfter, which relies on the original - // position of the trunc. - assert(isa<TruncInst>(Instr)); - InductionsToMove.push_back( - cast<VPWidenIntOrFpInductionRecipe>(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( + Instr, Operands, Range, VPBB, Plan); + if (!RecipeOrValue) + RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); + // If Instr can be simplified to an existing VPValue, use it. + if (isa<VPValue *>(RecipeOrValue)) { + auto *VPV = cast<VPValue *>(RecipeOrValue); + Plan->addVPValue(Instr, VPV); + // If the re-used value is a recipe, register the recipe for the + // instruction, in case the recipe for Instr needs to be recorded. + if (VPRecipeBase *R = VPV->getDefiningRecipe()) + RecipeBuilder.setRecipe(Instr, R); continue; } - - // Otherwise, if all widening options failed, Instruction is to be - // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); - if (NextVPBB != VPBB) { - VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); + // Otherwise, add the new recipe. 
+ VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); } + + RecipeBuilder.setRecipe(Instr, Recipe); + if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && + HeaderVPBB->getFirstNonPhi() != VPBB->end()) { + // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the + // phi section of HeaderVPBB. + assert(isa<TruncInst>(Instr)); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); @@ -8985,7 +9035,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // After here, VPBB should not be used. VPBB = nullptr; - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + if (CM.requiresScalarEpilogue(Range)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + } else + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -8998,116 +9053,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Apply Sink-After legal constraints. - auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); - if (Region && Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && - Region->getNumPredecessors() == 1 && "Expected SESE region!"); - assert(R->getParent()->size() == 1 && - "A recipe in an original replicator region must be the only " - "recipe in its block"); - return Region; - } - return nullptr; - }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); - continue; - } - - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. 
- auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); - - auto *SplitPred = SplitBlock->getSinglePredecessor(); - - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - } - } - - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); - VPlanTransforms::removeRedundantInductionCasts(*Plan); - - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); - // Introduce a recipe to combine the incoming and previous values of a - // fixed-order recurrence. - for (VPRecipeBase &R : - Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); - if (!RecurPhi) - continue; - - VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); - // Fixed-order recurrences do not contain cycles, so this loop is guaranteed - // to terminate. - while (auto *PrevPhi = - dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe)) - PrevRecipe = &PrevPhi->getBackedgeRecipe(); - VPBasicBlock *InsertBlock = PrevRecipe->getParent(); - auto *Region = GetReplicateRegion(PrevRecipe); - if (Region) - InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); - if (!InsertBlock) { - InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); - VPBlockUtils::insertBlockAfter(InsertBlock, Region); - } - if (Region || PrevRecipe->isPhi()) - Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); - else - Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - - auto *RecurSplice = cast<VPInstruction>( - Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, - {RecurPhi, RecurPhi->getBackedgeValue()})); - - RecurPhi->replaceAllUsesWith(RecurSplice); - // Set the first operand of RecurSplice to RecurPhi again, after replacing - // all users. - RecurSplice->setOperand(0, RecurPhi); - } - // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. 
@@ -9122,48 +9071,66 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( StoredValues.push_back(StoreR->getStoredValue()); } + bool NeedsMaskForGaps = + IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, - Recipe->getMask()); + Recipe->getMask(), NeedsMaskForGaps); VPIG->insertBefore(Recipe); unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { + VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); if (!Member->getType()->isVoidTy()) { - VPValue *OriginalV = Plan->getVPValue(Member); - Plan->removeVPValueFor(Member); - Plan->addVPValue(Member, VPIG->getVPValue(J)); + VPValue *OriginalV = MemberR->getVPSingleValue(); OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); J++; } - RecipeBuilder.getRecipe(Member)->eraseFromParent(); + MemberR->eraseFromParent(); } } - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); Plan->setName("Initial VPlan"); + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); + // Only handle constant strides for now. + if (!ScevStride) + continue; + Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); + + auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); + // The versioned value may not be used in the loop directly, so just add a + // new live-in in those cases. + Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); + } + // From this point onwards, VPlan-to-VPlan transformations may change the plan // in ways that accessing values using original IR values is incorrect. Plan->disableValue2VPValue(); + // Sink users of fixed-order recurrence past the recipe defining the previous + // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
+ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) + return std::nullopt; + + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); + VPlanTransforms::removeRedundantInductionCasts(*Plan); + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::removeDeadRecipes(*Plan); - bool ShouldSimplify = true; - while (ShouldSimplify) { - ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan); - ShouldSimplify |= - VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan); - ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); - } + VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); - return Plan; + return std::make_optional(std::move(Plan)); } VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { @@ -9175,21 +9142,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique<VPlan>(); + auto Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); - SmallPtrSet<Instruction *, 1> DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( - OrigLoop, Plan, + Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, - DeadInstructions, *PSE.getSE(), *TLI); + *PSE.getSE(), *TLI); // Remove the existing terminator of the exiting block of the top-most region. // A BranchOnCount will be added instead when adding the canonical IV recipes. 
@@ -9198,7 +9165,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); return Plan; } @@ -9255,7 +9222,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); - CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan); + CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); } if (IsFMulAdd) { @@ -9270,7 +9237,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VecOp = FMulRecipe; } VPReductionRecipe *RedRecipe = - new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9304,13 +9271,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (!PhiR || PhiR->isInLoop()) continue; VPValue *Cond = - RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } + + VPlanTransforms::clearReductionWrapFlags(*Plan); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -9475,7 +9444,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); - Value *Step = State.get(getOperand(1), VPIteration(0, Part)); + Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); SclrGep->setName("next.gep"); @@ -9485,8 +9454,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { return; } - assert(isa<SCEVConstant>(IndDesc.getStep()) && - "Induction step not a SCEV constant!"); Type *PhiType = IndDesc.getStep()->getType(); // Build a pointer phi @@ -9506,7 +9473,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { Value *NumUnrolledElems = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", InductionLoc); // Add induction update using an incorrect block temporarily. 
The phi node @@ -9529,10 +9496,10 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { StartOffset = State.Builder.CreateAdd( StartOffset, State.Builder.CreateStepVector(VecPhiType)); - assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && + assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul( StartOffset, State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), @@ -9584,7 +9551,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask()); + getStoredValues(), getMask(), + NeedsMaskForGaps); } void VPReductionRecipe::execute(VPTransformState &State) { @@ -9640,10 +9608,9 @@ void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(UI, this, *State.Instance, - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { + if (State.VF.isVector() && shouldPack()) { // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); @@ -9663,8 +9630,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { all_of(operands(), [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); })) { - State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated, - State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); if (user_begin() != user_end()) { for (unsigned Part = 1; Part < State.UF; ++Part) State.set(this, State.get(this, VPIteration(0, 0)), @@ -9676,16 +9642,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Uniform within VL means we need to generate lane 0 only for each // unrolled copy. for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); return; } - // A store of a loop varying value to a loop invariant address only - // needs only the last copy of the store. - if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) { + // A store of a loop varying value to a uniform address only needs the last + // copy of the store. 
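The comment above on stores of a loop-varying value to a uniform address relies on the fact that only the value written by the final scalar iteration is observable after the loop, so emitting the store for the last lane of the last unrolled part preserves the program's final state. A trivial standalone illustration (hypothetical values):

#include <cassert>

int main() {
  int Slot = 0;     // uniform (loop-invariant) address written every iteration
  const int TC = 16;
  for (int I = 0; I < TC; ++I)
    Slot = I * 3;   // loop-varying value
  // The final state only depends on the last iteration, which is all the
  // vectorized loop has to reproduce for this store.
  assert(Slot == (TC - 1) * 3);
  return 0;
}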
+ if (isa<StoreInst>(UI) && + vputils::isUniformAfterVectorization(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); - State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated, + State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), State); return; } @@ -9695,8 +9661,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { @@ -9714,7 +9679,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !Consecutive; + bool CreateGatherScatter = !isConsecutive(); auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); @@ -9725,36 +9690,39 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - + Value *PartPtr = nullptr; + + // Use i32 for the gep index type when the value is constant, + // or query DataLayout for a more suitable index type otherwise. + const DataLayout &DL = + Builder.GetInsertBlock()->getModule()->getDataLayout(); + Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) + ? DL.getIndexType(ScalarDataTy->getPointerTo()) + : Builder.getInt32Ty(); bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) InBounds = gep->isInBounds(); - if (Reverse) { + if (isReverse()) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. // RunTimeVF = VScale * VF.getKnownMinValue() // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + Value *NumElt = + Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + Value *LastLane = + Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
BlockInMaskParts[Part] = Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); + Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); @@ -9774,7 +9742,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); } else { - if (Reverse) { + if (isReverse()) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); @@ -9833,7 +9801,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { static ScalarEpilogueLowering getScalarEpilogueLowering( Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. @@ -9869,7 +9836,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( }; // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) + TailFoldingInfo TFI(TLI, &LVL, IAI); + if (TTI->preferPredicateOverEpilogue(&TFI)) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; @@ -9880,9 +9848,29 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { if (hasVectorValue(Def, Part)) return Data.PerPartOutput[Def][Part]; + auto GetBroadcastInstrs = [this, Def](Value *V) { + bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); + if (VF.isScalar()) + return V; + // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); + if (SafeToHoist) { + BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( + Plan->getVectorLoopRegion()->getSinglePredecessor())]; + if (LoopVectorPreHeader) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + } + + // Place the code for broadcasting invariant variables in the new preheader. + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + return Shuf; + }; + if (!hasScalarValue(Def, {Part, 0})) { Value *IRV = Def->getLiveInIRValue(); - Value *B = ILV->getBroadcastInstrs(IRV); + Value *B = GetBroadcastInstrs(IRV); set(Def, B, Part); return B; } @@ -9900,9 +9888,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and + // VPExpandSCEVRecipes can also be uniform. 
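For the reversed (negative-stride) case handled a little earlier in this hunk, the per-part pointer is advanced by -Part * RunTimeVF elements and then by 1 - RunTimeVF elements, and the loaded or stored vector is reversed. A standalone sketch checking that this puts lane L of part P at the same element a stride -1 scalar loop touches on iteration P * RunTimeVF + L (hypothetical, element-index arithmetic only):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t RTVF = 8; // runtime VF, e.g. vscale * known-min VF
  for (int64_t Part = 0; Part < 4; ++Part) {
    // NumElt + LastLane from the snippet above, in elements from the base.
    int64_t Start = -Part * RTVF + (1 - RTVF);
    for (int64_t Lane = 0; Lane < RTVF; ++Lane) {
      // Element that ends up in lane Lane once the vector is reversed.
      int64_t Elt = Start + (RTVF - 1 - Lane);
      assert(Elt == -(Part * RTVF + Lane)); // matches the scalar iteration
    }
  }
  return 0;
}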
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || - isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) && + isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || + isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -9927,7 +9917,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { // State, we will only generate the insertelements once. Value *VectorValue = nullptr; if (IsUniform) { - VectorValue = ILV->getBroadcastInstrs(ScalarValue); + VectorValue = GetBroadcastInstrs(ScalarValue); set(Def, VectorValue, Part); } else { // Initialize packing with insertelements to start from undef. @@ -9962,15 +9952,15 @@ static bool processLoopInVPlanNativePath( Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10231,8 +10221,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -10309,11 +10299,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Use the cost model. LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); - CM.collectValuesToIgnore(); - CM.collectElementTypesForWidening(); - // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, + ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -10342,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( @@ -10464,7 +10452,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Consider vectorizing the epilogue too if it's profitable. 
VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + LVP.selectEpilogueVectorizationFactor(VF.Width, IC); if (EpilogueVF.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue @@ -10475,8 +10463,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { EPI, &LVL, &CM, BFI, PSI, Checks); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); - LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, - DT, true); + auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, + BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10492,6 +10480,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); + // Re-use the trip count and steps expanded for the main loop, as + // skeleton creation needs it as a value that dominates both the scalar + // and vector epilogue loops + // TODO: This is a workaround needed for epilogue vectorization and it + // should be removed once induction resume value creation is done + // directly in VPlan. + EpilogILV.setTripCount(MainILV.getTripCount()); + for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { + auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); + auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( + ExpandedSCEVs.find(ExpandR->getSCEV())->second); + ExpandR->replaceAllUsesWith(ExpandedVal); + ExpandR->eraseFromParent(); + } + // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated // before vectorizing the epilogue loop. @@ -10520,15 +10523,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { } ResumeV = MainILV.createInductionResumeValue( - IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); + IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), + {EPI.MainLoopIterationCountCheck}); } assert(ResumeV && "Must have a resume value"); - VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); + VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT, true); + DT, true, &ExpandedSCEVs); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) @@ -10581,14 +10585,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizeResult LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, - DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, + DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; TTI = &TTI_; DT = &DT_; - BFI = &BFI_; + BFI = BFI_; TLI = TLI_; AC = &AC_; LAIs = &LAIs_; @@ -10604,7 +10608,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl( // vector registers, loop vectorization may still enable scalar // interleaving. 
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && - TTI->getMaxInterleaveFactor(1) < 2) + TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) return LoopVectorizeResult(false, false); bool Changed = false, CFGChanged = false; @@ -10656,7 +10660,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); - auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); @@ -10666,12 +10669,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + BlockFrequencyInfo *BFI = nullptr; + if (PSI && PSI->hasProfileSummary()) + BFI = &AM.getResult<BlockFrequencyAnalysis>(F); LoopVectorizeResult Result = runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; + if (isAssignmentTrackingEnabled(*F.getParent())) { + for (auto &BB : F) + RemoveRedundantDbgInstrs(&BB); + } + // We currently do not preserve loopinfo/dominator analyses with outer loop // vectorization. Until this is addressed, mark these analyses as preserved // only for non-VPlan-native path. @@ -10679,6 +10690,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, if (!EnableVPlanNativePath) { PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<ScalarEvolutionAnalysis>(); + +#ifdef EXPENSIVE_CHECKS + SE.verify(); +#endif } if (Result.MadeCFGChange) { @@ -10699,8 +10715,8 @@ void LoopVectorizePass::printPipeline( static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( OS, MapClassName2PassName); - OS << "<"; + OS << '<'; OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; - OS << ">"; + OS << '>'; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3eb6b1804e7..821a3fa22a85 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -87,7 +87,6 @@ #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -126,6 +125,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore( cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run +// even if we match a reduction but do not vectorize in the end. 
+static cl::opt<bool> AllowHorRdxIdenityOptimization( + "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, + cl::desc("Allow optimization of original scalar identity operations on " + "matched horizontal reductions.")); + static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); @@ -287,7 +293,7 @@ static bool isCommutative(Instruction *I) { /// \returns inserting index of InsertElement or InsertValue instruction, /// using Offset as base offset for index. static std::optional<unsigned> getInsertIndex(const Value *InsertInst, - unsigned Offset = 0) { + unsigned Offset = 0) { int Index = Offset; if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { const auto *VT = dyn_cast<FixedVectorType>(IE->getType()); @@ -342,16 +348,16 @@ enum class UseMask { static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask, UseMask MaskArg) { SmallBitVector UseMask(VF, true); - for (auto P : enumerate(Mask)) { - if (P.value() == UndefMaskElem) { + for (auto [Idx, Value] : enumerate(Mask)) { + if (Value == PoisonMaskElem) { if (MaskArg == UseMask::UndefsAsMask) - UseMask.reset(P.index()); + UseMask.reset(Idx); continue; } - if (MaskArg == UseMask::FirstArg && P.value() < VF) - UseMask.reset(P.value()); - else if (MaskArg == UseMask::SecondArg && P.value() >= VF) - UseMask.reset(P.value() - VF); + if (MaskArg == UseMask::FirstArg && Value < VF) + UseMask.reset(Value); + else if (MaskArg == UseMask::SecondArg && Value >= VF) + UseMask.reset(Value - VF); } return UseMask; } @@ -374,9 +380,9 @@ static SmallBitVector isUndefVector(const Value *V, if (!UseMask.empty()) { const Value *Base = V; while (auto *II = dyn_cast<InsertElementInst>(Base)) { + Base = II->getOperand(0); if (isa<T>(II->getOperand(1))) continue; - Base = II->getOperand(0); std::optional<unsigned> Idx = getInsertIndex(II); if (!Idx) continue; @@ -461,7 +467,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; - Mask.assign(VL.size(), UndefMaskElem); + Mask.assign(VL.size(), PoisonMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { // Undef can be represented as an undef element in a vector. if (isa<UndefValue>(VL[I])) @@ -533,6 +539,117 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) { return *EI->idx_begin(); } +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +static std::optional<TTI::ShuffleKind> +tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. 
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx; + SmallVector<int> UndefVectorExtracts; + for (int I = 0, E = VL.size(); I < E; ++I) { + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI) { + if (isa<UndefValue>(VL[I])) + UndefVectorExtracts.push_back(I); + continue; + } + auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); + if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) + continue; + std::optional<unsigned> Idx = getExtractIndex(EI); + // Undefined index. + if (!Idx) { + UndefVectorExtracts.push_back(I); + continue; + } + SmallBitVector ExtractMask(VecTy->getNumElements(), true); + ExtractMask.reset(*Idx); + if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { + UndefVectorExtracts.push_back(I); + continue; + } + VectorOpToIdx[EI->getVectorOperand()].push_back(I); + } + // Sort the vector operands by the maximum number of uses in extractelements. + MapVector<unsigned, SmallVector<Value *>> VFToVector; + for (const auto &Data : VectorOpToIdx) + VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] + .push_back(Data.first); + for (auto &Data : VFToVector) { + stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { + return VectorOpToIdx.find(V1)->second.size() > + VectorOpToIdx.find(V2)->second.size(); + }); + } + // Find the best pair of the vectors with the same number of elements or a + // single vector. + const int UndefSz = UndefVectorExtracts.size(); + unsigned SingleMax = 0; + Value *SingleVec = nullptr; + unsigned PairMax = 0; + std::pair<Value *, Value *> PairVec(nullptr, nullptr); + for (auto &Data : VFToVector) { + Value *V1 = Data.second.front(); + if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { + SingleMax = VectorOpToIdx[V1].size() + UndefSz; + SingleVec = V1; + } + Value *V2 = nullptr; + if (Data.second.size() > 1) + V2 = *std::next(Data.second.begin()); + if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + + UndefSz) { + PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; + PairVec = std::make_pair(V1, V2); + } + } + if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) + return std::nullopt; + // Check if better to perform a shuffle of 2 vectors or just of a single + // vector. + SmallVector<Value *> SavedVL(VL.begin(), VL.end()); + SmallVector<Value *> GatheredExtracts( + VL.size(), PoisonValue::get(VL.front()->getType())); + if (SingleMax >= PairMax && SingleMax) { + for (int Idx : VectorOpToIdx[SingleVec]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } else { + for (Value *V : {PairVec.first, PairVec.second}) + for (int Idx : VectorOpToIdx[V]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } + // Add extracts from undefs too. + for (int Idx : UndefVectorExtracts) + std::swap(GatheredExtracts[Idx], VL[Idx]); + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + std::optional<TTI::ShuffleKind> Res = + isFixedVectorShuffle(GatheredExtracts, Mask); + if (!Res) { + // TODO: try to check other subsets if possible. + // Restore the original VL if attempt was not successful. + VL.swap(SavedVL); + return std::nullopt; + } + // Restore unused scalars from mask, if some of the extractelements were not + // selected for shuffle. 
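The selection logic above picks between shuffling one source vector or two: per source vector it counts how many gathered scalars it can supply, undefined scalars count towards every candidate, and a pair wins only if it covers more scalars than the best single vector. A much-simplified standalone sketch of that counting (plain integers stand in for source vectors; the real code additionally groups candidates by element count and only keeps the two most-used vectors per group):

#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

int main() {
  // Source vector id per gathered scalar; -1 marks an undef scalar.
  std::vector<int> ExtractSources = {0, 1, -1, 0, 1, 1};

  std::map<int, int> Uses;
  int Undefs = 0;
  for (int Src : ExtractSources) {
    if (Src < 0)
      ++Undefs;
    else
      ++Uses[Src];
  }

  // Best single source vector (undefs can be folded into any shuffle).
  int SingleMax = 0;
  for (const auto &P : Uses)
    SingleMax = std::max(SingleMax, P.second + Undefs);

  // Best pair of source vectors.
  int PairMax = 0;
  for (const auto &A : Uses)
    for (const auto &B : Uses)
      if (A.first < B.first)
        PairMax = std::max(PairMax, A.second + B.second + Undefs);

  assert(SingleMax == 4);      // vector 1 plus the undef scalar
  assert(PairMax == 6);        // vectors 0 and 1 cover all six scalars
  assert(PairMax > SingleMax); // so a two-input shuffle would be chosen here
  return 0;
}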
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || + !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || + is_contained(UndefVectorExtracts, I)) + continue; + if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I])) + std::swap(VL[I], GatheredExtracts[I]); + } + return Res; +} + namespace { /// Main data required for vectorization of instructions. @@ -829,18 +946,29 @@ static bool isSimple(Instruction *I) { } /// Shuffles \p Mask in accordance with the given \p SubMask. -static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { +/// \param ExtendingManyInputs Supports reshuffling of the mask with not only +/// one but two input vectors. +static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, + bool ExtendingManyInputs = false) { if (SubMask.empty()) return; + assert( + (!ExtendingManyInputs || SubMask.size() > Mask.size() || + // Check if input scalars were extended to match the size of other node. + (SubMask.size() == Mask.size() && + std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(), + [](int Idx) { return Idx == PoisonMaskElem; }))) && + "SubMask with many inputs support must be larger than the mask."); if (Mask.empty()) { Mask.append(SubMask.begin(), SubMask.end()); return; } - SmallVector<int> NewMask(SubMask.size(), UndefMaskElem); + SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem); int TermValue = std::min(Mask.size(), SubMask.size()); for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) + if (SubMask[I] == PoisonMaskElem || + (!ExtendingManyInputs && + (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue))) continue; NewMask[I] = Mask[SubMask[I]]; } @@ -887,7 +1015,7 @@ static void inversePermutation(ArrayRef<unsigned> Indices, SmallVectorImpl<int> &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, UndefMaskElem); + Mask.resize(E, PoisonMaskElem); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } @@ -900,7 +1028,7 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars, UndefValue::get(Scalars.front()->getType())); Prev.swap(Scalars); for (unsigned I = 0, E = Prev.size(); I < E; ++I) - if (Mask[I] != UndefMaskElem) + if (Mask[I] != PoisonMaskElem) Scalars[Mask[I]] = Prev[I]; } @@ -962,6 +1090,7 @@ namespace slpvectorizer { class BoUpSLP { struct TreeEntry; struct ScheduleData; + class ShuffleCostEstimator; class ShuffleInstructionBuilder; public: @@ -1006,8 +1135,12 @@ public: /// Vectorize the tree but with the list of externally used values \p /// ExternallyUsedValues. Values in this MapVector can be replaced but the /// generated extractvalue instructions. - Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, - Instruction *ReductionRoot = nullptr); + /// \param ReplacedExternals containd list of replaced external values + /// {scalar, replace} after emitting extractelement for external uses. + Value * + vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, + SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals, + Instruction *ReductionRoot = nullptr); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. @@ -1025,24 +1158,18 @@ public: /// Construct a vectorizable tree that starts at \p Roots. 
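addMask above composes two shuffle masks in place: if Mask is applied first and SubMask on top of its result, lane I of the combined shuffle reads Mask[SubMask[I]], and -1 (PoisonMaskElem) propagates from either mask. A simplified standalone sketch of that composition (it ignores the TermValue bounds handling and the new ExtendingManyInputs mode):

#include <cassert>
#include <cstddef>
#include <vector>

static constexpr int PoisonMaskElem = -1; // same convention as the masks above

static std::vector<int> composeMasks(std::vector<int> Mask,
                                     const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return Mask;
  if (Mask.empty())
    return SubMask;
  std::vector<int> NewMask(SubMask.size(), PoisonMaskElem);
  for (std::size_t I = 0; I < SubMask.size(); ++I)
    if (SubMask[I] != PoisonMaskElem && Mask[SubMask[I]] != PoisonMaskElem)
      NewMask[I] = Mask[SubMask[I]]; // shuffle-of-a-shuffle
  return NewMask;
}

int main() {
  // First shuffle picks elements {2,0,1,3}; shuffling that result by
  // {3,poison,0,2} is the same as picking {3,poison,2,1} of the original.
  auto R = composeMasks({2, 0, 1, 3}, {3, PoisonMaskElem, 0, 2});
  assert((R == std::vector<int>{3, PoisonMaskElem, 2, 1}));
  return 0;
}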
void buildTree(ArrayRef<Value *> Roots); - /// Checks if the very first tree node is going to be vectorized. - bool isVectorizedFirstNode() const { - return !VectorizableTree.empty() && - VectorizableTree.front()->State == TreeEntry::Vectorize; - } - - /// Returns the main instruction for the very first node. - Instruction *getFirstNodeMainOp() const { - assert(!VectorizableTree.empty() && "No tree to get the first node from"); - return VectorizableTree.front()->getMainOp(); - } - /// Returns whether the root node has in-tree uses. bool doesRootHaveInTreeUses() const { return !VectorizableTree.empty() && !VectorizableTree.front()->UserTreeIndices.empty(); } + /// Return the scalars of the root node. + ArrayRef<Value *> getRootNodeScalars() const { + assert(!VectorizableTree.empty() && "No graph to get the first node from"); + return VectorizableTree.front()->Scalars; + } + /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. \p /// ExternallyUsedValues contains additional list of external uses to handle @@ -1064,6 +1191,8 @@ public: MinBWs.clear(); InstrElementSize.clear(); UserIgnoreList = nullptr; + PostponedGathers.clear(); + ValueToGatherNodes.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -1083,9 +1212,12 @@ public: /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. + /// \return std::nullopt if ordering is not important, empty order, if + /// identity order is important, or the actual order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. - std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + std::optional<OrdersType> getReorderingData(const TreeEntry &TE, + bool TopToBottom); /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes @@ -1328,8 +1460,14 @@ public: ConstantInt *Ex1Idx; if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { // Undefs are always profitable for extractelements. + // Compiler can easily combine poison and extractelement <non-poison> or + // undef and extractelement <poison>. But combining undef + + // extractelement <non-poison-but-may-produce-poison> requires some + // extra operations. if (isa<UndefValue>(V2)) - return LookAheadHeuristics::ScoreConsecutiveExtracts; + return (isa<PoisonValue>(V2) || isUndefVector(EV1).all()) + ? LookAheadHeuristics::ScoreConsecutiveExtracts + : LookAheadHeuristics::ScoreSameOpcode; Value *EV2 = nullptr; ConstantInt *Ex2Idx = nullptr; if (match(V2, @@ -1683,9 +1821,10 @@ public: // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return std::nullopt. - std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, - ArrayRef<ReorderingMode> ReorderingModes, - ArrayRef<Value *> MainAltOps) { + std::optional<unsigned> + getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes, + ArrayRef<Value *> MainAltOps) { unsigned NumOperands = getNumOperands(); // The operand of the previous lane at OpIdx. @@ -2299,7 +2438,8 @@ private: /// \returns the cost of the vectorizable entry. 
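The new doc comment on getReorderingData() spells out a three-way return contract that callers have to distinguish. A minimal consumer-side illustration of that contract (a sketch, not the actual reordering code):

#include <optional>
#include <vector>

using OrdersTy = std::vector<unsigned>;

void applyReorderingDecision(const std::optional<OrdersTy> &Order) {
  if (!Order) {
    // std::nullopt: ordering is not important; the node can follow whatever
    // order its users prefer.
    return;
  }
  if (Order->empty()) {
    // Empty order: the identity order itself is important, keep the scalars
    // exactly as they are.
    return;
  }
  // Non-empty order: a concrete permutation to apply to the node's scalars.
}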
InstructionCost getEntryCost(const TreeEntry *E, - ArrayRef<Value *> VectorizedVals); + ArrayRef<Value *> VectorizedVals, + SmallPtrSetImpl<Value *> &CheckedExtracts); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -2323,15 +2463,13 @@ private: /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. - Value *createBuildVector(const TreeEntry *E); + template <typename BVTy, typename ResTy, typename... Args> + ResTy processBuildVector(const TreeEntry *E, Args &...Params); - /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. If \p - /// NeedToShuffle is true, need to add a cost of reshuffling some of the - /// vector elements. - InstructionCost getGatherCost(FixedVectorType *Ty, - const APInt &ShuffledIndices, - bool NeedToShuffle) const; + /// Create a new vector from a list of scalar values. Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + Value *createBuildVector(const TreeEntry *E); /// Returns the instruction in the bundle, which can be used as a base point /// for scheduling. Usually it is the last instruction in the bundle, except @@ -2354,14 +2492,16 @@ private: /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - InstructionCost getGatherCost(ArrayRef<Value *> VL) const; + /// \param ForPoisonSrc true if initial vector is poison, false otherwise. + InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const; /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(const TreeEntry *E); - /// \returns a vector from a collection of scalars in \p VL. - Value *gather(ArrayRef<Value *> VL); + /// \returns a vector from a collection of scalars in \p VL. if \p Root is not + /// specified, the starting vector value is poison. + Value *gather(ArrayRef<Value *> VL, Value *Root); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -2400,6 +2540,14 @@ private: using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} + /// \returns Common mask for reorder indices and reused scalars. + SmallVector<int> getCommonMask() const { + SmallVector<int> Mask; + inversePermutation(ReorderIndices, Mask); + ::addMask(Mask, ReuseShuffleIndices); + return Mask; + } + /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { @@ -2409,8 +2557,8 @@ private: std::equal(VL.begin(), VL.end(), Mask.begin(), [Scalars](Value *V, int Idx) { return (isa<UndefValue>(V) && - Idx == UndefMaskElem) || - (Idx != UndefMaskElem && V == Scalars[Idx]); + Idx == PoisonMaskElem) || + (Idx != PoisonMaskElem && V == Scalars[Idx]); }); }; if (!ReorderIndices.empty()) { @@ -2471,7 +2619,7 @@ private: ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. 
- Value *VectorizedValue = nullptr; + WeakTrackingVH VectorizedValue = nullptr; /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather @@ -2684,20 +2832,22 @@ private: #ifndef NDEBUG void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, - InstructionCost VecCost, - InstructionCost ScalarCost) const { - dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); + InstructionCost VecCost, InstructionCost ScalarCost, + StringRef Banner) const { + dbgs() << "SLP: " << Banner << ":\n"; + E->dump(); dbgs() << "SLP: Costs:\n"; dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; dbgs() << "SLP: VectorCost = " << VecCost << "\n"; dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; - dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << - ReuseShuffleCost + VecCost - ScalarCost << "\n"; + dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " + << ReuseShuffleCost + VecCost - ScalarCost << "\n"; } #endif /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, + std::optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<int> ReuseShuffleIndices = std::nullopt, @@ -2791,8 +2941,14 @@ private: return ScalarToTreeEntry.lookup(V); } + /// Checks if the specified list of the instructions/values can be vectorized + /// and fills required data before actual scheduling of the instructions. + TreeEntry::EntryState getScalarsVectorizationState( + InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, + OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const; + /// Maps a specific scalar to its tree entry. - SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; + SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; @@ -2808,6 +2964,15 @@ private: /// pre-gather them before. DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction; + /// List of gather nodes, depending on other gather/vector nodes, which should + /// be emitted after the vector instruction emission process to correctly + /// handle order of the vector instructions and shuffles. + SetVector<const TreeEntry *> PostponedGathers; + + using ValueToGatherNodesMap = + DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>; + ValueToGatherNodesMap ValueToGatherNodes; + /// This POD struct describes one external user in the vectorized tree. 
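The dumpTreeCosts() change above prints the same quantity the cost model ultimately decides on. A tiny sketch of that decision, with plain integers standing in for InstructionCost:

#include <cstdint>

// A tree entry is only worth vectorizing when the vector cost, plus any
// shuffle needed to recreate reused scalars, beats the summed scalar cost.
bool isProfitable(std::int64_t ReuseShuffleCost, std::int64_t VecCost,
                  std::int64_t ScalarCost) {
  // Negative delta == the vector form is cheaper than the scalar form.
  return ReuseShuffleCost + VecCost - ScalarCost < 0;
}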
struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -3235,7 +3400,6 @@ private: << "SLP: gets ready (ctl): " << *DepBundle << "\n"); } } - } } @@ -3579,7 +3743,7 @@ static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { SmallVector<int> Prev(Reuses.begin(), Reuses.end()); Prev.swap(Reuses); for (unsigned I = 0, E = Prev.size(); I < E; ++I) - if (Mask[I] != UndefMaskElem) + if (Mask[I] != PoisonMaskElem) Reuses[Mask[I]] = Prev[I]; } @@ -3603,7 +3767,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { } Order.assign(Mask.size(), Mask.size()); for (unsigned I = 0, E = Mask.size(); I < E; ++I) - if (MaskOrder[I] != UndefMaskElem) + if (MaskOrder[I] != PoisonMaskElem) Order[MaskOrder[I]] = I; fixupOrderingIndices(Order); } @@ -3653,10 +3817,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return false; return true; }; - if (IsIdentityOrder(CurrentOrder)) { - CurrentOrder.clear(); - return CurrentOrder; - } + if (IsIdentityOrder(CurrentOrder)) + return OrdersType(); auto *It = CurrentOrder.begin(); for (unsigned I = 0; I < NumScalars;) { if (UsedPositions.test(I)) { @@ -3669,7 +3831,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { } ++It; } - return CurrentOrder; + return std::move(CurrentOrder); } return std::nullopt; } @@ -3779,9 +3941,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, return LoadsState::Gather; } -bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &SortedIndices) { +static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { assert(llvm::all_of( VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && "Expected list of pointer operands."); @@ -3825,7 +3987,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, return std::get<1>(X) < std::get<1>(Y); }); int InitialOffset = std::get<1>(Vec[0]); - AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) { + AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) { return std::get<1>(P.value()) == int(P.index()) + InitialOffset; }); } @@ -3862,7 +4024,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { BoUpSLP::OrdersType Order; if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) - return Order; + return std::move(Order); return std::nullopt; } @@ -3888,31 +4050,35 @@ static bool areTwoInsertFromSameBuildVector( // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. 
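clusterSortPtrAccesses(), made static in this hunk, buckets pointers by underlying object and then looks for a bucket whose sorted offsets form a consecutive run. A simplified standalone model of that check, with pointer operands reduced to a base id plus a constant element offset (names invented for the sketch):

#include <algorithm>
#include <map>
#include <vector>

struct PtrAccess {
  int Base;    // stand-in for getUnderlyingObject()
  int Offset;  // element offset from that base
};

bool anyConsecutiveCluster(const std::vector<PtrAccess> &Ptrs) {
  std::map<int, std::vector<int>> Buckets;
  for (const PtrAccess &P : Ptrs)
    Buckets[P.Base].push_back(P.Offset);
  bool AnyConsecutive = false;
  for (auto &Entry : Buckets) {
    std::vector<int> &Offsets = Entry.second;
    std::sort(Offsets.begin(), Offsets.end());
    bool Consecutive = true;
    for (size_t I = 0; I < Offsets.size(); ++I)
      Consecutive &= (Offsets[I] == Offsets.front() + static_cast<int>(I));
    AnyConsecutive |= Consecutive;
  }
  return AnyConsecutive;
}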
+ SmallSet<int, 8> ReusedIdx; + bool IsReusedIdx = false; do { - if (IE2 == VU) + if (IE2 == VU && !IE1) return VU->hasOneUse(); - if (IE1 == V) + if (IE1 == V && !IE2) return V->hasOneUse(); - if (IE1) { - if ((IE1 != VU && !IE1->hasOneUse()) || - getInsertIndex(IE1).value_or(*Idx2) == *Idx2) + if (IE1 && IE1 != V) { + IsReusedIdx |= + !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second; + if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) IE1 = nullptr; else IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); } - if (IE2) { - if ((IE2 != V && !IE2->hasOneUse()) || - getInsertIndex(IE2).value_or(*Idx1) == *Idx1) + if (IE2 && IE2 != VU) { + IsReusedIdx |= + !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second; + if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) IE2 = nullptr; else IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2)); } - } while (IE1 || IE2); + } while (!IsReusedIdx && (IE1 || IE2)); return false; } -std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, - bool TopToBottom) { +std::optional<BoUpSLP::OrdersType> +BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) { @@ -3936,14 +4102,14 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V)); return Idx && *Idx < Sz; })) { - SmallVector<int> ReorderMask(Sz, UndefMaskElem); + SmallVector<int> ReorderMask(Sz, PoisonMaskElem); if (TE.ReorderIndices.empty()) std::iota(ReorderMask.begin(), ReorderMask.end(), 0); else inversePermutation(TE.ReorderIndices, ReorderMask); for (unsigned I = 0; I < VF; ++I) { int &Idx = ReusedMask[I]; - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; Value *V = TE.Scalars[ReorderMask[Idx]]; std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V)); @@ -3958,7 +4124,7 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T for (unsigned K = 0; K < VF; K += Sz) { OrdersType CurrentOrder(TE.ReorderIndices); SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)}; - if (SubMask.front() == UndefMaskElem) + if (SubMask.front() == PoisonMaskElem) std::iota(SubMask.begin(), SubMask.end(), 0); reorderOrder(CurrentOrder, SubMask); transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); @@ -3966,8 +4132,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T } if (all_of(enumerate(ResOrder), [](const auto &Data) { return Data.index() == Data.value(); })) - return {}; // Use identity order. - return ResOrder; + return std::nullopt; // No need to reorder. 
+ return std::move(ResOrder); } if (TE.State == TreeEntry::Vectorize && (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || @@ -3976,6 +4142,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + if (V1 == V2) + return false; if (!V1->hasOneUse() || !V2->hasOneUse()) return false; auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin()); @@ -4023,8 +4191,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; if (IsIdentityOrder(ResOrder)) - return {}; - return ResOrder; + return std::nullopt; // No need to reorder. + return std::move(ResOrder); } if (TE.State == TreeEntry::NeedToGather) { // TODO: add analysis of other gather nodes with extractelement @@ -4050,7 +4218,42 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T if (Reuse || !CurrentOrder.empty()) { if (!CurrentOrder.empty()) fixupOrderingIndices(CurrentOrder); - return CurrentOrder; + return std::move(CurrentOrder); + } + } + // If the gather node is <undef, v, .., poison> and + // insertelement poison, v, 0 [+ permute] + // is cheaper than + // insertelement poison, v, n - try to reorder. + // If rotating the whole graph, exclude the permute cost, the whole graph + // might be transformed. + int Sz = TE.Scalars.size(); + if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) && + count_if(TE.Scalars, UndefValue::classof) == Sz - 1) { + const auto *It = + find_if(TE.Scalars, [](Value *V) { return !isConstant(V); }); + if (It == TE.Scalars.begin()) + return OrdersType(); + auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz); + if (It != TE.Scalars.end()) { + OrdersType Order(Sz, Sz); + unsigned Idx = std::distance(TE.Scalars.begin(), It); + Order[Idx] = 0; + fixupOrderingIndices(Order); + SmallVector<int> Mask; + inversePermutation(Order, Mask); + InstructionCost PermuteCost = + TopToBottom + ? 0 + : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask); + InstructionCost InsertFirstCost = TTI->getVectorInstrCost( + Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0, + PoisonValue::get(Ty), *It); + InstructionCost InsertIdxCost = TTI->getVectorInstrCost( + Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx, + PoisonValue::get(Ty), *It); + if (InsertFirstCost + PermuteCost < InsertIdxCost) + return std::move(Order); } } if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) @@ -4260,7 +4463,7 @@ void BoUpSLP::reorderTopToBottom() { unsigned E = Order.size(); OrdersType CurrentOrder(E, E); transform(Mask, CurrentOrder.begin(), [E](int Idx) { - return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; @@ -4285,10 +4488,10 @@ void BoUpSLP::reorderTopToBottom() { continue; SmallVector<int> Mask; inversePermutation(BestOrder, Mask); - SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); unsigned E = BestOrder.size(); transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { - return I < E ? static_cast<int>(I) : UndefMaskElem; + return I < E ? 
static_cast<int>(I) : PoisonMaskElem; }); // Do an actual reordering, if profitable. for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { @@ -4384,7 +4587,7 @@ bool BoUpSLP::canReorderOperands( } return false; }) > 1 && - !all_of(UserTE->getOperand(I), isConstant)) + !allConstant(UserTE->getOperand(I))) return false; if (Gather) GatherOps.push_back(Gather); @@ -4499,7 +4702,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { unsigned E = Order.size(); OrdersType CurrentOrder(E, E); transform(Mask, CurrentOrder.begin(), [E](int Idx) { - return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += @@ -4578,10 +4781,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { VisitedOps.clear(); SmallVector<int> Mask; inversePermutation(BestOrder, Mask); - SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); unsigned E = BestOrder.size(); transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { - return I < E ? static_cast<int>(I) : UndefMaskElem; + return I < E ? static_cast<int>(I) : PoisonMaskElem; }); for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { TreeEntry *TE = Op.second; @@ -4779,7 +4982,7 @@ bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, // Check if the stores are consecutive by checking if their difference is 1. for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size())) - if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1) + if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1) return false; // Calculate the shuffle indices according to their offset against the sorted @@ -4976,6 +5179,309 @@ static bool isAlternateInstruction(const Instruction *I, const Instruction *AltOp, const TargetLibraryInfo &TLI); +BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( + InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, + OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const { + assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); + + unsigned ShuffleOrOp = + S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); + auto *VL0 = cast<Instruction>(S.OpValue); + switch (ShuffleOrOp) { + case Instruction::PHI: { + // Check for terminator values (e.g. invoke). + for (Value *V : VL) + for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { + Instruction *Term = dyn_cast<Instruction>(Incoming); + if (Term && Term->isTerminator()) { + LLVM_DEBUG(dbgs() + << "SLP: Need to swizzle PHINodes (terminator use).\n"); + return TreeEntry::NeedToGather; + } + } + + return TreeEntry::Vectorize; + } + case Instruction::ExtractValue: + case Instruction::ExtractElement: { + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + if (Reuse || !CurrentOrder.empty()) + return TreeEntry::Vectorize; + LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); + return TreeEntry::NeedToGather; + } + case Instruction::InsertElement: { + // Check that we have a buildvector and not a shuffle of 2 or more + // different vectors. 
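The reorderTopToBottom/reorderBottomToTop hunks above repeatedly convert a shuffle mask into an ordering, mapping poison lanes to the out-of-range sentinel E and then fixing up the unassigned slots. A standalone sketch of that conversion, with a simplified stand-in for fixupOrderingIndices():

#include <vector>

constexpr int PoisonElem = -1;

std::vector<unsigned> maskToOrder(const std::vector<int> &Mask) {
  const unsigned E = Mask.size();
  std::vector<unsigned> Order(E, E);
  for (unsigned I = 0; I < E; ++I)
    Order[I] = Mask[I] == PoisonElem ? E : static_cast<unsigned>(Mask[I]);
  // Fixup: give every slot still holding the sentinel E an index that is not
  // used yet, so the order becomes a proper permutation.
  std::vector<bool> Used(E, false);
  for (unsigned V : Order)
    if (V < E)
      Used[V] = true;
  unsigned Next = 0;
  for (unsigned &V : Order)
    if (V == E) {
      while (Used[Next])
        ++Next;
      V = Next;
      Used[Next] = true;
    }
  return Order;
}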
+ ValueSet SourceVectors; + for (Value *V : VL) { + SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); + assert(getInsertIndex(V) != std::nullopt && + "Non-constant or undef index?"); + } + + if (count_if(VL, [&SourceVectors](Value *V) { + return !SourceVectors.contains(V); + }) >= 2) { + // Found 2nd source vector - cancel. + LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " + "different source vectors.\n"); + return TreeEntry::NeedToGather; + } + + return TreeEntry::Vectorize; + } + case Instruction::Load: { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder, + PointerOps)) { + case LoadsState::Vectorize: + return TreeEntry::Vectorize; + case LoadsState::ScatterVectorize: + return TreeEntry::ScatterVectorize; + case LoadsState::Gather: +#ifndef NDEBUG + Type *ScalarTy = VL0->getType(); + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + else if (any_of(VL, + [](Value *V) { return !cast<LoadInst>(V)->isSimple(); })) + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + else + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +#endif // NDEBUG + return TreeEntry::NeedToGather; + } + llvm_unreachable("Unexpected state of loads"); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); + if (Ty != SrcTy || !isValidElementType(Ty)) { + LLVM_DEBUG( + dbgs() << "SLP: Gathering casts with different src types.\n"); + return TreeEntry::NeedToGather; + } + } + return TreeEntry::Vectorize; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. 
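The Load case moved into getScalarsVectorizationState() above resolves to one of three states. A simplified standalone classification in the same spirit (not canVectorizeLoads itself; alignment, target legality of masked gathers, and run-time pointer analysis are all glossed over, and offsets are assumed to be either known or absent):

#include <algorithm>
#include <optional>
#include <vector>

enum class LoadsState { Vectorize, ScatterVectorize, Gather };

LoadsState classifyLoads(const std::vector<std::optional<int>> &ElemOffsets,
                         bool AllSimple) {
  if (!AllSimple)
    return LoadsState::Gather;          // atomic/volatile loads stay scalar
  std::vector<int> Offs;
  for (const auto &O : ElemOffsets) {
    if (!O)
      return LoadsState::Gather;        // offset not computable at all
    Offs.push_back(*O);
  }
  std::sort(Offs.begin(), Offs.end());
  bool Consecutive = true;
  for (size_t I = 0; I < Offs.size(); ++I)
    Consecutive &= (Offs[I] == Offs.front() + static_cast<int>(I));
  // Consecutive offsets -> one wide load; known but scattered offsets ->
  // llvm.masked.gather.
  return Consecutive ? LoadsState::Vectorize : LoadsState::ScatterVectorize;
}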
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); + Type *ComparedTy = VL0->getOperand(0)->getType(); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); + if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || + Cmp->getOperand(0)->getType() != ComparedTy) { + LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return TreeEntry::NeedToGather; + } + } + return TreeEntry::Vectorize; + } + case Instruction::Select: + case Instruction::FNeg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TreeEntry::Vectorize; + case Instruction::GetElementPtr: { + // We don't combine GEPs with complicated (nested) indexing. + for (Value *V : VL) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + if (I->getNumOperands() != 2) { + LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); + return TreeEntry::NeedToGather; + } + } + + // We can't combine several GEPs into one vector if they operate on + // different types. + Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); + for (Value *V : VL) { + auto *GEP = dyn_cast<GEPOperator>(V); + if (!GEP) + continue; + Type *CurTy = GEP->getSourceElementType(); + if (Ty0 != CurTy) { + LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); + return TreeEntry::NeedToGather; + } + } + + // We don't combine GEPs with non-constant indexes. + Type *Ty1 = VL0->getOperand(1)->getType(); + for (Value *V : VL) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + auto *Op = I->getOperand(1); + if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || + (Op->getType() != Ty1 && + ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace())))) { + LLVM_DEBUG( + dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); + return TreeEntry::NeedToGather; + } + } + + return TreeEntry::Vectorize; + } + case Instruction::Store: { + // Check if the stores are consecutive or if we need to swizzle them. + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Avoid types that are padded when being allocated as scalars, while + // being packed together in a vector (such as i1). + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); + return TreeEntry::NeedToGather; + } + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); + return TreeEntry::NeedToGather; + } + PointerOps.push_back(SI->getPointerOperand()); + } + + // Check the order of pointer operands. 
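The compare-bundle check above accepts each compare only if it uses the first compare's predicate or its swapped form (operand commuting happens later when operands are collected). A standalone sketch with a toy predicate enum standing in for CmpInst::Predicate:

#include <vector>

enum class Pred { LT, GT, LE, GE, EQ, NE };

Pred swapped(Pred P) {
  switch (P) {
  case Pred::LT: return Pred::GT;
  case Pred::GT: return Pred::LT;
  case Pred::LE: return Pred::GE;
  case Pred::GE: return Pred::LE;
  default:       return P;   // EQ / NE are their own swapped predicate
  }
}

bool canVectorizeCmpBundle(const std::vector<Pred> &Preds) {
  if (Preds.empty())
    return false;
  Pred P0 = Preds.front(), SwapP0 = swapped(P0);
  for (Pred P : Preds)
    if (P != P0 && P != SwapP0)
      return false;          // mixed predicates -> gather the bundle
  return true;
}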
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + std::optional<int> Dist = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); + // Check that the sorted pointer operands are consecutive. + if (static_cast<unsigned>(*Dist) == VL.size() - 1) + return TreeEntry::Vectorize; + } + + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + return TreeEntry::NeedToGather; + } + case Instruction::Call: { + // Check if the calls are all to the same vectorizable intrinsic or + // library function. + CallInst *CI = cast<CallInst>(VL0); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + VFShape Shape = VFShape::get( + *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + + if (!VecFunc && !isTriviallyVectorizable(ID)) { + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return TreeEntry::NeedToGather; + } + Function *F = CI->getCalledFunction(); + unsigned NumArgs = CI->arg_size(); + SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr); + for (unsigned J = 0; J != NumArgs; ++J) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) + ScalarArgs[J] = CI->getArgOperand(J); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); + if (!CI2 || CI2->getCalledFunction() != F || + getVectorIntrinsicIDForCall(CI2, TLI) != ID || + (VecFunc && + VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || + !CI->hasIdenticalOperandBundleSchema(*CI2)) { + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V + << "\n"); + return TreeEntry::NeedToGather; + } + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized. + for (unsigned J = 0; J != NumArgs; ++J) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) { + Value *A1J = CI2->getArgOperand(J); + if (ScalarArgs[J] != A1J) { + LLVM_DEBUG(dbgs() + << "SLP: mismatched arguments in call:" << *CI + << " argument " << ScalarArgs[J] << "!=" << A1J << "\n"); + return TreeEntry::NeedToGather; + } + } + } + // Verify that the bundle operands are identical between the two calls. + if (CI->hasOperandBundles() && + !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), + CI->op_begin() + CI->getBundleOperandsEndIndex(), + CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { + LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI + << "!=" << *V << '\n'); + return TreeEntry::NeedToGather; + } + } + + return TreeEntry::Vectorize; + } + case Instruction::ShuffleVector: { + // If this is not an alternate sequence of opcode like add-sub + // then do not vectorize this instruction. + if (!S.isAltShuffle()) { + LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); + return TreeEntry::NeedToGather; + } + return TreeEntry::Vectorize; + } + default: + LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return TreeEntry::NeedToGather; + } +} + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -4990,7 +5496,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *V : VL) { if (isConstant(V)) { ReuseShuffleIndicies.emplace_back( - isa<UndefValue>(V) ? 
UndefMaskElem : UniqueValues.size()); + isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size()); UniqueValues.emplace_back(V); continue; } @@ -5010,7 +5516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return isa<UndefValue>(V) || !isConstant(V); })) || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; @@ -5257,6 +5763,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!TryToFindDuplicates(S)) return; + // Perform specific checks for each particular instruction kind. + OrdersType CurrentOrder; + SmallVector<Value *> PointerOps; + TreeEntry::EntryState State = getScalarsVectorizationState( + S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); + if (State == TreeEntry::NeedToGather) { + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + auto &BSRef = BlocksSchedules[BB]; if (!BSRef) BSRef = std::make_unique<BlockScheduling>(BB); @@ -5285,20 +5802,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::PHI: { auto *PH = cast<PHINode>(VL0); - // Check for terminator values (e.g. invoke). - for (Value *V : VL) - for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { - Instruction *Term = dyn_cast<Instruction>(Incoming); - if (Term && Term->isTerminator()) { - LLVM_DEBUG(dbgs() - << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; - } - } - TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); @@ -5326,9 +5829,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::ExtractValue: case Instruction::ExtractElement: { - OrdersType CurrentOrder; - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); - if (Reuse) { + if (CurrentOrder.empty()) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -5339,55 +5840,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, VectorizableTree.back()->setOperand(0, Op0); return; } - if (!CurrentOrder.empty()) { - LLVM_DEBUG({ - dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " - "with order"; - for (unsigned Idx : CurrentOrder) - dbgs() << " " << Idx; - dbgs() << "\n"; - }); - fixupOrderingIndices(CurrentOrder); - // Insert new order with initial value 0, if it does not exist, - // otherwise return the iterator to the existing one. - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - // This is a special case, as it does not gather, but at the same time - // we are not extending buildTree_rec() towards the operands. 
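The duplicate-handling code in buildTree_rec() above builds a list of unique scalars plus a reuse mask, and only keeps the bundle if the unique count is a power of two (now spelled has_single_bit). A standalone model of that step, with scalars reduced to ints, 0 standing in for undef, and constants other than undef simply deduplicated like any other value:

#include <bit>
#include <unordered_map>
#include <vector>

constexpr int PoisonElem = -1;

bool buildReuseMask(const std::vector<int> &VL, std::vector<int> &Unique,
                    std::vector<int> &ReuseMask) {
  std::unordered_map<int, int> Pos;   // scalar -> index in Unique
  for (int V : VL) {
    if (V == 0) {                     // "undef" lane gets a poison reuse slot
      ReuseMask.push_back(PoisonElem);
      Unique.push_back(V);
      continue;
    }
    auto [It, Inserted] = Pos.try_emplace(V, static_cast<int>(Unique.size()));
    if (Inserted)
      Unique.push_back(V);
    ReuseMask.push_back(It->second);
  }
  // Only power-of-two bundles of unique scalars are kept for vectorization.
  return std::has_single_bit(static_cast<unsigned>(Unique.size()));
}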
- ValueList Op0; - Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0); - return; - } - LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + LLVM_DEBUG({ + dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " + "with order"; + for (unsigned Idx : CurrentOrder) + dbgs() << " " << Idx; + dbgs() << "\n"; + }); + fixupOrderingIndices(CurrentOrder); + // Insert new order with initial value 0, if it does not exist, + // otherwise return the iterator to the existing one. + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0); return; } case Instruction::InsertElement: { assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); - // Check that we have a buildvector and not a shuffle of 2 or more - // different vectors. - ValueSet SourceVectors; - for (Value *V : VL) { - SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); - assert(getInsertIndex(V) != std::nullopt && - "Non-constant or undef index?"); - } - - if (count_if(VL, [&SourceVectors](Value *V) { - return !SourceVectors.contains(V); - }) >= 2) { - // Found 2nd source vector - cancel. - LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " - "different source vectors.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - BS.cancelScheduling(VL, VL0); - return; - } - auto OrdCompare = [](const std::pair<int, int> &P1, const std::pair<int, int> &P2) { return P1.first > P2.first; @@ -5430,12 +5904,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - SmallVector<Value *> PointerOps; - OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, - CurrentOrder, PointerOps)) { - case LoadsState::Vectorize: + switch (State) { + case TreeEntry::Vectorize: if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, @@ -5450,7 +5921,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); break; - case LoadsState::ScatterVectorize: + case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); @@ -5458,23 +5929,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); break; - case LoadsState::Gather: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); -#ifndef NDEBUG - Type *ScalarTy = VL0->getType(); - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - else if (any_of(VL, [](Value *V) { - return !cast<LoadInst>(V)->isSimple(); - })) - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - else - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); -#endif // NDEBUG - break; + case TreeEntry::NeedToGather: + llvm_unreachable("Unexpected loads state."); } return; } @@ -5490,18 +5946,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - Type *SrcTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { - Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); - if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() - << "SLP: Gathering casts with different src types.\n"); - return; - } - } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); @@ -5521,21 +5965,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FCmp: { // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); - CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); - Type *ComparedTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { - CmpInst *Cmp = cast<CmpInst>(V); - if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || - Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() - << "SLP: Gathering cmp with different predicate.\n"); - return; - } - } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); @@ -5544,7 +5973,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (cast<CmpInst>(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. - assert(P0 == SwapP0 && "Commutative Predicate mismatch"); + assert(P0 == CmpInst::getSwappedPredicate(P0) && + "Commutative Predicate mismatch"); reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. @@ -5612,60 +6042,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } case Instruction::GetElementPtr: { - // We don't combine GEPs with complicated (nested) indexing. 
- for (Value *V : VL) { - auto *I = dyn_cast<GetElementPtrInst>(V); - if (!I) - continue; - if (I->getNumOperands() != 2) { - LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; - } - } - - // We can't combine several GEPs into one vector if they operate on - // different types. - Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); - for (Value *V : VL) { - auto *GEP = dyn_cast<GEPOperator>(V); - if (!GEP) - continue; - Type *CurTy = GEP->getSourceElementType(); - if (Ty0 != CurTy) { - LLVM_DEBUG(dbgs() - << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; - } - } - - // We don't combine GEPs with non-constant indexes. - Type *Ty1 = VL0->getOperand(1)->getType(); - for (Value *V : VL) { - auto *I = dyn_cast<GetElementPtrInst>(V); - if (!I) - continue; - auto *Op = I->getOperand(1); - if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || - (Op->getType() != Ty1 && - ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || - Op->getType()->getScalarSizeInBits() > - DL->getIndexSizeInBits( - V->getType()->getPointerAddressSpace())))) { - LLVM_DEBUG(dbgs() - << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; - } - } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); @@ -5722,78 +6098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. - llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); - // Avoid types that are padded when being allocated as scalars, while - // being packed together in a vector (such as i1). - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); - return; - } - // Make sure all stores in the bundle are simple - we can't vectorize - // atomic or volatile stores. - SmallVector<Value *, 4> PointerOps(VL.size()); ValueList Operands(VL.size()); - auto POIter = PointerOps.begin(); - auto OIter = Operands.begin(); + auto *OIter = Operands.begin(); for (Value *V : VL) { auto *SI = cast<StoreInst>(V); - if (!SI->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); - return; - } - *POIter = SI->getPointerOperand(); *OIter = SI->getValueOperand(); - ++POIter; ++OIter; } - - OrdersType CurrentOrder; - // Check the order of pointer operands. 
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; - if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - std::optional<int> Dist = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); - // Check that the sorted pointer operands are consecutive. - if (static_cast<unsigned>(*Dist) == VL.size() - 1) { - if (CurrentOrder.empty()) { - // Original stores are consecutive and does not require reordering. - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - } else { - fixupOrderingIndices(CurrentOrder); - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - } - return; - } + // Check that the sorted pointer operands are consecutive. + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + fixupOrderingIndices(CurrentOrder); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); } - - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -5802,68 +6129,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CallInst *CI = cast<CallInst>(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), - false /*HasGlobalPred*/); - Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - - if (!VecFunc && !isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); - return; - } - Function *F = CI->getCalledFunction(); - unsigned NumArgs = CI->arg_size(); - SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); - for (unsigned j = 0; j != NumArgs; ++j) - if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) - ScalarArgs[j] = CI->getArgOperand(j); - for (Value *V : VL) { - CallInst *CI2 = dyn_cast<CallInst>(V); - if (!CI2 || CI2->getCalledFunction() != F || - getVectorIntrinsicIDForCall(CI2, TLI) != ID || - (VecFunc && - VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || - !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V - << "\n"); - return; - } - // Some intrinsics have scalar arguments and should be same in order for - 
// them to be vectorized. - for (unsigned j = 0; j != NumArgs; ++j) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { - Value *A1J = CI2->getArgOperand(j); - if (ScalarArgs[j] != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI - << " argument " << ScalarArgs[j] << "!=" << A1J - << "\n"); - return; - } - } - } - // Verify that the bundle operands are identical between the two calls. - if (CI->hasOperandBundles() && - !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), - CI->op_begin() + CI->getBundleOperandsEndIndex(), - CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *V << '\n'); - return; - } - } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); @@ -5883,15 +6148,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } case Instruction::ShuffleVector: { - // If this is not an alternate sequence of opcode like add-sub - // then do not vectorize this instruction. - if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); - return; - } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); @@ -5949,19 +6205,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); - return; + break; } + llvm_unreachable("Unexpected vectorization of the instructions."); } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; - while (isa<StructType, ArrayType, VectorType>(EltTy)) { + while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) { if (auto *ST = dyn_cast<StructType>(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) @@ -5982,7 +6235,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); - if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) + if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || + VTSize != DL.getTypeStoreSizeInBits(T)) return 0; return N; } @@ -6111,68 +6365,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -/// Compute the cost of creating a vector of type \p VecTy containing the -/// extracted values from \p VL. 
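canMapToVector(), adjusted above to look only through FixedVectorType, flattens a homogeneous aggregate down to "N elements of one scalar type" and reports 0 otherwise. A standalone sketch of that flattening with a tiny tagged type model instead of llvm::Type (slightly simplified: size bounds against the register width are not checked here):

#include <memory>
#include <vector>

struct Ty {
  enum Kind { Scalar, Array, Struct } K = Scalar;
  int ScalarId = 0;                       // identifies the scalar type
  unsigned Count = 0;                     // array/vector element count
  std::vector<std::shared_ptr<Ty>> Elems; // struct members; Elems[0] for arrays
};

// Returns the flattened element count, or 0 if the aggregate is not
// homogeneous; ScalarId receives the common scalar type.
unsigned flattenedCount(const Ty &T, int &ScalarId) {
  switch (T.K) {
  case Ty::Scalar:
    ScalarId = T.ScalarId;
    return 1;
  case Ty::Array:
    return T.Count * flattenedCount(*T.Elems.front(), ScalarId);
  case Ty::Struct: {
    if (T.Elems.empty())
      return 0;
    int FirstId = -1;
    unsigned C = flattenedCount(*T.Elems.front(), FirstId);
    if (C == 0)
      return 0;
    for (const auto &E : T.Elems) {
      int Id = -1;
      if (flattenedCount(*E, Id) != C || Id != FirstId)
        return 0;                         // struct is not homogeneous
    }
    ScalarId = FirstId;
    return C * static_cast<unsigned>(T.Elems.size());
  }
  }
  return 0;
}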
-static InstructionCost -computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, - TargetTransformInfo::ShuffleKind ShuffleKind, - ArrayRef<int> Mask, TargetTransformInfo &TTI) { - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || - VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; - InstructionCost Cost = 0; - - // Process extracts in blocks of EltsPerVector to check if the source vector - // operand can be re-used directly. If not, add the cost of creating a shuffle - // to extract the values into a vector register. - SmallVector<int> RegMask(EltsPerVector, UndefMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, UndefMaskElem); - AllConsecutive = true; - continue; - } - - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) - continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); - } - return Cost; -} - /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void @@ -6183,7 +6375,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, SmallVectorImpl<Value *> *OpScalars = nullptr, SmallVectorImpl<Value *> *AltScalars = nullptr) { unsigned Sz = VL.size(); - Mask.assign(Sz, UndefMaskElem); + Mask.assign(Sz, PoisonMaskElem); SmallVector<int> OrderMask; if (!ReorderIndices.empty()) inversePermutation(ReorderIndices, OrderMask); @@ -6203,9 +6395,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, } } if (!ReusesIndices.empty()) { - SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem); + SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem); transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { - return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem; + return Idx != PoisonMaskElem ? 
Mask[Idx] : PoisonMaskElem; }); Mask.swap(NewMask); } @@ -6325,13 +6517,13 @@ protected: static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) { unsigned VF = Mask.size(); - SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { - if (ExtMask[I] == UndefMaskElem) + if (ExtMask[I] == PoisonMaskElem) continue; int MaskedIdx = Mask[ExtMask[I] % VF]; NewMask[I] = - MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF; + MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF; } Mask.swap(NewMask); } @@ -6418,11 +6610,12 @@ protected: if (auto *SVOpTy = dyn_cast<FixedVectorType>(SV->getOperand(0)->getType())) LocalVF = SVOpTy->getNumElements(); - SmallVector<int> ExtMask(Mask.size(), UndefMaskElem); + SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem); for (auto [Idx, I] : enumerate(Mask)) { - if (I == UndefMaskElem) - continue; - ExtMask[Idx] = SV->getMaskValue(I); + if (I == PoisonMaskElem || + static_cast<unsigned>(I) >= SV->getShuffleMask().size()) + continue; + ExtMask[Idx] = SV->getMaskValue(I); } bool IsOp1Undef = isUndefVector(SV->getOperand(0), @@ -6435,11 +6628,11 @@ protected: if (!IsOp1Undef && !IsOp2Undef) { // Update mask and mark undef elems. for (int &I : Mask) { - if (I == UndefMaskElem) + if (I == PoisonMaskElem) continue; if (SV->getMaskValue(I % SV->getShuffleMask().size()) == - UndefMaskElem) - I = UndefMaskElem; + PoisonMaskElem) + I = PoisonMaskElem; } break; } @@ -6453,15 +6646,16 @@ protected: Op = SV->getOperand(1); } if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); - !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) { + !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || + ShuffleVectorInst::isZeroEltSplatMask(Mask)) { if (IdentityOp) { V = IdentityOp; assert(Mask.size() == IdentityMask.size() && "Expected masks of same sizes."); // Clear known poison elements. for (auto [I, Idx] : enumerate(Mask)) - if (Idx == UndefMaskElem) - IdentityMask[I] = UndefMaskElem; + if (Idx == PoisonMaskElem) + IdentityMask[I] = PoisonMaskElem; Mask.swap(IdentityMask); auto *Shuffle = dyn_cast<ShuffleVectorInst>(V); return SinglePermute && @@ -6481,10 +6675,12 @@ protected: /// Smart shuffle instruction emission, walks through shuffles trees and /// tries to find the best matching vector for the actual shuffle /// instruction. - template <typename ShuffleBuilderTy> - static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, - ShuffleBuilderTy &Builder) { + template <typename T, typename ShuffleBuilderTy> + static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, + ShuffleBuilderTy &Builder) { assert(V1 && "Expected at least one vector value."); + if (V2) + Builder.resizeToMatch(V1, V2); int VF = Mask.size(); if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) VF = FTy->getNumElements(); @@ -6495,8 +6691,8 @@ protected: Value *Op2 = V2; int VF = cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); - SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); - SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); for (int I = 0, E = Mask.size(); I < E; ++I) { if (Mask[I] < VF) CombinedMask1[I] = Mask[I]; @@ -6514,9 +6710,9 @@ protected: // again. 
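createShuffle() above splits a two-source shuffle mask into one mask per operand, simplifies each operand independently, and then recombines the masks by re-biasing second-operand lanes by VF. A standalone sketch of the split and recombination (PoisonMaskElem modeled as -1):

#include <vector>

constexpr int PoisonElem = -1;

void splitTwoSourceMask(const std::vector<int> &Mask, int VF,
                        std::vector<int> &Mask1, std::vector<int> &Mask2) {
  Mask1.assign(Mask.size(), PoisonElem);
  Mask2.assign(Mask.size(), PoisonElem);
  for (size_t I = 0; I < Mask.size(); ++I) {
    if (Mask[I] == PoisonElem)
      continue;
    if (Mask[I] < VF)
      Mask1[I] = Mask[I];          // lane comes from the first operand
    else
      Mask2[I] = Mask[I] - VF;     // lane comes from the second operand
  }
}

std::vector<int> recombine(const std::vector<int> &Mask1,
                           const std::vector<int> &Mask2, int VF) {
  std::vector<int> Combined = Mask1;
  for (size_t I = 0; I < Mask2.size(); ++I)
    if (Mask2[I] != PoisonElem)
      Combined[I] = Mask2[I] + VF; // re-bias second-operand lanes by VF
  return Combined;
}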
if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) { - SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem); + SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem); for (auto [Idx, I] : enumerate(CombinedMask1)) { - if (I == UndefMaskElem) + if (I == PoisonMaskElem) continue; ExtMask1[Idx] = SV1->getMaskValue(I); } @@ -6524,9 +6720,9 @@ protected: cast<FixedVectorType>(SV1->getOperand(1)->getType()) ->getNumElements(), ExtMask1, UseMask::SecondArg); - SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem); + SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem); for (auto [Idx, I] : enumerate(CombinedMask2)) { - if (I == UndefMaskElem) + if (I == PoisonMaskElem) continue; ExtMask2[Idx] = SV2->getMaskValue(I); } @@ -6566,64 +6762,360 @@ protected: ->getElementCount() .getKnownMinValue()); for (int I = 0, E = Mask.size(); I < E; ++I) { - if (CombinedMask2[I] != UndefMaskElem) { - assert(CombinedMask1[I] == UndefMaskElem && + if (CombinedMask2[I] != PoisonMaskElem) { + assert(CombinedMask1[I] == PoisonMaskElem && "Expected undefined mask element"); CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); } } + const int Limit = CombinedMask1.size() * 2; + if (Op1 == Op2 && Limit == 2 * VF && + all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && + (ShuffleVectorInst::isIdentityMask(CombinedMask1) || + (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && + isa<ShuffleVectorInst>(Op1) && + cast<ShuffleVectorInst>(Op1)->getShuffleMask() == + ArrayRef(CombinedMask1)))) + return Builder.createIdentity(Op1); return Builder.createShuffleVector( Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, CombinedMask1); } if (isa<PoisonValue>(V1)) - return PoisonValue::get(FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size())); + return Builder.createPoison( + cast<VectorType>(V1->getType())->getElementType(), Mask.size()); SmallVector<int> NewMask(Mask.begin(), Mask.end()); bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); assert(V1 && "Expected non-null value after looking through shuffles."); if (!IsIdentity) return Builder.createShuffleVector(V1, NewMask); - return V1; + return Builder.createIdentity(V1); } }; } // namespace -InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, - ArrayRef<Value *> VectorizedVals) { - ArrayRef<Value *> VL = E->Scalars; +/// Merges shuffle masks and emits final shuffle instruction, if required. It +/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, +/// when the actual shuffle instruction is generated only if this is actually +/// required. Otherwise, the shuffle instruction emission is delayed till the +/// end of the process, to reduce the number of emitted instructions and further +/// analysis/transformations. 
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { + bool IsFinalized = false; + SmallVector<int> CommonMask; + SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; + const TargetTransformInfo &TTI; + InstructionCost Cost = 0; + ArrayRef<Value *> VectorizedVals; + BoUpSLP &R; + SmallPtrSetImpl<Value *> &CheckedExtracts; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { + if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) + return TTI::TCC_Free; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); + InstructionCost GatherCost = 0; + SmallVector<Value *> Gathers(VL.begin(), VL.end()); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. + InstructionsState S = getSameOpcode(VL, *R.TLI); + if (VL.size() > 2 && S.getOpcode() == Instruction::Load && + !S.isAltShuffle() && + !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && + !isSplat(Gathers)) { + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); + for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector<Value *> PointerOps; + OrdersType CurrentOrder; + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE, + *R.LI, *R.TLI, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize + // it again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + unsigned NumParts = TTI.getNumberOfParts(VecTy); + bool NeedInsertSubvectorAnalysis = + !NumParts || (VL.size() / VF) > NumParts; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += getBuildVectorCost(VL.slice(I, VF), Root); + } + // Exclude potentially vectorized loads from list of gathered + // scalars. + auto *LI = cast<LoadInst>(S.MainOp); + Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType())); + // The cost for vectorized loads. 
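// Editorial sketch (not part of the patch): how the load-grouping loop above
// behaves on a hypothetical gather of 8 loads where only the first four read
// adjacent addresses (a[0..3]) and the rest are scattered:
//
//   VF = 8 / 2 = 4
//   slice [0, 4): canVectorizeLoads -> Vectorize, StartIdx becomes 4
//   slice [4, 8): canVectorizeLoads -> Gather
//   VectorizedLoads is non-empty after this VF level, so the search stops.
//
// The cost accounting that follows then charges one <4 x Ty> vector load for
// the first slice, a build-vector cost for the remaining four scalars, and
// credits back the four scalar loads that disappear.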
+ InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast<LoadInst>(V); + ScalarsCost += + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), + LI->getAlign(), LI->getPointerAddressSpace(), + CostKind, TTI::OperandValueInfo(), LI); + } + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + GatherCost += ScatterVectorizeCnt * + TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + if (NeedInsertSubvectorAnalysis) { + // Add the cost for the subvectors insert. + for (int I = VF, E = VL.size(); I < E; I += VF) + GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, + std::nullopt, CostKind, I, LoadTy); + } + GatherCost -= ScalarsCost; + } + } else if (!Root && isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as + // the broadcast. + const auto *It = + find_if(VL, [](Value *V) { return !isa<UndefValue>(V); }); + assert(It != VL.end() && "Expected at least one non-undef value."); + // Add broadcast for non-identity shuffle only. + bool NeedShuffle = + count(VL, *It) > 1 && + (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof)); + InstructionCost InsertCost = TTI.getVectorInstrCost( + Instruction::InsertElement, VecTy, CostKind, + NeedShuffle ? 0 : std::distance(VL.begin(), It), + PoisonValue::get(VecTy), *It); + return InsertCost + + (NeedShuffle ? TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/std::nullopt, CostKind, /*Index=*/0, + /*SubTp=*/nullptr, /*Args=*/*It) + : TTI::TCC_Free); + } + return GatherCost + + (all_of(Gathers, UndefValue::classof) + ? TTI::TCC_Free + : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); + }; - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// Compute the cost of creating a vector of type \p VecTy containing the + /// extracted values from \p VL. + InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, + TTI::ShuffleKind ShuffleKind) { + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); + unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || + !NumOfParts || VecTy->getNumElements() < NumOfParts) + return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - // FIXME: it tries to fix a problem with MSVC buildbots. 
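// Editorial sketch (not part of the patch): the splat branch of
// getBuildVectorCost above prices two broadcast shapes differently (values
// are hypothetical):
//
//   VL = {%x, %x, %x, %x}          -> insertelement into lane 0 + SK_Broadcast
//   VL = {%x, undef, undef, undef} -> a single insertelement, no shuffle
//   VL = {undef, %x, undef, undef} -> a single insertelement into lane 1
//
// NeedShuffle is only set when the scalar is genuinely repeated; a value that
// appears once with the remaining lanes undef needs just the one insert.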
- TargetTransformInfo *TTI = this->TTI; - auto AdjustExtractsCost = [=](InstructionCost &Cost) { + bool AllConsecutive = true; + unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; + unsigned Idx = -1; + InstructionCost Cost = 0; + + // Process extracts in blocks of EltsPerVector to check if the source vector + // operand can be re-used directly. If not, add the cost of creating a + // shuffle to extract the values into a vector register. + SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem); + for (auto *V : VL) { + ++Idx; + + // Reached the start of a new vector registers. + if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, PoisonMaskElem); + AllConsecutive = true; + continue; + } + + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem) + continue; + + // Check all extracts for a vector register on the target directly + // extract values in order. + unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; + } + + if (AllConsecutive) + continue; + + // Skip all indices, except for the last index per vector block. + if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) + continue; + + // If we have a series of extracts which are not consecutive and hence + // cannot re-use the source vector register directly, compute the shuffle + // cost to extract the vector with EltsPerVector elements. + Cost += TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), + RegMask); + } + return Cost; + } + + class ShuffleCostBuilder { + const TargetTransformInfo &TTI; + + static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { + int Limit = 2 * VF; + return Mask.empty() || + (VF == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask)); + } + + public: + ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} + ~ShuffleCostBuilder() = default; + InstructionCost createShuffleVector(Value *V1, Value *, + ArrayRef<int> Mask) const { + // Empty mask or identity mask are free. + unsigned VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteTwoSrc, + FixedVectorType::get( + cast<VectorType>(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { + // Empty mask or identity mask are free. + if (isEmptyOrIdentity(Mask, Mask.size())) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, + FixedVectorType::get( + cast<VectorType>(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } + InstructionCost createPoison(Type *Ty, unsigned VF) const { + return TTI::TCC_Free; + } + void resizeToMatch(Value *&, Value *&) const {} + }; + + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. 
+ InstructionCost + createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1, + const PointerUnion<Value *, const TreeEntry *> &P2, + ArrayRef<int> Mask) { + ShuffleCostBuilder Builder(TTI); + Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); + unsigned CommonVF = 0; + if (!V1) { + const TreeEntry *E = P1.get<const TreeEntry *>(); + unsigned VF = E->getVectorFactor(); + if (V2) { + unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); + if (V2VF != VF && V2VF == E->Scalars.size()) + VF = E->Scalars.size(); + } else if (!P2.isNull()) { + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + if (E->Scalars.size() == E2->Scalars.size()) + CommonVF = VF = E->Scalars.size(); + } else { + // P2 is empty, check that we have same node + reshuffle (if any). + if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { + VF = E->Scalars.size(); + SmallVector<int> CommonMask(Mask.begin(), Mask.end()); + ::addMask(CommonMask, E->getCommonMask()); + V1 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), VF)); + return BaseShuffleAnalysis::createShuffle<InstructionCost>( + V1, nullptr, CommonMask, Builder); + } + } + V1 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), VF)); + } + if (!V2 && !P2.isNull()) { + const TreeEntry *E = P2.get<const TreeEntry *>(); + unsigned VF = E->getVectorFactor(); + unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + if (!CommonVF && V1VF == E->Scalars.size()) + CommonVF = E->Scalars.size(); + if (CommonVF) + VF = CommonVF; + V2 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), VF)); + } + return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask, + Builder); + } + +public: + ShuffleCostEstimator(TargetTransformInfo &TTI, + ArrayRef<Value *> VectorizedVals, BoUpSLP &R, + SmallPtrSetImpl<Value *> &CheckedExtracts) + : TTI(TTI), VectorizedVals(VectorizedVals), R(R), + CheckedExtracts(CheckedExtracts) {} + Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask, + TTI::ShuffleKind ShuffleKind) { + if (Mask.empty()) + return nullptr; + Value *VecBase = nullptr; + ArrayRef<Value *> VL = E->Scalars; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI->getNumberOfParts(VecTy); + unsigned VecNumParts = TTI.getNumberOfParts(VecTy); if (VecNumParts == VecTy->getNumElements()) - return; + return nullptr; DenseMap<Value *, int> ExtractVectorsTys; - SmallPtrSet<Value *, 4> CheckedExtracts; - for (auto *V : VL) { - if (isa<UndefValue>(V)) + for (auto [I, V] : enumerate(VL)) { + // Ignore non-extractelement scalars. + if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -6631,17 +7123,18 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // vectorized tree. // Also, avoid adjusting the cost for extractelements with multiple uses // in different graph entries. 
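// Editorial sketch (not part of the patch): an example of the discount
// computed in this loop, for a hypothetical gather of
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 1
// where every user of %e0/%e1 is itself vectorized. Both extracts become dead
// once the tree is vectorized, so their getVectorInstrCost is subtracted from
// Cost. If an extract feeds only a sign/zero extension, the cheaper combined
// getExtractWithExtendCost is subtracted instead, and the s|zext cost, which
// is accounted for separately, is added back to avoid double-counting.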
- const TreeEntry *VE = getTreeEntry(V); + const TreeEntry *VE = R.getTreeEntry(V); if (!CheckedExtracts.insert(V).second || - !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); + VecBase = EE->getVectorOperand(); std::optional<unsigned> EEIdx = getExtractIndex(EE); if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) { + if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { auto It = ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; It->getSecond() = std::min<int>(It->second, Idx); @@ -6654,18 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - Cost -= - TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); // Add back the cost of s|zext which is subtracted separately. - Cost += TTI->getCastInstrCost( + Cost += TTI.getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); } // Add a cost for subvector extracts/inserts if required. for (const auto &Data : ExtractVectorsTys) { @@ -6673,34 +7165,148 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, unsigned NumElts = VecTy->getNumElements(); if (Data.second % NumElts == 0) continue; - if (TTI->getNumberOfParts(EEVTy) > VecNumParts) { + if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); + if (Idx % NumElts == 0) + continue; if (Idx + NumElts <= EENumElts) { - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT // <= EENumElts. auto *SubVT = FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } } else { - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } - }; + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + // Found the bunch of extractelement instructions that must be gathered + // into a vector and can be represented as a permutation elements in a + // single input vector or of 2 input vectors. 
+ Cost += computeExtractCost(VL, Mask, ShuffleKind); + return VecBase; + } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({E1, E2}); + } + void add(const TreeEntry *E1, ArrayRef<int> Mask) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, E1); + } + /// Adds another one input vector and the mask for the shuffling. + void add(Value *V1, ArrayRef<int> Mask) { + assert(CommonMask.empty() && InVectors.empty() && + "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + } + Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Cost += getBuildVectorCost(VL, Root); + if (!Root) { + assert(InVectors.empty() && "Unexpected input vectors for buildvector."); + // FIXME: Need to find a way to avoid use of getNullValue here. + SmallVector<Constant *> Vals; + for (Value *V : VL) { + if (isa<UndefValue>(V)) { + Vals.push_back(cast<Constant>(V)); + continue; + } + Vals.push_back(Constant::getNullValue(V->getType())); + } + return ConstantVector::get(Vals); + } + return ConstantVector::getSplat( + ElementCount::getFixed(VL.size()), + Constant::getNullValue(VL.front()->getType())); + } + /// Finalize emission of the shuffles. + InstructionCost + finalize(ArrayRef<int> ExtMask, unsigned VF = 0, + function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { + IsFinalized = true; + if (Action) { + const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Cost += createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + assert(VF > 0 && + "Expected vector length for the final value before action."); + Value *V = Vec.dyn_cast<Value *>(); + if (!Vec.isNull() && !V) + V = Constant::getNullValue(FixedVectorType::get( + Vec.get<const TreeEntry *>()->Scalars.front()->getType(), + CommonMask.size())); + Action(V, CommonMask); + } + ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); + if (CommonMask.empty()) + return Cost; + int Limit = CommonMask.size() * 2; + if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(CommonMask)) + return Cost; + return Cost + + createShuffle(InVectors.front(), + InVectors.size() == 2 ? InVectors.back() : nullptr, + CommonMask); + } + + ~ShuffleCostEstimator() { + assert((IsFinalized || CommonMask.empty()) && + "Shuffle construction must be finalized."); + } +}; + +InstructionCost +BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, + SmallPtrSetImpl<Value *> &CheckedExtracts) { + ArrayRef<Value *> VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (auto *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. 
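// Editorial note (not part of the patch): an example of the MinBWs adjustment
// checked just below. If the scalars in VL are i32 operations whose results
// were proven to need only 16 bits, MinBWs[VL[0]].first is 16 and VecTy is
// rebuilt as <VL.size() x i16>, so every cost query in this function is made
// against the narrower type used for the final vector code.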
+ if (MinBWs.count(VL[0])) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); + ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, + CheckedExtracts); + unsigned VF = E->getVectorFactor(); + SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Build a mask out of the reorder indices and reorder scalars per this // mask. @@ -6709,195 +7315,104 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); SmallVector<int> Mask; + SmallVector<int> ExtractMask; + std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; SmallVector<const TreeEntry *> Entries; + Type *ScalarTy = GatheredScalars.front()->getType(); + // Check for gathered extracts. + ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); + SmallVector<Value *> IgnoredVals; + if (UserIgnoreList) + IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + + bool Resized = false; + if (Value *VecBase = Estimator.adjustExtracts( + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) + if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely // not profitable (splats and small gather nodes.) - if (E->getOpcode() != Instruction::Load || E->isAltShuffle() || + if (ExtractShuffle || E->getOpcode() != Instruction::Load || + E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); if (GatherShuffle) { - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != UndefMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); - InstructionCost GatherCost = 0; - int Limit = Mask.size() * 2; - if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)) { + if (*GatherShuffle == TTI::SK_PermuteSingleSrc && + Entries.front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. 
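// Editorial sketch (not part of the patch): a "perfect diamond match" below
// means the gather node's scalars, e.g. {a, b, c, d}, are exactly the scalars
// of an already-vectorized entry in the same order (a single-source identity
// permutation). No new instructions are needed for it, so the only cost that
// can remain is the reuse shuffle, if any, applied in finalize().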
LLVM_DEBUG( dbgs() << "SLP: perfect diamond match for gather bundle that starts with " << *VL.front() << ".\n"); - if (NeedToShuffleReuses) - GatherCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - } else { - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n"); - // Detected that instead of gather we can emit a shuffle of single/two - // previously vectorized nodes. Add the cost of the permutation rather - // than gather. - ::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); - } - if (!all_of(GatheredScalars, UndefValue::classof)) - GatherCost += getGatherCost(GatheredScalars); - return GatherCost; - } - if ((E->getOpcode() == Instruction::ExtractElement || - all_of(E->Scalars, - [](Value *V) { - return isa<ExtractElementInst, UndefValue>(V); - })) && - allSameType(VL)) { - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - SmallVector<int> Mask; - std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind = - isFixedVectorShuffle(VL, Mask); - if (ShuffleKind) { - // Found the bunch of extractelement instructions that must be gathered - // into a vector and can be represented as a permutation elements in a - // single input vector or of 2 input vectors. - InstructionCost Cost = - computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost); - if (NeedToShuffleReuses) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - return Cost; - } - } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - assert(VecTy == FinalVecTy && - "No reused scalars expected for broadcast."); - const auto *It = - find_if(VL, [](Value *V) { return !isa<UndefValue>(V); }); - // If all values are undefs - consider cost free. - if (It == VL.end()) - return TTI::TCC_Free; - // Add broadcast for non-identity shuffle only. - bool NeedShuffle = - VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof); - InstructionCost InsertCost = - TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, - /*Index=*/0, PoisonValue::get(VecTy), *It); - return InsertCost + (NeedShuffle - ? TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/std::nullopt, CostKind, - /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/VL[0]) - : TTI::TCC_Free); - } - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) - ReuseShuffleCost = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. 
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef<Value *> Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector<Value *> PointerOps; - OrdersType CurrentOrder; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - *TLI, CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } + // Restore the mask for previous partially matched values. + for (auto [I, V] : enumerate(E->Scalars)) { + if (isa<PoisonValue>(V)) { + Mask[I] = PoisonMaskElem; + continue; } + if (Mask[I] == PoisonMaskElem) + Mask[I] = Entries.front()->findLaneForValue(V); } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; + Estimator.add(Entries.front(), Mask); + return Estimator.finalize(E->ReuseShuffleIndices); } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast<LoadInst>(V); - ScalarsCost += - TTI->getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LI = cast<LoadInst>(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. 
- for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += - TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - std::nullopt, CostKind, I, LoadTy); - } - return ReuseShuffleCost + GatherCost - ScalarsCost; + if (!Resized) { + unsigned VF1 = Entries.front()->getVectorFactor(); + unsigned VF2 = Entries.back()->getVectorFactor(); + if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); } + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != PoisonMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() + << " entries for bundle that starts with " + << *VL.front() << ".\n";); + if (Entries.size() == 1) + Estimator.add(Entries.front(), Mask); + else + Estimator.add(Entries.front(), Entries.back(), Mask); + if (all_of(GatheredScalars, PoisonValue ::classof)) + return Estimator.finalize(E->ReuseShuffleIndices); + return Estimator.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl<int> &Mask) { + Vec = Estimator.gather(GatheredScalars, + Constant::getNullValue(FixedVectorType::get( + GatheredScalars.front()->getType(), + GatheredScalars.size()))); + }); } - return ReuseShuffleCost + getGatherCost(VL); + if (!all_of(GatheredScalars, PoisonValue::classof)) { + auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); + bool SameGathers = VL.equals(Gathers); + Value *BV = Estimator.gather( + Gathers, SameGathers ? nullptr + : Constant::getNullValue(FixedVectorType::get( + GatheredScalars.front()->getType(), + GatheredScalars.size()))); + SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem); + std::iota(ReuseMask.begin(), ReuseMask.end(), 0); + Estimator.add(BV, ReuseMask); + } + if (ExtractShuffle) + Estimator.add(E, std::nullopt); + return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; SmallVector<int> Mask; @@ -6945,48 +7460,89 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } InstructionCost VecCost = VectorCost(CommonCost); - LLVM_DEBUG( - dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost)); - // Disable warnings for `this` and `E` are unused. Required for - // `dumpTreeCosts`. - (void)this; - (void)E; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, + ScalarCost, "Calculated costs for Tree")); return VecCost - ScalarCost; }; // Calculate cost difference from vectorizing set of GEPs. // Negative value means vectorizing is profitable. auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) { - InstructionCost CostSavings = 0; - for (Value *V : Ptrs) { - if (V == BasePtr) - continue; - auto *Ptr = dyn_cast<GetElementPtrInst>(V); - // GEPs may contain just addresses without instructions, considered free. - // GEPs with all constant indices also considered to have zero cost. - if (!Ptr || Ptr->hasAllConstantIndices()) - continue; - - // Here we differentiate two cases: when GEPs represent a regular - // vectorization tree node (and hence vectorized) and when the set is - // arguments of a set of loads or stores being vectorized. In the former - // case all the scalar GEPs will be removed as a result of vectorization. 
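// Editorial note (not part of the patch): the rewritten GetGEPCostDiff below
// distinguishes two situations for a hypothetical bundle of pointers
// p, p+1, p+2, p+3:
//
//  * Wide unit-stride load/store: the scalar side is the whole pointer chain
//    (unit stride); the vector side keeps only the base pointer and any GEP
//    that may still be needed (approximated as GEPs with more than one use).
//    If every pointer survives into the vector code there is no saving and
//    TCC_Free is returned.
//  * Masked gather: all scalar GEPs disappear, and the vector side is a
//    single GEP that produces the <4 x ptr> operand of the gather intrinsic.
//
// In both cases the result is VecCost - ScalarCost, so a negative value means
// the pointer computation itself gets cheaper after vectorization.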
+ InstructionCost ScalarCost = 0; + InstructionCost VecCost = 0; + // Here we differentiate two cases: (1) when Ptrs represent a regular + // vectorization tree node (as they are pointer arguments of scattered + // loads) or (2) when Ptrs are the arguments of loads or stores being + // vectorized as plane wide unit-stride load/store since all the + // loads/stores are known to be from/to adjacent locations. + assert(E->State == TreeEntry::Vectorize && + "Entry state expected to be Vectorize here."); + if (isa<LoadInst, StoreInst>(VL0)) { + // Case 2: estimate costs for pointer related costs when vectorizing to + // a wide load/store. + // Scalar cost is estimated as a set of pointers with known relationship + // between them. + // For vector code we will use BasePtr as argument for the wide load/store + // but we also need to account all the instructions which are going to + // stay in vectorized code due to uses outside of these scalar + // loads/stores. + ScalarCost = TTI->getPointersChainCost( + Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy, + CostKind); + + SmallVector<const Value *> PtrsRetainedInVecCode; + for (Value *V : Ptrs) { + if (V == BasePtr) { + PtrsRetainedInVecCode.push_back(V); + continue; + } + auto *Ptr = dyn_cast<GetElementPtrInst>(V); + // For simplicity assume Ptr to stay in vectorized code if it's not a + // GEP instruction. We don't care since it's cost considered free. + // TODO: We should check for any uses outside of vectorizable tree + // rather than just single use. + if (!Ptr || !Ptr->hasOneUse()) + PtrsRetainedInVecCode.push_back(V); + } + + if (PtrsRetainedInVecCode.size() == Ptrs.size()) { + // If all pointers stay in vectorized code then we don't have + // any savings on that. + LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost, + "Calculated GEPs cost for Tree")); + return InstructionCost{TTI::TCC_Free}; + } + VecCost = TTI->getPointersChainCost( + PtrsRetainedInVecCode, BasePtr, + TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind); + } else { + // Case 1: Ptrs are the arguments of loads that we are going to transform + // into masked gather load intrinsic. + // All the scalar GEPs will be removed as a result of vectorization. // For any external uses of some lanes extract element instructions will - // be generated (which cost is estimated separately). For the latter case - // since the set of GEPs itself is not vectorized those used more than - // once will remain staying in vectorized code as well. So we should not - // count them as savings. - if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0)) - continue; - - // TODO: it is target dependent, so need to implement and then use a TTI - // interface. - CostSavings += TTI->getArithmeticInstrCost(Instruction::Add, - Ptr->getType(), CostKind); - } - LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n"; - E->dump()); - LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n"); - return InstructionCost() - CostSavings; + // be generated (which cost is estimated separately). + TTI::PointersChainInfo PtrsInfo = + all_of(Ptrs, + [](const Value *V) { + auto *Ptr = dyn_cast<GetElementPtrInst>(V); + return Ptr && !Ptr->hasAllConstantIndices(); + }) + ? 
TTI::PointersChainInfo::getUnknownStride() + : TTI::PointersChainInfo::getKnownStride(); + + ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, + CostKind); + if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) { + SmallVector<const Value *> Indices(BaseGEP->indices()); + VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(), + BaseGEP->getPointerOperand(), Indices, VecTy, + CostKind); + } + } + + LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, + "Calculated GEPs cost for Tree")); + + return VecCost - ScalarCost; }; switch (ShuffleOrOp) { @@ -7062,7 +7618,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); - SmallVector<int> InsertMask(NumElts, UndefMaskElem); + SmallVector<int> InsertMask(NumElts, PoisonMaskElem); unsigned OffsetBeg = *getInsertIndex(VL.front()); unsigned OffsetEnd = OffsetBeg; InsertMask[OffsetBeg] = 0; @@ -7099,13 +7655,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector<int> Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); + Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem); } else { - Mask.assign(VecSz, UndefMaskElem); + Mask.assign(VecSz, PoisonMaskElem); std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); } bool IsIdentity = true; - SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); + SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); @@ -7148,14 +7704,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InsertVecTy); } else { for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) - Mask[I] = InMask.test(I) ? UndefMaskElem : I; + Mask[I] = InMask.test(I) ? PoisonMaskElem : I; for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; I <= End; ++I) - if (Mask[I] != UndefMaskElem) + if (Mask[I] != PoisonMaskElem) Mask[I] = I + VecSz; for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) Mask[I] = - ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I; + ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I; Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } @@ -7422,11 +7978,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { - VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), + auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); + VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind, VL0); VecCost += TTI->getCmpSelInstrCost( - E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + E->getOpcode(), VecTy, MaskTy, cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, E->getAltOp()); } else { @@ -7615,7 +8171,7 @@ InstructionCost BoUpSLP::getSpillCost() const { unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); InstructionCost Cost = 0; - SmallPtrSet<Instruction*, 4> LiveValues; + SmallPtrSet<Instruction *, 4> LiveValues; Instruction *PrevInst = nullptr; // The entries in VectorizableTree are not necessarily ordered by their @@ -7626,6 +8182,8 @@ InstructionCost BoUpSLP::getSpillCost() const { // are grouped together. Using dominance ensures a deterministic order. 
SmallVector<Instruction *, 16> OrderedScalars; for (const auto &TEPtr : VectorizableTree) { + if (TEPtr->State != TreeEntry::Vectorize) + continue; Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; @@ -7639,7 +8197,7 @@ InstructionCost BoUpSLP::getSpillCost() const { assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"); if (NodeA != NodeB) - return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); + return NodeA->getDFSNumIn() > NodeB->getDFSNumIn(); return B->comesBefore(A); }); @@ -7698,7 +8256,7 @@ InstructionCost BoUpSLP::getSpillCost() const { }; // Debug information does not impact spill cost. - if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) && + if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) && &*PrevInstIt != PrevInst) NumCalls++; @@ -7706,7 +8264,7 @@ InstructionCost BoUpSLP::getSpillCost() const { } if (NumCalls) { - SmallVector<Type*, 4> V; + SmallVector<Type *, 4> V; for (auto *II : LiveValues) { auto *ScalarTy = II->getType(); if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) @@ -7797,8 +8355,8 @@ static T *performExtractsShuffleAction( ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false); SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask); for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { - if (Mask[Idx] == UndefMaskElem) - Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx; + if (Mask[Idx] == PoisonMaskElem) + Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx; else Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; } @@ -7827,8 +8385,8 @@ static T *performExtractsShuffleAction( // can shuffle them directly. ArrayRef<int> SecMask = VMIt->second; for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { - if (SecMask[I] != UndefMaskElem) { - assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + if (SecMask[I] != PoisonMaskElem) { + assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); Mask[I] = SecMask[I] + Vec1VF; } } @@ -7841,12 +8399,12 @@ static T *performExtractsShuffleAction( ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); ArrayRef<int> SecMask = VMIt->second; for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { - if (Mask[I] != UndefMaskElem) { - assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars."); + if (Mask[I] != PoisonMaskElem) { + assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars."); if (Res1.second) Mask[I] = I; - } else if (SecMask[I] != UndefMaskElem) { - assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + } else if (SecMask[I] != PoisonMaskElem) { + assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); Mask[I] = (Res2.second ? I : SecMask[I]) + VF; } } @@ -7863,11 +8421,11 @@ static T *performExtractsShuffleAction( ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); ArrayRef<int> SecMask = VMIt->second; for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { - if (SecMask[I] != UndefMaskElem) { - assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) && + if (SecMask[I] != PoisonMaskElem) { + assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) && "Multiple uses of scalars."); Mask[I] = (Res.second ? 
I : SecMask[I]) + VF; - } else if (Mask[I] != UndefMaskElem) { + } else if (Mask[I] != PoisonMaskElem) { Mask[I] = I; } } @@ -7877,12 +8435,23 @@ static T *performExtractsShuffleAction( } InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { + // Build a map for gathered scalars to the nodes where they are used. + ValueToGatherNodes.clear(); + for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { + if (EntryPtr->State != TreeEntry::NeedToGather) + continue; + for (Value *V : EntryPtr->Scalars) + if (!isConstant(V)) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert( + EntryPtr.get()); + } InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); + SmallPtrSet<Value *, 4> CheckedExtracts; for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; if (TE.State == TreeEntry::NeedToGather) { @@ -7898,7 +8467,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { } } - InstructionCost C = getEntryCost(&TE, VectorizedVals); + InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] @@ -7951,7 +8520,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { (void)ShuffleMasks.emplace_back(); SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE]; if (Mask.empty()) - Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask.assign(FTy->getNumElements(), PoisonMaskElem); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { @@ -7965,7 +8534,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { do { IEBase = cast<InsertElementInst>(Base); int Idx = *getInsertIndex(IEBase); - assert(Mask[Idx] == UndefMaskElem && + assert(Mask[Idx] == PoisonMaskElem && "InsertElementInstruction used already."); Mask[Idx] = Idx; Base = IEBase->getOperand(0); @@ -7985,7 +8554,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int InIdx = *InsertIdx; SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE]; if (Mask.empty()) - Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask.assign(FTy->getNumElements(), PoisonMaskElem); Mask[InIdx] = EU.Lane; DemandedElts[VecId].setBit(InIdx); continue; @@ -8024,7 +8593,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { (all_of(Mask, [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && !ShuffleVectorInst::isIdentityMask(Mask)))) { - SmallVector<int> OrigMask(VecVF, UndefMaskElem); + SmallVector<int> OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); C = TTI->getShuffleCost( @@ -8110,17 +8679,23 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // No need to check for the topmost gather node. if (TE == VectorizableTree.front().get()) return std::nullopt; - Mask.assign(VL.size(), UndefMaskElem); + Mask.assign(VL.size(), PoisonMaskElem); assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. 
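// Editorial sketch (not part of the patch): ValueToGatherNodes, populated at
// the start of getTreeCost above, maps every non-constant gathered scalar to
// the set of gather nodes that contain it, e.g. (hypothetical)
//
//   %x -> { gather node #4, gather node #7 }
//
// so the lookup further down in this function replaces the per-call
// ValueToTEs rescan of the whole VectorizableTree that is removed here.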
Instruction &UserInst = getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); - auto *PHI = dyn_cast<PHINode>(&UserInst); - auto *NodeUI = DT->getNode( - PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx) - : UserInst.getParent()); + BasicBlock *ParentBB = nullptr; + // Main node of PHI entries keeps the correct order of operands/incoming + // blocks. + if (auto *PHI = + dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) { + ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx); + } else { + ParentBB = UserInst.getParent(); + } + auto *NodeUI = DT->getNode(ParentBB); assert(NodeUI && "Should only process reachable instructions"); SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); auto CheckOrdering = [&](Instruction *LastEI) { @@ -8147,45 +8722,6 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, return false; return true; }; - // Build a lists of values to tree entries. - DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs; - for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { - if (EntryPtr.get() == TE) - continue; - if (EntryPtr->State != TreeEntry::NeedToGather) - continue; - if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) { - return GatheredScalars.contains(V); - })) - continue; - assert(EntryPtr->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - Instruction &EntryUserInst = - getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE); - if (&UserInst == &EntryUserInst) { - // If 2 gathers are operands of the same entry, compare operands indices, - // use the earlier one as the base. - if (TE->UserTreeIndices.front().UserTE == - EntryPtr->UserTreeIndices.front().UserTE && - TE->UserTreeIndices.front().EdgeIdx < - EntryPtr->UserTreeIndices.front().EdgeIdx) - continue; - } - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst); - auto *EntryI = - EntryPHI - ? EntryPHI - ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx) - ->getTerminator() - : &EntryUserInst; - if (!CheckOrdering(EntryI)) - continue; - for (Value *V : EntryPtr->Scalars) - if (!isConstant(V)) - ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); - } // Find all tree entries used by the gathered values. If no common entries // found - not a shuffle. // Here we build a set of tree nodes for each gathered value and trying to @@ -8195,16 +8731,58 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // have a permutation of 2 input vectors. SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; DenseMap<Value *, int> UsedValuesEntry; - for (Value *V : TE->Scalars) { + for (Value *V : VL) { if (isConstant(V)) continue; // Build a list of tree entries where V is used. 
SmallPtrSet<const TreeEntry *, 4> VToTEs; - auto It = ValueToTEs.find(V); - if (It != ValueToTEs.end()) - VToTEs = It->second; - if (const TreeEntry *VTE = getTreeEntry(V)) + for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { + if (TEPtr == TE) + continue; + assert(any_of(TEPtr->Scalars, + [&](Value *V) { return GatheredScalars.contains(V); }) && + "Must contain at least single gathered value."); + assert(TEPtr->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + PHINode *EntryPHI = + dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp()); + Instruction *EntryUserInst = + EntryPHI ? nullptr + : &getLastInstructionInBundle( + TEPtr->UserTreeIndices.front().UserTE); + if (&UserInst == EntryUserInst) { + assert(!EntryPHI && "Unexpected phi node entry."); + // If 2 gathers are operands of the same entry, compare operands + // indices, use the earlier one as the base. + if (TE->UserTreeIndices.front().UserTE == + TEPtr->UserTreeIndices.front().UserTE && + TE->UserTreeIndices.front().EdgeIdx < + TEPtr->UserTreeIndices.front().EdgeIdx) + continue; + } + // Check if the user node of the TE comes after user node of EntryPtr, + // otherwise EntryPtr depends on TE. + auto *EntryI = + EntryPHI + ? EntryPHI + ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx) + ->getTerminator() + : EntryUserInst; + if ((ParentBB != EntryI->getParent() || + TE->UserTreeIndices.front().EdgeIdx < + TEPtr->UserTreeIndices.front().EdgeIdx || + TE->UserTreeIndices.front().UserTE != + TEPtr->UserTreeIndices.front().UserTE) && + !CheckOrdering(EntryI)) + continue; + VToTEs.insert(TEPtr); + } + if (const TreeEntry *VTE = getTreeEntry(V)) { + Instruction &EntryUserInst = getLastInstructionInBundle(VTE); + if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + continue; VToTEs.insert(VTE); + } if (VToTEs.empty()) continue; if (UsedTEs.empty()) { @@ -8260,13 +8838,13 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); }); - if (It != FirstEntries.end()) { + if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { Entries.push_back(*It); std::iota(Mask.begin(), Mask.end(), 0); // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) - if (isa<PoisonValue>(TE->Scalars[I])) - Mask[I] = UndefMaskElem; + if (isa<PoisonValue>(VL[I])) + Mask[I] = PoisonMaskElem; return TargetTransformInfo::SK_PermuteSingleSrc; } // No perfect match, just shuffle, so choose the first tree node from the @@ -8302,10 +8880,18 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, break; } } - // No 2 source vectors with the same vector factor - give up and do regular - // gather. - if (Entries.empty()) - return std::nullopt; + // No 2 source vectors with the same vector factor - just choose 2 with max + // index. 
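// Editorial sketch (not part of the patch): hypothetical shapes for the
// fallback added below. Suppose the first set of candidate entries has vector
// factors 4 (node Idx 3) and 8 (node Idx 5) and the second set has a node
// with vector factor 16. Previously this returned std::nullopt and the
// scalars were gathered; now Entries becomes {node 5, the first node of the
// second set} and VF is the larger of their vector factors, so a two-source
// permutation is still attempted.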
+ if (Entries.empty()) { + Entries.push_back( + *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(), + [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + })); + Entries.push_back(SecondEntries.front()); + VF = std::max(Entries.front()->getVectorFactor(), + Entries.back()->getVectorFactor()); + } } bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof); @@ -8427,19 +9013,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, return std::nullopt; } -InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, - const APInt &ShuffledIndices, - bool NeedToShuffle) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, - /*Extract*/ false, CostKind); - if (NeedToShuffle) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); - return Cost; -} - -InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { +InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, + bool ForPoisonSrc) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -8451,20 +9026,36 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // shuffle candidates. APInt ShuffledElements = APInt::getZero(VL.size()); DenseSet<Value *> UniqueElements; - // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { - unsigned Idx = I - 1; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost Cost; + auto EstimateInsertCost = [&](unsigned I, Value *V) { + if (!ForPoisonSrc) + Cost += + TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, + I, Constant::getNullValue(VecTy), V); + }; + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Value *V = VL[I]; // No need to shuffle duplicates for constants. - if (isConstant(VL[Idx])) { - ShuffledElements.setBit(Idx); + if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) { + ShuffledElements.setBit(I); continue; } - if (!UniqueElements.insert(VL[Idx]).second) { + if (!UniqueElements.insert(V).second) { DuplicateNonConst = true; - ShuffledElements.setBit(Idx); + ShuffledElements.setBit(I); + continue; } + EstimateInsertCost(I, V); } - return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); + if (ForPoisonSrc) + Cost = + TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true, + /*Extract*/ false, CostKind); + if (DuplicateNonConst) + Cost += + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + return Cost; } // Perform operand reordering on the instructions in VL and return the reordered @@ -8483,6 +9074,9 @@ void BoUpSLP::reorderInputsAccordingToOpcode( } Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { + auto &Res = EntryToLastInstruction.FindAndConstruct(E); + if (Res.second) + return *Res.second; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with // constant indeces). 
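// Editorial sketch (not part of the patch): a worked example for the
// getGatherCost() rewrite in the hunk above, with the hypothetical bundle
// VL = {%a, %b, %a, 7} and ForPoisonSrc == true:
//
//   lane 0 (%a) and lane 1 (%b) stay demanded;
//   lane 2 repeats %a       -> ShuffledElements bit set, DuplicateNonConst;
//   lane 3 is the constant 7 -> ShuffledElements bit set.
//
// Cost = scalarization overhead for inserting lanes 0-1 into a poison vector
//        + one SK_PermuteSingleSrc shuffle because of the duplicated %a.
// With ForPoisonSrc == false, per-lane insertelement costs are summed
// explicitly instead of using getScalarizationOverhead().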
@@ -8497,7 +9091,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { isVectorLikeInstWithConstOps(I); })); - auto &&FindLastInst = [E, Front, this, &BB]() { + auto FindLastInst = [&]() { Instruction *LastInst = Front; for (Value *V : E->Scalars) { auto *I = dyn_cast<Instruction>(V); @@ -8508,9 +9102,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { LastInst = I; continue; } - assert(isVectorLikeInstWithConstOps(LastInst) && - isVectorLikeInstWithConstOps(I) && - "Expected vector-like insts only."); + assert(((E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(I)) || + (isVectorLikeInstWithConstOps(LastInst) && + isVectorLikeInstWithConstOps(I))) && + "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { LastInst = I; continue; @@ -8531,7 +9127,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return LastInst; }; - auto &&FindFirstInst = [E, Front, this]() { + auto FindFirstInst = [&]() { Instruction *FirstInst = Front; for (Value *V : E->Scalars) { auto *I = dyn_cast<Instruction>(V); @@ -8542,9 +9138,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { FirstInst = I; continue; } - assert(isVectorLikeInstWithConstOps(FirstInst) && - isVectorLikeInstWithConstOps(I) && - "Expected vector-like insts only."); + assert(((E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(I)) || + (isVectorLikeInstWithConstOps(FirstInst) && + isVectorLikeInstWithConstOps(I))) && + "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(FirstInst->getParent())) { FirstInst = I; continue; @@ -8566,22 +9164,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. - if (E->State != TreeEntry::NeedToGather && - (doesNotNeedToSchedule(E->Scalars) || + if (doesNotNeedToSchedule(E->Scalars) || + (E->State != TreeEntry::NeedToGather && all_of(E->Scalars, isVectorLikeInstWithConstOps))) { - Instruction *InsertInst; - if (all_of(E->Scalars, [](Value *V) { + if ((E->getOpcode() == Instruction::GetElementPtr && + any_of(E->Scalars, + [](Value *V) { + return !isa<GetElementPtrInst>(V) && isa<Instruction>(V); + })) || + all_of(E->Scalars, [](Value *V) { return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V); })) - InsertInst = FindLastInst(); + Res.second = FindLastInst(); else - InsertInst = FindFirstInst(); - return *InsertInst; + Res.second = FindFirstInst(); + return *Res.second; } - // The last instruction in the bundle in program order. - Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been // scheduled, and the last instruction is VL.back(). So we start with // VL.back() and iterate over schedule data until we reach the end of the @@ -8594,7 +9193,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + Res.second = Bundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -8615,15 +9214,15 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // not ideal. 
However, this should be exceedingly rare since it requires that // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). - if (!LastInst) - LastInst = FindLastInst(); - assert(LastInst && "Failed to find last instruction in bundle"); - return *LastInst; + if (!Res.second) + Res.second = FindLastInst(); + assert(Res.second && "Failed to find last instruction in bundle"); + return *Res.second; } void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { auto *Front = E->getMainOp(); - Instruction *LastInst = EntryToLastInstruction.lookup(E); + Instruction *LastInst = &getLastInstructionInBundle(E); assert(LastInst && "Failed to find last instruction in bundle"); // If the instruction is PHI, set the insert point after all the PHIs. bool IsPHI = isa<PHINode>(LastInst); @@ -8641,7 +9240,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::gather(ArrayRef<Value *> VL) { +Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) { // List of instructions/lanes from current block and/or the blocks which are // part of the current loop. These instructions will be inserted at the end to // make it possible to optimize loops and hoist invariant instructions out of @@ -8658,7 +9257,8 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { for (int I = 0, E = VL.size(); I < E; ++I) { if (auto *Inst = dyn_cast<Instruction>(VL[I])) if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || - getTreeEntry(Inst) || (L && (L->contains(Inst)))) && + getTreeEntry(Inst) || + (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) && PostponedIndices.insert(I).second) PostponedInsts.emplace_back(Inst, I); } @@ -8681,7 +9281,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { Value *Val0 = isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); + Value *Vec = Root ? Root : PoisonValue::get(VecTy); SmallVector<int> NonConsts; // Insert constant values at first. for (int I = 0, E = VL.size(); I < E; ++I) { @@ -8691,6 +9291,18 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { NonConsts.push_back(I); continue; } + if (Root) { + if (!isa<UndefValue>(VL[I])) { + NonConsts.push_back(I); + continue; + } + if (isa<PoisonValue>(VL[I])) + continue; + if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) { + if (SV->getMaskValue(I) == PoisonMaskElem) + continue; + } + } Vec = CreateInsertElement(Vec, VL[I], I); } // Insert non-constant values. @@ -8789,6 +9401,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } return Vec; } + Value *createIdentity(Value *V) { return V; } + Value *createPoison(Type *Ty, unsigned VF) { + return PoisonValue::get(FixedVectorType::get(Ty, VF)); + } /// Resizes 2 input vector to match the sizes, if the they are not equal /// yet. The smallest vector is resized to the size of the larger vector. 
void resizeToMatch(Value *&V1, Value *&V2) { @@ -8798,7 +9414,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); int VF = std::max(V1VF, V2VF); int MinVF = std::min(V1VF, V2VF); - SmallVector<int> IdentityMask(VF, UndefMaskElem); + SmallVector<int> IdentityMask(VF, PoisonMaskElem); std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF), 0); Value *&Op = MinVF == V1VF ? V1 : V2; @@ -8821,7 +9437,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { assert(V1 && "Expected at least one vector value."); ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, R.CSEBlocks); - return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder); + return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask, + ShuffleBuilder); } /// Transforms mask \p CommonMask per given \p Mask to make proper set after @@ -8829,7 +9446,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, ArrayRef<int> Mask) { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != UndefMaskElem) + if (Mask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; } @@ -8837,6 +9454,39 @@ public: ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R) : Builder(Builder), R(R) {} + /// Adjusts extractelements after reusing them. + Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) { + Value *VecBase = nullptr; + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + int Idx = Mask[I]; + if (Idx == PoisonMaskElem) + continue; + auto *EI = cast<ExtractElementInst>(E->Scalars[I]); + VecBase = EI->getVectorOperand(); + // If the only one use is vectorized - can delete the extractelement + // itself. + if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { + return !R.ScalarToTreeEntry.count(U); + })) + continue; + R.eraseInstruction(EI); + } + return VecBase; + } + /// Checks if the specified entry \p E needs to be delayed because of its + /// dependency nodes. + Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) { + // No need to delay emission if all deps are ready. + if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + return nullptr; + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), + E->getVectorFactor()); + return Builder.CreateAlignedLoad( + VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())), + MaybeAlign()); + } /// Adds 2 input vectors and the mask for their shuffling. 
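resizeToMatch in this hunk widens the narrower of the two inputs with a shuffle whose mask is an identity prefix padded with poison lanes, and finalize() later reuses the same trick to grow a vector to the requested VF. A minimal sketch of how such a mask is built, assuming the usual LLVM convention that a negative mask element (PoisonMaskElem) marks a poison lane; makeResizeMask is an illustrative name:

    #include <numeric>
    #include <vector>

    constexpr int PoisonLane = -1; // stand-in for PoisonMaskElem

    // Identity prefix plus poison padding, e.g. VF = 8, MinVF = 5 gives
    // {0, 1, 2, 3, 4, -1, -1, -1}: the first MinVF lanes stay in place and
    // the remaining lanes are left undefined.
    static std::vector<int> makeResizeMask(unsigned VF, unsigned MinVF) {
      std::vector<int> Mask(VF, PoisonLane);
      std::iota(Mask.begin(), Mask.begin() + MinVF, 0);
      return Mask;
    }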
void add(Value *V1, Value *V2, ArrayRef<int> Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -8849,15 +9499,15 @@ public: Value *Vec = InVectors.front(); if (InVectors.size() == 2) { Vec = createShuffle(Vec, InVectors.back(), CommonMask); - transformMaskAfterShuffle(CommonMask, Mask); + transformMaskAfterShuffle(CommonMask, CommonMask); } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Mask.size()) { Vec = createShuffle(Vec, nullptr, CommonMask); - transformMaskAfterShuffle(CommonMask, Mask); + transformMaskAfterShuffle(CommonMask, CommonMask); } V1 = createShuffle(V1, V2, Mask); for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != UndefMaskElem) + if (Mask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx + Sz; InVectors.front() = Vec; if (InVectors.size() == 2) @@ -8870,7 +9520,7 @@ public: if (InVectors.empty()) { if (!isa<FixedVectorType>(V1->getType())) { V1 = createShuffle(V1, nullptr, CommonMask); - CommonMask.assign(Mask.size(), UndefMaskElem); + CommonMask.assign(Mask.size(), PoisonMaskElem); transformMaskAfterShuffle(CommonMask, Mask); } InVectors.push_back(V1); @@ -8892,7 +9542,7 @@ public: transformMaskAfterShuffle(CommonMask, CommonMask); } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem) + if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem) CommonMask[Idx] = V->getType() != V1->getType() ? Idx + Sz @@ -8910,7 +9560,7 @@ public: // Check if second vector is required if the used elements are already // used from the first one. for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) { + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) { InVectors.push_back(V1); break; } @@ -8919,7 +9569,7 @@ public: if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) VF = FTy->getNumElements(); for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); } /// Adds another one input vector and the mask for the shuffling. @@ -8928,17 +9578,46 @@ public: inversePermutation(Order, NewMask); add(V1, NewMask); } + Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + return R.gather(VL, Root); + } + Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } /// Finalize emission of the shuffles. + /// \param Action the action (if any) to be performed before final applying of + /// the \p ExtMask mask. 
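The add() overloads in this hunk record lanes taken from the second input as Idx + Sz (or Mask[Idx] + VF), relying on shufflevector's convention that a mask value greater than or equal to the first operand's width selects from the second operand. A scalar model of that addressing scheme; applyShuffle is an illustrative helper, not an LLVM API:

    #include <vector>

    // Apply a two-source shuffle mask: m < 0 is a poison lane, m < A.size()
    // selects A[m], and m >= A.size() selects B[m - A.size()].
    static std::vector<int> applyShuffle(const std::vector<int> &A,
                                         const std::vector<int> &B,
                                         const std::vector<int> &Mask) {
      std::vector<int> Result(Mask.size(), 0);
      for (size_t I = 0; I < Mask.size(); ++I) {
        int M = Mask[I];
        if (M < 0)
          continue;                  // poison lane, value left unspecified here
        Result[I] = M < (int)A.size() ? A[M] : B[M - (int)A.size()];
      }
      return Result;
    }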
Value * - finalize(ArrayRef<int> ExtMask = std::nullopt) { + finalize(ArrayRef<int> ExtMask, unsigned VF = 0, + function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + assert(VF > 0 && + "Expected vector length for the final value before action."); + unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (VecVF < VF) { + SmallVector<int> ResizeMask(VF, PoisonMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0); + Vec = createShuffle(Vec, nullptr, ResizeMask); + } + Action(Vec, CommonMask); + InVectors.front() = Vec; + } if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); } else { - SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { - if (ExtMask[I] == UndefMaskElem) + if (ExtMask[I] == PoisonMaskElem) continue; NewMask[I] = CommonMask[ExtMask[I]]; } @@ -9009,18 +9688,18 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { // ... (use %2) // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} // br %block - SmallVector<int> UniqueIdxs(VF, UndefMaskElem); + SmallVector<int> UniqueIdxs(VF, PoisonMaskElem); SmallSet<int, 4> UsedIdxs; int Pos = 0; for (int Idx : VE->ReuseShuffleIndices) { - if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem && + if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem && UsedIdxs.insert(Idx).second) UniqueIdxs[Idx] = Pos; ++Pos; } assert(VF >= UsedIdxs.size() && "Expected vectorization factor " "less than original vector size."); - UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); + UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem); V = FinalShuffle(V, UniqueIdxs); } else { assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && @@ -9031,6 +9710,21 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { V = FinalShuffle(V, UniformMask); } } + // Need to update the operand gather node, if actually the operand is not a + // vectorized node, but the buildvector/gather node, which matches one of + // the vectorized nodes. 
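The ExtMask handling at the end of finalize() is a mask composition: each external lane looks up the lane already described by CommonMask, and poison lanes pass through untouched. A compact standalone version of that composition; composeMasks is an illustrative name:

    #include <vector>

    // result[I] = Inner[Outer[I]], with negative (poison) lanes passed through,
    // mirroring NewMask[I] = CommonMask[ExtMask[I]] in the hunk above.
    static std::vector<int> composeMasks(const std::vector<int> &Inner,
                                         const std::vector<int> &Outer) {
      std::vector<int> Result(Outer.size(), -1);
      for (size_t I = 0; I < Outer.size(); ++I)
        if (Outer[I] >= 0)
          Result[I] = Inner[Outer[I]];
      return Result;
    }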
+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) == VE->UserTreeIndices.end()) { + auto *It = find_if( + VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && + TE->UserTreeIndices.front().UserTE == E && + TE->UserTreeIndices.front().EdgeIdx == NodeIdx; + }); + assert(It != VectorizableTree.end() && "Expected gather node operand."); + (*It)->VectorizedValue = V; + } return V; } } @@ -9049,108 +9743,370 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { IRBuilder<>::InsertPointGuard Guard(Builder); if (E->getOpcode() != Instruction::InsertElement && E->getOpcode() != Instruction::PHI) { - Instruction *LastInst = EntryToLastInstruction.lookup(E); + Instruction *LastInst = &getLastInstructionInBundle(E); assert(LastInst && "Failed to find last instruction in bundle"); Builder.SetInsertPoint(LastInst); } return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(const TreeEntry *E) { +template <typename BVTy, typename ResTy, typename... Args> +ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); - SmallVector<Value *> Gathered( - VF, PoisonValue::get(E->Scalars.front()->getType())); bool NeedFreeze = false; - SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the redorder indices and reorder scalars per this mask. + SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); + SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); + // Build a mask out of the reorder indices and reorder scalars per this + // mask. 
SmallVector<int> ReorderMask; inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) - reorderScalars(VL, ReorderMask); - SmallVector<int> ReuseMask(VF, UndefMaskElem); - if (!allConstant(VL)) { + reorderScalars(GatheredScalars, ReorderMask); + auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) { + if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { + return isa<UndefValue>(V) && !isa<PoisonValue>(V); + })) + return false; + TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; + unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; + if (UserTE->getNumOperands() != 2) + return false; + auto *It = + find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) { + return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; + }) != TE->UserTreeIndices.end(); + }); + if (It == VectorizableTree.end()) + return false; + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); + int Sz = Mask.size(); + if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && + ShuffleVectorInst::isIdentityMask(Mask)) + std::iota(Mask.begin(), Mask.end(), 0); + else + std::fill(Mask.begin(), Mask.end(), I); + return true; + }; + BVTy ShuffleBuilder(Params...); + ResTy Res = ResTy(); + SmallVector<int> Mask; + SmallVector<int> ExtractMask; + std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; + std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; + SmallVector<const TreeEntry *> Entries; + Type *ScalarTy = GatheredScalars.front()->getType(); + if (!all_of(GatheredScalars, UndefValue::classof)) { + // Check for gathered extracts. + ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); + SmallVector<Value *> IgnoredVals; + if (UserIgnoreList) + IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + bool Resized = false; + if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) + if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + // Gather extracts after we check for full matched gathers only. + if (ExtractShuffle || E->getOpcode() != Instruction::Load || + E->isAltShuffle() || + all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || + isSplat(E->Scalars) || + (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { + GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + } + if (GatherShuffle) { + if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + // Delay emission of gathers which are not ready yet. + PostponedGathers.insert(E); + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + return Delayed; + } + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + if (*GatherShuffle == TTI::SK_PermuteSingleSrc && + Entries.front()->isSame(E->Scalars)) { + // Perfect match in the graph, will reuse the previously vectorized + // node. Cost is 0. + LLVM_DEBUG( + dbgs() + << "SLP: perfect diamond match for gather bundle that starts with " + << *E->Scalars.front() << ".\n"); + // Restore the mask for previous partially matched values. 
+ if (Entries.front()->ReorderIndices.empty() && + ((Entries.front()->ReuseShuffleIndices.empty() && + E->Scalars.size() == Entries.front()->Scalars.size()) || + (E->Scalars.size() == + Entries.front()->ReuseShuffleIndices.size()))) { + std::iota(Mask.begin(), Mask.end(), 0); + } else { + for (auto [I, V] : enumerate(E->Scalars)) { + if (isa<PoisonValue>(V)) { + Mask[I] = PoisonMaskElem; + continue; + } + Mask[I] = Entries.front()->findLaneForValue(V); + } + } + ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + Res = ShuffleBuilder.finalize(E->getCommonMask()); + return Res; + } + if (!Resized) { + unsigned VF1 = Entries.front()->getVectorFactor(); + unsigned VF2 = Entries.back()->getVectorFactor(); + if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != PoisonMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + } + } + auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars, + SmallVectorImpl<int> &ReuseMask, + bool IsRootPoison) { // For splats with can emit broadcasts instead of gathers, so try to find // such sequences. - bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back()); + bool IsSplat = IsRootPoison && isSplat(Scalars) && + (Scalars.size() > 2 || Scalars.front() == Scalars.back()); + Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy)); SmallVector<int> UndefPos; DenseMap<Value *, unsigned> UniquePositions; // Gather unique non-const values and all constant values. // For repeated values, just shuffle them. - for (auto [I, V] : enumerate(VL)) { + int NumNonConsts = 0; + int SinglePos = 0; + for (auto [I, V] : enumerate(Scalars)) { if (isa<UndefValue>(V)) { if (!isa<PoisonValue>(V)) { - Gathered[I] = V; ReuseMask[I] = I; UndefPos.push_back(I); } continue; } if (isConstant(V)) { - Gathered[I] = V; ReuseMask[I] = I; continue; } + ++NumNonConsts; + SinglePos = I; + Value *OrigV = V; + Scalars[I] = PoisonValue::get(ScalarTy); if (IsSplat) { - Gathered.front() = V; + Scalars.front() = OrigV; ReuseMask[I] = 0; } else { - const auto Res = UniquePositions.try_emplace(V, I); - Gathered[Res.first->second] = V; + const auto Res = UniquePositions.try_emplace(OrigV, I); + Scalars[Res.first->second] = OrigV; ReuseMask[I] = Res.first->second; } } - if (!UndefPos.empty() && IsSplat) { + if (NumNonConsts == 1) { + // Restore single insert element. + if (IsSplat) { + ReuseMask.assign(VF, PoisonMaskElem); + std::swap(Scalars.front(), Scalars[SinglePos]); + if (!UndefPos.empty() && UndefPos.front() == 0) + Scalars.front() = UndefValue::get(ScalarTy); + } + ReuseMask[SinglePos] = SinglePos; + } else if (!UndefPos.empty() && IsSplat) { // For undef values, try to replace them with the simple broadcast. // We can do it if the broadcasted value is guaranteed to be // non-poisonous, or by freezing the incoming scalar value first. - auto *It = find_if(Gathered, [this, E](Value *V) { + auto *It = find_if(Scalars, [this, E](Value *V) { return !isa<UndefValue>(V) && (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || - any_of(V->uses(), [E](const Use &U) { - // Check if the value already used in the same operation in - // one of the nodes already. 
- return E->UserTreeIndices.size() == 1 && - is_contained( - E->UserTreeIndices.front().UserTE->Scalars, - U.getUser()) && - E->UserTreeIndices.front().EdgeIdx != U.getOperandNo(); - })); + (E->UserTreeIndices.size() == 1 && + any_of(V->uses(), [E](const Use &U) { + // Check if the value already used in the same operation in + // one of the nodes already. + return E->UserTreeIndices.front().EdgeIdx != + U.getOperandNo() && + is_contained( + E->UserTreeIndices.front().UserTE->Scalars, + U.getUser()); + }))); }); - if (It != Gathered.end()) { + if (It != Scalars.end()) { // Replace undefs by the non-poisoned scalars and emit broadcast. - int Pos = std::distance(Gathered.begin(), It); + int Pos = std::distance(Scalars.begin(), It); for_each(UndefPos, [&](int I) { // Set the undef position to the non-poisoned scalar. ReuseMask[I] = Pos; - // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already. + // Replace the undef by the poison, in the mask it is replaced by + // non-poisoned scalar already. if (I != Pos) - Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + Scalars[I] = PoisonValue::get(ScalarTy); }); } else { // Replace undefs by the poisons, emit broadcast and then emit // freeze. for_each(UndefPos, [&](int I) { - ReuseMask[I] = UndefMaskElem; - if (isa<UndefValue>(Gathered[I])) - Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + ReuseMask[I] = PoisonMaskElem; + if (isa<UndefValue>(Scalars[I])) + Scalars[I] = PoisonValue::get(ScalarTy); }); NeedFreeze = true; } } + }; + if (ExtractShuffle || GatherShuffle) { + bool IsNonPoisoned = true; + bool IsUsedInExpr = false; + Value *Vec1 = nullptr; + if (ExtractShuffle) { + // Gather of extractelements can be represented as just a shuffle of + // a single/two vectors the scalars are extracted from. + // Find input vectors. + Value *Vec2 = nullptr; + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem || + (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + ExtractMask[I] = PoisonMaskElem; + continue; + } + if (isa<UndefValue>(E->Scalars[I])) + continue; + auto *EI = cast<ExtractElementInst>(E->Scalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } + } + if (Vec2) { + IsNonPoisoned &= + isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); + ShuffleBuilder.add(Vec1, Vec2, ExtractMask); + } else if (Vec1) { + IsUsedInExpr = FindReusedSplat(ExtractMask); + ShuffleBuilder.add(Vec1, ExtractMask); + IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); + } else { + ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( + ScalarTy, GatheredScalars.size())), + ExtractMask); + } + } + if (GatherShuffle) { + if (Entries.size() == 1) { + IsUsedInExpr = FindReusedSplat(Mask); + ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + } else { + ShuffleBuilder.add(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue, Mask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + } + } + // Try to figure out best way to combine values: build a shuffle and insert + // elements or just build several shuffles. + // Insert non-constant scalars. 
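When every scalar in a gather is an extractelement, the code in this hunk rebuilds the group as a shuffle of at most two source vectors: it records the first and, if needed, a second distinct vector operand, and turns each lane into a mask index (second-vector lanes offset by VF). A toy sketch of that classification, with integer ids standing in for the source vectors; buildExtractShuffle is an illustrative name, and it simply gives up where the real code asserts:

    #include <optional>
    #include <vector>

    // Each gathered scalar is an "extract" of lane Lane from vector VecId.
    struct Extract { int VecId; int Lane; };

    static std::optional<std::vector<int>>
    buildExtractShuffle(const std::vector<Extract> &Scalars, int VF) {
      int Vec1 = -1, Vec2 = -1;
      std::vector<int> Mask(Scalars.size(), -1);   // -1 ~ poison lane
      for (size_t I = 0; I < Scalars.size(); ++I) {
        int Id = Scalars[I].VecId;
        if (Vec1 == -1 || Id == Vec1) {
          Vec1 = Id;
          Mask[I] = Scalars[I].Lane;               // select from the first vector
        } else if (Vec2 == -1 || Id == Vec2) {
          Vec2 = Id;
          Mask[I] = Scalars[I].Lane + VF;          // select from the second vector
        } else {
          return std::nullopt;                     // more than two sources
        }
      }
      return Mask;
    }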
+ SmallVector<Value *> NonConstants(GatheredScalars); + int EMSz = ExtractMask.size(); + int MSz = Mask.size(); + // Try to build constant vector and shuffle with it only if currently we + // have a single permutation and more than 1 scalar constants. + bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsIdentityShuffle = + (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc && + none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && + ShuffleVectorInst::isIdentityMask(ExtractMask)) || + (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc && + none_of(Mask, [&](int I) { return I >= MSz; }) && + ShuffleVectorInst::isIdentityMask(Mask)); + bool EnoughConstsForShuffle = + IsSingleShuffle && + (none_of(GatheredScalars, + [](Value *V) { + return isa<UndefValue>(V) && !isa<PoisonValue>(V); + }) || + any_of(GatheredScalars, + [](Value *V) { + return isa<Constant>(V) && !isa<UndefValue>(V); + })) && + (!IsIdentityShuffle || + (GatheredScalars.size() == 2 && + any_of(GatheredScalars, + [](Value *V) { return !isa<UndefValue>(V); })) || + count_if(GatheredScalars, [](Value *V) { + return isa<Constant>(V) && !isa<PoisonValue>(V); + }) > 1); + // NonConstants array contains just non-constant values, GatheredScalars + // contains only constant to build final vector and then shuffle. + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I])) + NonConstants[I] = PoisonValue::get(ScalarTy); + else + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + // Generate constants for final shuffle and build a mask for them. + if (!all_of(GatheredScalars, PoisonValue::classof)) { + SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); + TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); + Value *BV = ShuffleBuilder.gather(GatheredScalars); + ShuffleBuilder.add(BV, BVMask); + } + if (all_of(NonConstants, [=](Value *V) { + return isa<PoisonValue>(V) || + (IsSingleShuffle && ((IsIdentityShuffle && + IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V)); + })) + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + else + Res = ShuffleBuilder.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl<int> &Mask) { + TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); + Vec = ShuffleBuilder.gather(NonConstants, Vec); + }); + } else if (!allConstant(GatheredScalars)) { + // Gather unique scalars and all constants. + SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); + TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); + Value *BV = ShuffleBuilder.gather(GatheredScalars); + ShuffleBuilder.add(BV, ReuseMask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { - ReuseMask.clear(); - copy(VL, Gathered.begin()); + // Gather all constants. + SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem); + for (auto [I, V] : enumerate(E->Scalars)) { + if (!isa<PoisonValue>(V)) + Mask[I] = I; + } + Value *BV = ShuffleBuilder.gather(E->Scalars); + ShuffleBuilder.add(BV, Mask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } - // Gather unique scalars and all constants. 
- Value *Vec = gather(Gathered); - ShuffleBuilder.add(Vec, ReuseMask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + if (NeedFreeze) - Vec = Builder.CreateFreeze(Vec); - return Vec; + Res = ShuffleBuilder.createFreeze(Res); + return Res; +} + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder, + *this); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -9161,10 +10117,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } + if (E->State == TreeEntry::NeedToGather) { + if (E->getMainOp() && E->Idx == 0) + setInsertPointAfterBundle(E); + Value *Vec = createBuildVector(E); + E->VectorizedValue = Vec; + return Vec; + } + auto FinalShuffle = [&](Value *V, const TreeEntry *E) { ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); - if (E->State != TreeEntry::NeedToGather && - E->getOpcode() == Instruction::Store) { + if (E->getOpcode() == Instruction::Store) { ArrayRef<int> Mask = ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), E->ReorderIndices.size()); @@ -9175,45 +10138,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return ShuffleBuilder.finalize(E->ReuseShuffleIndices); }; - if (E->State == TreeEntry::NeedToGather) { - if (E->Idx > 0) { - // We are in the middle of a vectorizable chain. We need to gather the - // scalars from the users. - Value *Vec = createBuildVector(E); - E->VectorizedValue = Vec; - return Vec; - } - if (E->getMainOp()) - setInsertPointAfterBundle(E); - SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector<int> ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - Value *Vec; - SmallVector<int> Mask; - SmallVector<const TreeEntry *> Entries; - std::optional<TargetTransformInfo::ShuffleKind> Shuffle = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (Shuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } else { - Vec = gather(E->Scalars); - } - Vec = FinalShuffle(Vec, E); - E->VectorizedValue = Vec; - return Vec; - } - assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); @@ -9248,7 +10172,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // PHINodes may have multiple entries from the same block. We want to // visit every block once. 
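The PHI case in this hunk walks the incoming values but processes each predecessor block only once, even when the same block feeds the PHI through several edges; that is what the VisitedBBs set is for. A minimal sketch of the guard, with std::unordered_set standing in for SmallPtrSet and a toy Block type in place of BasicBlock:

    #include <unordered_set>
    #include <vector>

    struct Block { int Id; };

    // Process each distinct incoming block exactly once, even if it appears
    // on several incoming edges of the phi.
    static int countDistinctIncoming(const std::vector<Block *> &IncomingBlocks) {
      std::unordered_set<Block *> Visited;
      int Processed = 0;
      for (Block *BB : IncomingBlocks)
        if (Visited.insert(BB).second)
          ++Processed;                   // first time we see this block
      return Processed;
    }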
- SmallPtrSet<BasicBlock*, 4> VisitedBBs; + SmallPtrSet<BasicBlock *, 4> VisitedBBs; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; @@ -9314,14 +10238,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<int> Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); + Mask.append(NumElts - NumScalars, PoisonMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); + Mask.assign(NumElts, PoisonMaskElem); std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); } // Create InsertVector shuffle if necessary bool IsIdentity = true; - SmallVector<int> PrevMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(NumElts, PoisonMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { Value *Scalar = E->Scalars[PrevMask[I]]; @@ -9337,9 +10261,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } } - SmallVector<int> InsertMask(NumElts, UndefMaskElem); + SmallVector<int> InsertMask(NumElts, PoisonMaskElem); for (unsigned I = 0; I < NumElts; I++) { - if (Mask[I] != UndefMaskElem) + if (Mask[I] != PoisonMaskElem) InsertMask[Offset + I] = I; } SmallBitVector UseMask = @@ -9354,10 +10278,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { isUndefVector<true>(FirstInsert->getOperand(0), UseMask); if (!IsFirstPoison.all()) { for (unsigned I = 0; I < NumElts; I++) { - if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I)) + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) InsertMask[I] = I + NumElts; } - } + } V = Builder.CreateShuffleVector( V, IsFirstPoison.all() ? PoisonValue::get(V->getType()) @@ -9372,8 +10296,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallBitVector IsFirstPoison = isUndefVector<true>(FirstInsert->getOperand(0), UseMask); for (unsigned I = 0; I < NumElts; I++) { - if (InsertMask[I] == UndefMaskElem) - InsertMask[I] = IsFirstPoison.test(I) ? UndefMaskElem : I; + if (InsertMask[I] == PoisonMaskElem) + InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I; else InsertMask[I] += NumElts; } @@ -9544,20 +10468,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LoadInst *LI = cast<LoadInst>(VL0); Instruction *NewLI; - unsigned AS = LI->getPointerAddressSpace(); Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); + NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); - // The pointer operand uses an in-tree scalar so we add the new BitCast - // or LoadInst to ExternalUses list to make sure that an extract will + // The pointer operand uses an in-tree scalar so we add the new + // LoadInst to ExternalUses list to make sure that an extract will // be generated in the future. if (TreeEntry *Entry = getTreeEntry(PO)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back( - PO, PO != VecPtr ? 
cast<User>(VecPtr) : NewLI, FoundLane); + ExternalUses.emplace_back(PO, NewLI, FoundLane); } } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); @@ -9653,7 +10574,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CallInst *CI = cast<CallInst>(VL0); setInsertPointAfterBundle(E); - Intrinsic::ID IID = Intrinsic::not_intrinsic; + Intrinsic::ID IID = Intrinsic::not_intrinsic; if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); @@ -9665,8 +10586,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; - SmallVector<Type *, 2> TysForDecl = - {FixedVectorType::get(CI->getType(), E->Scalars.size())}; + SmallVector<Type *, 2> TysForDecl; + // Add return type if intrinsic is overloaded on it. + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) + TysForDecl.push_back( + FixedVectorType::get(CI->getType(), E->Scalars.size())); for (int j = 0, e = CI->arg_size(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be @@ -9808,14 +10732,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } default: - llvm_unreachable("unknown inst"); + llvm_unreachable("unknown inst"); } return nullptr; } Value *BoUpSLP::vectorizeTree() { ExtraValueToDebugLocsMap ExternallyUsedValues; - return vectorizeTree(ExternallyUsedValues); + SmallVector<std::pair<Value *, Value *>> ReplacedExternals; + return vectorizeTree(ExternallyUsedValues, ReplacedExternals); } namespace { @@ -9829,28 +10754,51 @@ struct ShuffledInsertData { }; } // namespace -Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, - Instruction *ReductionRoot) { +Value *BoUpSLP::vectorizeTree( + const ExtraValueToDebugLocsMap &ExternallyUsedValues, + SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals, + Instruction *ReductionRoot) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - - // Pre-gather last instructions. - for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) { - if ((E->State == TreeEntry::NeedToGather && - (!E->getMainOp() || E->Idx > 0)) || - (E->State != TreeEntry::NeedToGather && - E->getOpcode() == Instruction::ExtractValue) || - E->getOpcode() == Instruction::InsertElement) - continue; - Instruction *LastInst = &getLastInstructionInBundle(E.get()); - EntryToLastInstruction.try_emplace(E.get(), LastInst); - } + // Clean Entry-to-LastInstruction table. It can be affected after scheduling, + // need to rebuild it. + EntryToLastInstruction.clear(); Builder.SetInsertPoint(ReductionRoot ? ReductionRoot : &F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + // Run through the list of postponed gathers and emit them, replacing the temp + // emitted allocas with actual vector instructions. + ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); + DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues; + for (const TreeEntry *E : PostponedNodes) { + auto *TE = const_cast<TreeEntry *>(E); + if (auto *VecTE = getTreeEntry(TE->Scalars.front())) + if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand( + TE->UserTreeIndices.front().EdgeIdx))) + // Found gather node which is absolutely the same as one of the + // vectorized nodes. It may happen after reordering. 
+ continue; + auto *PrevVec = cast<Instruction>(TE->VectorizedValue); + TE->VectorizedValue = nullptr; + auto *UserI = + cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue); + Builder.SetInsertPoint(PrevVec); + Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); + Value *Vec = vectorizeTree(TE); + PrevVec->replaceAllUsesWith(Vec); + PostponedValues.try_emplace(Vec).first->second.push_back(TE); + // Replace the stub vector node, if it was used before for one of the + // buildvector nodes already. + auto It = PostponedValues.find(PrevVec); + if (It != PostponedValues.end()) { + for (TreeEntry *VTE : It->getSecond()) + VTE->VectorizedValue = Vec; + } + eraseInstruction(PrevVec); + } // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We @@ -9968,14 +10916,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, Builder.SetInsertPoint(&F->getEntryBlock().front()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - auto &NewInstLocs = ExternallyUsedValues[NewInst]; - auto It = ExternallyUsedValues.find(Scalar); - assert(It != ExternallyUsedValues.end() && - "Externally used scalar is not found in ExternallyUsedValues"); - NewInstLocs.append(It->second); - ExternallyUsedValues.erase(Scalar); // Required to update internally referenced instructions. Scalar->replaceAllUsesWith(NewInst); + ReplacedExternals.emplace_back(Scalar, NewInst); continue; } @@ -10004,7 +10947,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, ShuffledInserts.size() - 1); SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; if (Mask.empty()) - Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask.assign(FTy->getNumElements(), PoisonMaskElem); // Find the insertvector, vectorized in tree, if any. 
Value *Base = VU; while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { @@ -10017,7 +10960,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, do { IEBase = cast<InsertElementInst>(Base); int IEIdx = *getInsertIndex(IEBase); - assert(Mask[Idx] == UndefMaskElem && + assert(Mask[Idx] == PoisonMaskElem && "InsertElementInstruction used already."); Mask[IEIdx] = IEIdx; Base = IEBase->getOperand(0); @@ -10035,7 +10978,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, } SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; if (Mask.empty()) - Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask.assign(FTy->getNumElements(), PoisonMaskElem); Mask[Idx] = ExternalUse.Lane; It->InsertElements.push_back(cast<InsertElementInst>(User)); continue; @@ -10077,8 +11020,8 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, } auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { - SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); - SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); int VF = cast<FixedVectorType>(V1->getType())->getNumElements(); for (int I = 0, E = Mask.size(); I < E; ++I) { if (Mask[I] < VF) @@ -10103,9 +11046,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, return std::make_pair(Vec, true); } if (!ForSingleMask) { - SmallVector<int> ResizeMask(VF, UndefMaskElem); + SmallVector<int> ResizeMask(VF, PoisonMaskElem); for (unsigned I = 0; I < VF; ++I) { - if (Mask[I] != UndefMaskElem) + if (Mask[I] != PoisonMaskElem) ResizeMask[Mask[I]] = Mask[I]; } Vec = CreateShuffle(Vec, nullptr, ResizeMask); @@ -10308,14 +11251,14 @@ void BoUpSLP::optimizeGatherSequence() { // registers. unsigned LastUndefsCnt = 0; for (int I = 0, E = NewMask.size(); I < E; ++I) { - if (SM1[I] == UndefMaskElem) + if (SM1[I] == PoisonMaskElem) ++LastUndefsCnt; else LastUndefsCnt = 0; - if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem && NewMask[I] != SM1[I]) return false; - if (NewMask[I] == UndefMaskElem) + if (NewMask[I] == PoisonMaskElem) NewMask[I] = SM1[I]; } // Check if the last undefs actually change the final number of used vector @@ -10590,11 +11533,20 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. + // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted + // against the budget. Otherwise debug info could affect codegen. 
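extendSchedulingRegion in this hunk now steps both iterators past assume-like intrinsics with std::find_if_not before charging anything against the region-size budget, so debug intrinsics cannot change what gets scheduled. A sketch of that skip-then-count shape over a plain container; Instr and fitsBudget are illustrative names:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    struct Instr { bool AssumeLike; };

    // Walk a range, skipping "free" assume-like instructions and charging only
    // the remaining ones against a size budget, in the spirit of the
    // UpIter/DownIter loop in this hunk.
    static bool fitsBudget(const std::vector<Instr> &Range, unsigned Budget) {
      auto IsAssumeLike = [](const Instr &I) { return I.AssumeLike; };
      auto It = std::find_if_not(Range.begin(), Range.end(), IsAssumeLike);
      unsigned Size = 0;
      while (It != Range.end()) {
        if (++Size > Budget)
          return false;                  // region grew past the limit
        It = std::find_if_not(std::next(It), Range.end(), IsAssumeLike);
      }
      return true;
    }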
BasicBlock::reverse_iterator UpIter = ++ScheduleStart->getIterator().getReverse(); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); BasicBlock::iterator LowerEnd = BB->end(); + auto IsAssumeLikeIntr = [](const Instruction &I) { + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + return II->isAssumeLikeIntrinsic(); + return false; + }; + UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); + DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && &*DownIter != I) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { @@ -10604,6 +11556,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ++UpIter; ++DownIter; + + UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); + DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); } if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { assert(I->getParent() == ScheduleStart->getParent() && @@ -10804,7 +11759,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, unsigned numAliased = 0; unsigned DistToSrc = 1; - for ( ; DepDest; DepDest = DepDest->NextLoadStore) { + for (; DepDest; DepDest = DepDest->NextLoadStore) { assert(isInSchedulingRegion(DepDest)); // We have two limits to reduce the complexity: @@ -11163,8 +12118,8 @@ void BoUpSLP::computeMinimumValueSizes() { // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { auto Mask = DB->getDemandedBits(cast<Instruction>(Root)); - MaxBitWidth = std::max<unsigned>( - Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); + MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(), + MaxBitWidth); } // True if the roots can be zero-extended back to their original type, rather @@ -11223,8 +12178,7 @@ void BoUpSLP::computeMinimumValueSizes() { } // Round MaxBitWidth up to the next power-of-two. - if (!isPowerOf2_64(MaxBitWidth)) - MaxBitWidth = NextPowerOf2(MaxBitWidth); + MaxBitWidth = llvm::bit_ceil(MaxBitWidth); // If the maximum bit width we compute is less than the with of the roots' // type, we can proceed with the narrowing. Otherwise, do nothing. @@ -11242,60 +12196,6 @@ void BoUpSLP::computeMinimumValueSizes() { MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); } -namespace { - -/// The SLPVectorizer Pass. -struct SLPVectorizer : public FunctionPass { - SLPVectorizerPass Impl; - - /// Pass identification, replacement for typeid - static char ID; - - explicit SLPVectorizer() : FunctionPass(ID) { - initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override { return false; } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; - auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); - auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<DemandedBitsWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addRequired<InjectTLIMappingsLegacy>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.setPreservesCFG(); - } -}; - -} // end anonymous namespace - PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); auto *TTI = &AM.getResult<TargetIRAnalysis>(F); @@ -11536,7 +12436,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned MaxVecRegSize = R.getMaxVecRegSize(); unsigned EltSize = R.getVectorElementSize(Operands[0]); - unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); + unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); @@ -11618,17 +12518,8 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { } } -bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { - if (!A || !B) - return false; - if (isa<InsertElementInst>(A) || isa<InsertElementInst>(B)) - return false; - Value *VL[] = {A, B}; - return tryToVectorizeList(VL, R); -} - bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - bool LimitForRegisterSize) { + bool MaxVFOnly) { if (VL.size() < 2) return false; @@ -11663,7 +12554,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = R.getMinVF(Sz); - unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); + unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { @@ -11690,21 +12581,17 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { - unsigned OpsWidth = 0; + unsigned ActualVF = std::min(MaxInst - I, VF); - if (I + VF > MaxInst) - OpsWidth = MaxInst - I; - else - OpsWidth = VF; - - if (!isPowerOf2_32(OpsWidth)) + if (!isPowerOf2_32(ActualVF)) continue; - if ((LimitForRegisterSize && OpsWidth < MaxVF) || - (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) + if (MaxVFOnly && ActualVF < MaxVF) + break; + if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2)) break; - ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); + ArrayRef<Value 
*> Ops = VL.slice(I, ActualVF); // Check that a previous iteration of this loop did not delete the Value. if (llvm::any_of(Ops, [&R](Value *V) { auto *I = dyn_cast<Instruction>(V); @@ -11712,7 +12599,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, })) continue; - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations " << "\n"); R.buildTree(Ops); @@ -11730,7 +12617,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, MinCost = std::min(MinCost, Cost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost - << " for VF=" << OpsWidth << "\n"); + << " for VF=" << ActualVF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", @@ -11806,14 +12693,14 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { } if (Candidates.size() == 1) - return tryToVectorizePair(Op0, Op1, R); + return tryToVectorizeList({Op0, Op1}, R); // We have multiple options. Try to pick the single best. std::optional<int> BestCandidate = R.findBestRootPair(Candidates); if (!BestCandidate) return false; - return tryToVectorizePair(Candidates[*BestCandidate].first, - Candidates[*BestCandidate].second, R); + return tryToVectorizeList( + {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); } namespace { @@ -11857,6 +12744,9 @@ class HorizontalReduction { WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; + /// Checks if the optimization of original scalar identity operations on + /// matched horizontal reductions is enabled and allowed. + bool IsSupportedHorRdxIdentityOp = false; static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && @@ -11888,6 +12778,9 @@ class HorizontalReduction { return I->getFastMathFlags().noNaNs(); } + if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum) + return true; + return I->isAssociative(); } @@ -11905,6 +12798,7 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); + bool IsConstant = isConstant(LHS) && isConstant(RHS); switch (Kind) { case RecurKind::Or: if (UseSelect && @@ -11926,29 +12820,49 @@ class HorizontalReduction { return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); case RecurKind::FMax: + if (IsConstant) + return ConstantFP::get(LHS->getType(), + maxnum(cast<ConstantFP>(LHS)->getValueAPF(), + cast<ConstantFP>(RHS)->getValueAPF())); return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); case RecurKind::FMin: + if (IsConstant) + return ConstantFP::get(LHS->getType(), + minnum(cast<ConstantFP>(LHS)->getValueAPF(), + cast<ConstantFP>(RHS)->getValueAPF())); return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); + case RecurKind::FMaximum: + if (IsConstant) + return ConstantFP::get(LHS->getType(), + maximum(cast<ConstantFP>(LHS)->getValueAPF(), + cast<ConstantFP>(RHS)->getValueAPF())); + return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); + case RecurKind::FMinimum: + if (IsConstant) + return ConstantFP::get(LHS->getType(), + minimum(cast<ConstantFP>(LHS)->getValueAPF(), + cast<ConstantFP>(RHS)->getValueAPF())); + return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, 
RHS); case RecurKind::SMax: - if (UseSelect) { + if (IsConstant || UseSelect) { Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); case RecurKind::SMin: - if (UseSelect) { + if (IsConstant || UseSelect) { Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); case RecurKind::UMax: - if (UseSelect) { + if (IsConstant || UseSelect) { Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); case RecurKind::UMin: - if (UseSelect) { + if (IsConstant || UseSelect) { Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } @@ -11984,6 +12898,7 @@ class HorizontalReduction { return Op; } +public: static RecurKind getRdxKind(Value *V) { auto *I = dyn_cast<Instruction>(V); if (!I) @@ -12010,6 +12925,10 @@ class HorizontalReduction { if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) return RecurKind::FMin; + if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value()))) + return RecurKind::FMaximum; + if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value()))) + return RecurKind::FMinimum; // This matches either cmp+select or intrinsics. SLP is expected to handle // either form. // TODO: If we are canonicalizing to intrinsics, we can remove several @@ -12086,6 +13005,7 @@ class HorizontalReduction { return isCmpSelMinMax(I) ? 1 : 0; } +private: /// Total number of operands in the reduction operation. static unsigned getNumberOfOperands(Instruction *I) { return isCmpSelMinMax(I) ? 3 : 2; @@ -12134,17 +13054,6 @@ class HorizontalReduction { } } - static Value *getLHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(I)); - } - static Value *getRHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(I) + 1); - } - static bool isGoodForReduction(ArrayRef<Value *> Data) { int Sz = Data.size(); auto *I = dyn_cast<Instruction>(Data.front()); @@ -12156,65 +13065,39 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, const TargetLibraryInfo &TLI) { - assert((!Phi || is_contained(Phi->operands(), Inst)) && - "Phi needs to use the binary operator"); - assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || - isa<IntrinsicInst>(Inst)) && - "Expected binop, select, or intrinsic for reduction matching"); - RdxKind = getRdxKind(Inst); - - // We could have a initial reductions that is not an add. - // r *= v1 + v2 + v3 + v4 - // In such a case start looking for a tree rooted in the first '+'. 
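createOp in this hunk emits the integer min/max kinds either as the dedicated intrinsics or, when a select form is requested or both operands are constant, as the equivalent compare-plus-select. The scalar identity it relies on, shown for SMax; smaxSelect is an illustrative name:

    // The select form used for RecurKind::SMax: smax(a, b) == (a > b ? a : b).
    // SMin, UMax and UMin follow the same pattern with the comparison flipped
    // or made unsigned.
    static int smaxSelect(int A, int B) {
      bool Cmp = A > B;   // corresponds to Builder.CreateICmpSGT(LHS, RHS)
      return Cmp ? A : B; // corresponds to Builder.CreateSelect(Cmp, LHS, RHS)
    }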
- if (Phi) { - if (getLHS(RdxKind, Inst) == Phi) { - Phi = nullptr; - Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst)); - if (!Inst) - return false; - RdxKind = getRdxKind(Inst); - } else if (getRHS(RdxKind, Inst) == Phi) { - Phi = nullptr; - Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst)); - if (!Inst) - return false; - RdxKind = getRdxKind(Inst); - } - } - - if (!isVectorizable(RdxKind, Inst)) + RdxKind = HorizontalReduction::getRdxKind(Root); + if (!isVectorizable(RdxKind, Root)) return false; // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. - Type *Ty = Inst->getType(); + Type *Ty = Root->getType(); if (!isValidElementType(Ty) || Ty->isPointerTy()) return false; // Though the ultimate reduction may have multiple uses, its condition must // have only single use. - if (auto *Sel = dyn_cast<SelectInst>(Inst)) + if (auto *Sel = dyn_cast<SelectInst>(Root)) if (!Sel->getCondition()->hasOneUse()) return false; - ReductionRoot = Inst; + ReductionRoot = Root; // Iterate through all the operands of the possible reduction tree and // gather all the reduced values, sorting them by their value id. - BasicBlock *BB = Inst->getParent(); - bool IsCmpSelMinMax = isCmpSelMinMax(Inst); - SmallVector<Instruction *> Worklist(1, Inst); + BasicBlock *BB = Root->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Root); + SmallVector<Instruction *> Worklist(1, Root); // Checks if the operands of the \p TreeN instruction are also reduction // operations or should be treated as reduced values or an extra argument, // which is not part of the reduction. - auto &&CheckOperands = [this, IsCmpSelMinMax, - BB](Instruction *TreeN, - SmallVectorImpl<Value *> &ExtraArgs, - SmallVectorImpl<Value *> &PossibleReducedVals, - SmallVectorImpl<Instruction *> &ReductionOps) { + auto CheckOperands = [&](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { for (int I = getFirstOperandIndex(TreeN), End = getNumberOfOperands(TreeN); I < End; ++I) { @@ -12229,10 +13112,14 @@ public: } // If the edge is not an instruction, or it is different from the main // reduction opcode or has too many uses - possible reduced value. + // Also, do not try to reduce const values, if the operation is not + // foldable. if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || - !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + !isVectorizable(RdxKind, EdgeInst) || + (R.isAnalyzedReductionRoot(EdgeInst) && + all_of(EdgeInst->operands(), Constant::classof))) { PossibleReducedVals.push_back(EdgeVal); continue; } @@ -12246,10 +13133,43 @@ public: // instructions (grouping them by the predicate). 
MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> PossibleReducedVals; - initReductionOps(Inst); + initReductionOps(Root); DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap; SmallSet<size_t, 2> LoadKeyUsed; SmallPtrSet<Value *, 4> DoNotReverseVals; + + auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { + Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); + if (LoadKeyUsed.contains(Key)) { + auto LIt = LoadsMap.find(Ptr); + if (LIt != LoadsMap.end()) { + for (LoadInst *RLI : LIt->second) { + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + for (LoadInst *RLI : LIt->second) { + if (arePointersCompatible(RLI->getPointerOperand(), + LI->getPointerOperand(), TLI)) { + hash_code SubKey = hash_value(RLI->getPointerOperand()); + DoNotReverseVals.insert(RLI); + return SubKey; + } + } + if (LIt->second.size() > 2) { + hash_code SubKey = + hash_value(LIt->second.back()->getPointerOperand()); + DoNotReverseVals.insert(LIt->second.back()); + return SubKey; + } + } + } + LoadKeyUsed.insert(Key); + LoadsMap.try_emplace(Ptr).first->second.push_back(LI); + return hash_value(LI->getPointerOperand()); + }; + while (!Worklist.empty()) { Instruction *TreeN = Worklist.pop_back_val(); SmallVector<Value *> Args; @@ -12269,41 +13189,8 @@ public: // results. for (Value *V : PossibleRedVals) { size_t Key, Idx; - std::tie(Key, Idx) = generateKeySubkey( - V, &TLI, - [&](size_t Key, LoadInst *LI) { - Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); - if (LoadKeyUsed.contains(Key)) { - auto LIt = LoadsMap.find(Ptr); - if (LIt != LoadsMap.end()) { - for (LoadInst *RLI: LIt->second) { - if (getPointersDiff( - RLI->getType(), RLI->getPointerOperand(), - LI->getType(), LI->getPointerOperand(), DL, SE, - /*StrictCheck=*/true)) - return hash_value(RLI->getPointerOperand()); - } - for (LoadInst *RLI : LIt->second) { - if (arePointersCompatible(RLI->getPointerOperand(), - LI->getPointerOperand(), TLI)) { - hash_code SubKey = hash_value(RLI->getPointerOperand()); - DoNotReverseVals.insert(RLI); - return SubKey; - } - } - if (LIt->second.size() > 2) { - hash_code SubKey = - hash_value(LIt->second.back()->getPointerOperand()); - DoNotReverseVals.insert(LIt->second.back()); - return SubKey; - } - } - } - LoadKeyUsed.insert(Key); - LoadsMap.try_emplace(Ptr).first->second.push_back(LI); - return hash_value(LI->getPointerOperand()); - }, - /*AllowAlternate=*/false); + std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, + /*AllowAlternate=*/false); ++PossibleReducedVals[Key][Idx] .insert(std::make_pair(V, 0)) .first->second; @@ -12312,40 +13199,8 @@ public: PossibleReductionOps.rend()); } else { size_t Key, Idx; - std::tie(Key, Idx) = generateKeySubkey( - TreeN, &TLI, - [&](size_t Key, LoadInst *LI) { - Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); - if (LoadKeyUsed.contains(Key)) { - auto LIt = LoadsMap.find(Ptr); - if (LIt != LoadsMap.end()) { - for (LoadInst *RLI: LIt->second) { - if (getPointersDiff(RLI->getType(), - RLI->getPointerOperand(), LI->getType(), - LI->getPointerOperand(), DL, SE, - /*StrictCheck=*/true)) - return hash_value(RLI->getPointerOperand()); - } - for (LoadInst *RLI : LIt->second) { - if (arePointersCompatible(RLI->getPointerOperand(), - LI->getPointerOperand(), TLI)) { - hash_code SubKey = hash_value(RLI->getPointerOperand()); - DoNotReverseVals.insert(RLI); - return SubKey; - } - } - if 
(LIt->second.size() > 2) { - hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand()); - DoNotReverseVals.insert(LIt->second.back()); - return SubKey; - } - } - } - LoadKeyUsed.insert(Key); - LoadsMap.try_emplace(Ptr).first->second.push_back(LI); - return hash_value(LI->getPointerOperand()); - }, - /*AllowAlternate=*/false); + std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey, + /*AllowAlternate=*/false); ++PossibleReducedVals[Key][Idx] .insert(std::make_pair(TreeN, 0)) .first->second; @@ -12407,14 +13262,18 @@ public: // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. - size_t NumReducedVals = + unsigned NumReducedVals = std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0, - [](size_t Num, ArrayRef<Value *> Vals) { + [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned { if (!isGoodForReduction(Vals)) return Num; return Num + Vals.size(); }); - if (NumReducedVals < ReductionLimit) { + if (NumReducedVals < ReductionLimit && + (!AllowHorRdxIdenityOptimization || + all_of(ReducedVals, [](ArrayRef<Value *> RedV) { + return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV); + }))) { for (ReductionOpsType &RdxOps : ReductionOps) for (Value *RdxOp : RdxOps) V.analyzedReductionRoot(cast<Instruction>(RdxOp)); @@ -12428,6 +13287,7 @@ public: DenseMap<Value *, WeakTrackingVH> TrackedVals( ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size()); BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; + SmallVector<std::pair<Value *, Value *>> ReplacedExternals; ExternallyUsedValues.reserve(ExtraArgs.size() + 1); // The same extra argument may be used several times, so log each attempt // to use it. @@ -12448,6 +13308,18 @@ public: return cast<Instruction>(ScalarCond); }; + // Return new VectorizedTree, based on previous value. + auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { + if (VectorizedTree) { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", + ReductionOps); + } + // Initialize the final value in the reduction. + return Res; + }; // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -12459,6 +13331,12 @@ public: continue; IgnoreList.insert(RdxOp); } + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast<FPMathOperator>(U)) + RdxFMF &= FPMO->getFastMathFlags(); bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); // Need to track reduced vals, they may be changed during vectorization of @@ -12519,16 +13397,82 @@ public: } } } + + // Emit code for constant values. 
+ if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 && + allConstant(Candidates)) { + Value *Res = Candidates.front(); + ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond(); + for (Value *VC : ArrayRef(Candidates).drop_front()) { + Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps); + ++VectorizedVals.try_emplace(VC, 0).first->getSecond(); + if (auto *ResI = dyn_cast<Instruction>(Res)) + V.analyzedReductionRoot(ResI); + } + VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res); + continue; + } + unsigned NumReducedVals = Candidates.size(); - if (NumReducedVals < ReductionLimit) + if (NumReducedVals < ReductionLimit && + (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization || + !isSplat(Candidates))) continue; + // Check if we support repeated scalar values processing (optimization of + // original scalar identity operations on matched horizontal reductions). + IsSupportedHorRdxIdentityOp = + AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul && + RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd; + // Gather same values. + MapVector<Value *, unsigned> SameValuesCounter; + if (IsSupportedHorRdxIdentityOp) + for (Value *V : Candidates) + ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second; + // Used to check if the reduced values used same number of times. In this + // case the compiler may produce better code. E.g. if reduced values are + // aabbccdd (8 x values), then the first node of the tree will have a node + // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. + // Plus, the final reduction will be performed on <8 x aabbccdd>. + // Instead compiler may build <4 x abcd> tree immediately, + reduction (4 + // x abcd) * 2. + // Currently it only handles add/fadd/xor. and/or/min/max do not require + // this analysis, other operations may require an extra estimation of + // the profitability. + bool SameScaleFactor = false; + bool OptReusedScalars = IsSupportedHorRdxIdentityOp && + SameValuesCounter.size() != Candidates.size(); + if (OptReusedScalars) { + SameScaleFactor = + (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || + RdxKind == RecurKind::Xor) && + all_of(drop_begin(SameValuesCounter), + [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { + return P.second == SameValuesCounter.front().second; + }); + Candidates.resize(SameValuesCounter.size()); + transform(SameValuesCounter, Candidates.begin(), + [](const auto &P) { return P.first; }); + NumReducedVals = Candidates.size(); + // Have a reduction of the same element. + if (NumReducedVals == 1) { + Value *OrigV = TrackedToOrig.find(Candidates.front())->second; + unsigned Cnt = SameValuesCounter.lookup(OrigV); + Value *RedVal = + emitScaleForReusedOps(Candidates.front(), Builder, Cnt); + VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); + VectorizedVals.try_emplace(OrigV, Cnt); + continue; + } + } + unsigned MaxVecRegSize = V.getMaxVecRegSize(); unsigned EltSize = V.getVectorElementSize(Candidates[0]); - unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize); + unsigned MaxElts = + RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize); unsigned ReduxWidth = std::min<unsigned>( - PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts)); + llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts)); unsigned Start = 0; unsigned Pos = Start; // Restarts vectorization attempt with lower vector factor. 
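As an illustrative aside (not part of this diff): the reused-scalar handling above rests on simple algebraic identities -- adding N copies of x yields x*N, XOR-ing an even number of copies cancels to zero, and and/or/min/max of a repeated operand is the operand itself. A minimal standalone C++ sketch of those identities follows; scaleReusedScalar is a hypothetical stand-in for illustration only, not the helper emitted by the patch.

#include <cassert>
#include <cstdint>

// Hypothetical stand-in (not an LLVM API): what a reduction over Cnt copies
// of the same scalar X collapses to, per reduction kind, mirroring the
// identities described in the comments above.
static int64_t scaleReusedScalar(int64_t X, unsigned Cnt, char RdxKind) {
  switch (RdxKind) {
  case '+': // add: N copies of X sum to X * N
    return X * static_cast<int64_t>(Cnt);
  case '^': // xor: copies cancel in pairs, so only the parity of Cnt matters
    return (Cnt % 2 == 0) ? 0 : X;
  default:  // and/or/min/max: repeating an operand changes nothing
    return X;
  }
}

int main() {
  const int64_t a = 3, b = 5, c = 7, d = 11;
  // "aabbccdd" with a uniform repeat count of 2: the reduction collapses to
  // 2 * (a + b + c + d), which is the SameScaleFactor fast path.
  assert(a + a + b + b + c + c + d + d == 2 * (a + b + c + d));
  assert(scaleReusedScalar(a, 2, '+') == a + a);
  assert(scaleReusedScalar(a, 4, '^') == (a ^ a ^ a ^ a)); // even count -> 0
  assert(scaleReusedScalar(a, 3, '^') == (a ^ a ^ a));     // odd count  -> a
  assert(scaleReusedScalar(a, 5, 'M') == a);               // e.g. smax of copies
  return 0;
}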
@@ -12551,6 +13495,7 @@ public: ReduxWidth /= 2; return IsAnyRedOpGathered; }; + bool AnyVectorized = false; while (Pos < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= ReductionLimit) { // Dependency in tree of the reduction ops - drop this attempt, try @@ -12603,34 +13548,24 @@ public: LocalExternallyUsedValues[TrackedVals[V]]; }); } - // Number of uses of the candidates in the vector of values. - SmallDenseMap<Value *, unsigned> NumUses(Candidates.size()); - for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { - Value *V = Candidates[Cnt]; - ++NumUses.try_emplace(V, 0).first->getSecond(); - } - for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { - Value *V = Candidates[Cnt]; - ++NumUses.try_emplace(V, 0).first->getSecond(); + if (!IsSupportedHorRdxIdentityOp) { + // Number of uses of the candidates in the vector of values. + assert(SameValuesCounter.empty() && + "Reused values counter map is not empty"); + for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { + if (Cnt >= Pos && Cnt < Pos + ReduxWidth) + continue; + Value *V = Candidates[Cnt]; + Value *OrigV = TrackedToOrig.find(V)->second; + ++SameValuesCounter[OrigV]; + } } SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end()); // Gather externally used values. SmallPtrSet<Value *, 4> Visited; - for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { - Value *RdxVal = Candidates[Cnt]; - if (!Visited.insert(RdxVal).second) + for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { + if (Cnt >= Pos && Cnt < Pos + ReduxWidth) continue; - // Check if the scalar was vectorized as part of the vectorization - // tree but not the top node. - if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { - LocalExternallyUsedValues[RdxVal]; - continue; - } - unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal]; - if (NumOps != ReducedValsToOps.find(RdxVal)->second.size()) - LocalExternallyUsedValues[RdxVal]; - } - for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { Value *RdxVal = Candidates[Cnt]; if (!Visited.insert(RdxVal).second) continue; @@ -12640,42 +13575,34 @@ public: LocalExternallyUsedValues[RdxVal]; continue; } - unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal]; - if (NumOps != ReducedValsToOps.find(RdxVal)->second.size()) + Value *OrigV = TrackedToOrig.find(RdxVal)->second; + unsigned NumOps = + VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV]; + if (NumOps != ReducedValsToOps.find(OrigV)->second.size()) LocalExternallyUsedValues[RdxVal]; } + // Do not need the list of reused scalars in regular mode anymore. + if (!IsSupportedHorRdxIdentityOp) + SameValuesCounter.clear(); for (Value *RdxVal : VL) if (RequiredExtract.contains(RdxVal)) LocalExternallyUsedValues[RdxVal]; + // Update LocalExternallyUsedValues for the scalar, replaced by + // extractelement instructions. + for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) { + auto It = ExternallyUsedValues.find(Pair.first); + if (It == ExternallyUsedValues.end()) + continue; + LocalExternallyUsedValues[Pair.second].append(It->second); + } V.buildExternalUses(LocalExternallyUsedValues); V.computeMinimumValueSizes(); - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (Value *U : IgnoreList) - if (auto *FPMO = dyn_cast<FPMathOperator>(U)) - RdxFMF &= FPMO->getFastMathFlags(); // Estimate cost. 
InstructionCost TreeCost = V.getTreeCost(VL); InstructionCost ReductionCost = - getReductionCost(TTI, VL, ReduxWidth, RdxFMF); - if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) { - Instruction *MainOp = V.getFirstNodeMainOp(); - for (Value *V : VL) { - auto *VI = dyn_cast<LoadInst>(V); - // Add the costs of scalar GEP pointers, to be removed from the - // code. - if (!VI || VI == MainOp) - continue; - auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand()); - if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices()) - continue; - TreeCost -= TTI->getArithmeticInstrCost( - Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput); - } - } + getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); if (!Cost.isValid()) @@ -12716,8 +13643,8 @@ public: InsertPt = GetCmpForMinMaxReduction(RdxRootInst); // Vectorize a tree. - Value *VectorizedRoot = - V.vectorizeTree(LocalExternallyUsedValues, InsertPt); + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, + ReplacedExternals, InsertPt); Builder.SetInsertPoint(InsertPt); @@ -12727,29 +13654,48 @@ public: if (isBoolLogicOp(RdxRootInst)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // Emit code to correctly handle reused reduced values, if required. + if (OptReusedScalars && !SameScaleFactor) { + VectorizedRoot = + emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(), + SameValuesCounter, TrackedToOrig); + } + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation( - cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); - } + // Improved analysis for add/fadd/xor reductions with same scale factor + // for all operands of reductions. We can emit scalar ops for them + // instead. + if (OptReusedScalars && SameScaleFactor) + ReducedSubTree = emitScaleForReusedOps( + ReducedSubTree, Builder, SameValuesCounter.front().second); + + VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); // Count vectorized reduced values to exclude them from final reduction. for (Value *RdxVal : VL) { - ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0) - .first->getSecond(); + Value *OrigV = TrackedToOrig.find(RdxVal)->second; + if (IsSupportedHorRdxIdentityOp) { + VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]); + continue; + } + ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond(); if (!V.isVectorized(RdxVal)) RequiredExtract.insert(RdxVal); } Pos += ReduxWidth; Start = Pos; - ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); + ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); + AnyVectorized = true; + } + if (OptReusedScalars && !AnyVectorized) { + for (const std::pair<Value *, unsigned> &P : SameValuesCounter) { + Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second); + VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); + Value *OrigV = TrackedToOrig.find(P.first)->second; + VectorizedVals.try_emplace(OrigV, P.second); + } + continue; } } if (VectorizedTree) { @@ -12757,7 +13703,7 @@ public: // possible problem with poison propagation. 
If not possible to reorder // (both operands are originally RHS), emit an extra freeze instruction // for the LHS operand. - //I.e., if we have original code like this: + // I.e., if we have original code like this: // RedOp1 = select i1 ?, i1 LHS, i1 false // RedOp2 = select i1 RHS, i1 ?, i1 false @@ -12892,7 +13838,8 @@ private: /// Calculate the cost of a reduction. InstructionCost getReductionCost(TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals, - unsigned ReduxWidth, FastMathFlags FMF) { + bool IsCmpSelMinMax, unsigned ReduxWidth, + FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Value *FirstReducedVal = ReducedVals.front(); Type *ScalarTy = FirstReducedVal->getType(); @@ -12900,7 +13847,36 @@ private: InstructionCost VectorCost = 0, ScalarCost; // If all of the reduced values are constant, the vector cost is 0, since // the reduction value can be calculated at the compile time. - bool AllConsts = all_of(ReducedVals, isConstant); + bool AllConsts = allConstant(ReducedVals); + auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) { + InstructionCost Cost = 0; + // Scalar cost is repeated for N-1 elements. + int Cnt = ReducedVals.size(); + for (Value *RdxVal : ReducedVals) { + if (Cnt == 1) + break; + --Cnt; + if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) { + Cost += GenCostFn(); + continue; + } + InstructionCost ScalarCost = 0; + for (User *U : RdxVal->users()) { + auto *RdxOp = cast<Instruction>(U); + if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { + ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); + continue; + } + ScalarCost = InstructionCost::getInvalid(); + break; + } + if (ScalarCost.isValid()) + Cost += ScalarCost; + else + Cost += GenCostFn(); + } + return Cost; + }; switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -12913,52 +13889,32 @@ private: if (!AllConsts) VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); - ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); + ScalarCost = EvaluateScalarCost([&]() { + return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); + }); break; } case RecurKind::FMax: - case RecurKind::FMin: { - auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - if (!AllConsts) { - auto *VecCondTy = - cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsUnsigned=*/false, CostKind); - } - CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); - ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, - SclCondTy, RdxPred, CostKind) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - SclCondTy, RdxPred, CostKind); - break; - } + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: { - auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - if (!AllConsts) { - auto *VecCondTy = - cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - IsUnsigned, CostKind); - } - CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); - ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, - SclCondTy, RdxPred, CostKind) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - SclCondTy, RdxPred, CostKind); + Intrinsic::ID Id = 
getMinMaxReductionIntrinsicOp(RdxKind); + if (!AllConsts) + VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind); + ScalarCost = EvaluateScalarCost([&]() { + IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF); + return TTI->getIntrinsicInstrCost(ICA, CostKind); + }); break; } default: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } - // Scalar cost is repeated for N-1 elements. - ScalarCost *= (ReduxWidth - 1); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost << " for reduction that starts with " << *FirstReducedVal << " (It is a splitting reduction)\n"); @@ -12977,8 +13933,148 @@ private: ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } -}; + /// Emits optimized code for unique scalar value reused \p Cnt times. + Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, + unsigned Cnt) { + assert(IsSupportedHorRdxIdentityOp && + "The optimization of matched scalar identity horizontal reductions " + "must be supported."); + switch (RdxKind) { + case RecurKind::Add: { + // res = mul vv, n + Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt); + LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " + << VectorizedValue << ". (HorRdx)\n"); + return Builder.CreateMul(VectorizedValue, Scale); + } + case RecurKind::Xor: { + // res = n % 2 ? 0 : vv + LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue + << ". (HorRdx)\n"); + if (Cnt % 2 == 0) + return Constant::getNullValue(VectorizedValue->getType()); + return VectorizedValue; + } + case RecurKind::FAdd: { + // res = fmul v, n + Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt); + LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " + << VectorizedValue << ". (HorRdx)\n"); + return Builder.CreateFMul(VectorizedValue, Scale); + } + case RecurKind::And: + case RecurKind::Or: + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: + case RecurKind::FMax: + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: + // res = vv + return VectorizedValue; + case RecurKind::Mul: + case RecurKind::FMul: + case RecurKind::FMulAdd: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + case RecurKind::None: + llvm_unreachable("Unexpected reduction kind for repeated scalar."); + } + return nullptr; + } + + /// Emits actual operation for the scalar identity values, found during + /// horizontal reduction analysis. + Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, + ArrayRef<Value *> VL, + const MapVector<Value *, unsigned> &SameValuesCounter, + const DenseMap<Value *, Value *> &TrackedToOrig) { + assert(IsSupportedHorRdxIdentityOp && + "The optimization of matched scalar identity horizontal reductions " + "must be supported."); + switch (RdxKind) { + case RecurKind::Add: { + // root = mul prev_root, <1, 1, n, 1> + SmallVector<Constant *> Vals; + for (Value *V : VL) { + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); + } + auto *Scale = ConstantVector::get(Vals); + LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of " + << VectorizedValue << ". (HorRdx)\n"); + return Builder.CreateMul(VectorizedValue, Scale); + } + case RecurKind::And: + case RecurKind::Or: + // No need for multiple or/and(s). + LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue + << ". 
(HorRdx)\n"); + return VectorizedValue; + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: + case RecurKind::FMax: + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: + // No need for multiple min/max(s) of the same value. + LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue + << ". (HorRdx)\n"); + return VectorizedValue; + case RecurKind::Xor: { + // Replace values with even number of repeats with 0, since + // x xor x = 0. + // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6, + // 7>, if elements 4th and 6th elements have even number of repeats. + SmallVector<int> Mask( + cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(), + PoisonMaskElem); + std::iota(Mask.begin(), Mask.end(), 0); + bool NeedShuffle = false; + for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { + Value *V = VL[I]; + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + if (Cnt % 2 == 0) { + Mask[I] = VF; + NeedShuffle = true; + } + } + LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I + : Mask) dbgs() + << I << " "; + dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n"); + if (NeedShuffle) + VectorizedValue = Builder.CreateShuffleVector( + VectorizedValue, + ConstantVector::getNullValue(VectorizedValue->getType()), Mask); + return VectorizedValue; + } + case RecurKind::FAdd: { + // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> + SmallVector<Constant *> Vals; + for (Value *V : VL) { + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + Vals.push_back(ConstantFP::get(V->getType(), Cnt)); + } + auto *Scale = ConstantVector::get(Vals); + return Builder.CreateFMul(VectorizedValue, Scale); + } + case RecurKind::Mul: + case RecurKind::FMul: + case RecurKind::FMulAdd: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + case RecurKind::None: + llvm_unreachable("Unexpected reduction kind for reused scalars."); + } + return nullptr; + } +}; } // end anonymous namespace static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) { @@ -13075,15 +14171,15 @@ static bool findBuildAggregate(Instruction *LastInsertInst, return false; } -/// Try and get a reduction value from a phi node. +/// Try and get a reduction instruction from a phi node. /// /// Given a phi node \p P in a block \p ParentBB, consider possible reductions /// if they come from either \p ParentBB or a containing loop latch. /// /// \returns A candidate reduction value if possible, or \code nullptr \endcode /// if not possible. -static Value *getReductionValue(const DominatorTree *DT, PHINode *P, - BasicBlock *ParentBB, LoopInfo *LI) { +static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P, + BasicBlock *ParentBB, LoopInfo *LI) { // There are situations where the reduction value is not dominated by the // reduction phi. Vectorizing such cases has been reported to cause // miscompiles. See PR25787. @@ -13092,13 +14188,13 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, DT->dominates(P->getParent(), cast<Instruction>(R)->getParent()); }; - Value *Rdx = nullptr; + Instruction *Rdx = nullptr; // Return the incoming value if it comes from the same BB as the phi node. 
if (P->getIncomingBlock(0) == ParentBB) { - Rdx = P->getIncomingValue(0); + Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); } else if (P->getIncomingBlock(1) == ParentBB) { - Rdx = P->getIncomingValue(1); + Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); } if (Rdx && DominatedReduxValue(Rdx)) @@ -13115,9 +14211,9 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, // There is a loop latch, return the incoming value if it comes from // that. This reduction pattern occasionally turns up. if (P->getIncomingBlock(0) == BBLatch) { - Rdx = P->getIncomingValue(0); + Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); } else if (P->getIncomingBlock(1) == BBLatch) { - Rdx = P->getIncomingValue(1); + Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); } if (Rdx && DominatedReduxValue(Rdx)) @@ -13133,6 +14229,10 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { return true; if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) return true; + if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1)))) + return true; if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1)))) return true; if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1)))) @@ -13144,21 +14244,63 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { return false; } +/// We could have an initial reduction that is not an add. +/// r *= v1 + v2 + v3 + v4 +/// In such a case start looking for a tree rooted in the first '+'. +/// \Returns the new root if found, which may be nullptr if not an instruction. +static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi, + Instruction *Root) { + assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) || + isa<IntrinsicInst>(Root)) && + "Expected binop, select, or intrinsic for reduction matching"); + Value *LHS = + Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root)); + Value *RHS = + Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1); + if (LHS == Phi) + return dyn_cast<Instruction>(RHS); + if (RHS == Phi) + return dyn_cast<Instruction>(LHS); + return nullptr; +} + +/// \p Returns the first operand of \p I that does not match \p Phi. If +/// operand is not an instruction it returns nullptr. +static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) { + Value *Op0 = nullptr; + Value *Op1 = nullptr; + if (!matchRdxBop(I, Op0, Op1)) + return nullptr; + return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0); +} + +/// \Returns true if \p I is a candidate instruction for reduction vectorization. +static bool isReductionCandidate(Instruction *I) { + bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value())); + Value *B0 = nullptr, *B1 = nullptr; + bool IsBinop = matchRdxBop(I, B0, B1); + return IsBinop || IsSelect; +} + bool SLPVectorizerPass::vectorizeHorReduction( - PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) { if (!ShouldVectorizeHor) return false; + bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root); - auto *Root = dyn_cast_or_null<Instruction>(V); - if (!Root) + if (Root->getParent() != BB || isa<PHINode>(Root)) return false; - if (!isa<BinaryOperator>(Root)) - P = nullptr; + // If we can find a secondary reduction root, use that instead. 
+ auto SelectRoot = [&]() { + if (TryOperandsAsNewSeeds && isReductionCandidate(Root) && + HorizontalReduction::getRdxKind(Root) != RecurKind::None) + if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root)) + return NewRoot; + return Root; + }; - if (Root->getParent() != BB || isa<PHINode>(Root)) - return false; // Start analysis starting from Root instruction. If horizontal reduction is // found, try to vectorize it. If it is not a horizontal reduction or // vectorization is not possible or not effective, and currently analyzed @@ -13171,22 +14313,32 @@ bool SLPVectorizerPass::vectorizeHorReduction( // If a horizintal reduction was not matched or vectorized we collect // instructions for possible later attempts for vectorization. std::queue<std::pair<Instruction *, unsigned>> Stack; - Stack.emplace(Root, 0); + Stack.emplace(SelectRoot(), 0); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; - bool IsBinop = matchRdxBop(Inst, B0, B1); - bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); - if (IsBinop || IsSelect) { - HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI)) - return HorRdx.tryToReduce(R, TTI, *TLI); + if (!isReductionCandidate(Inst)) + return nullptr; + HorizontalReduction HorRdx; + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) + return nullptr; + return HorRdx.tryToReduce(R, TTI, *TLI); + }; + auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { + if (TryOperandsAsNewSeeds && FutureSeed == Root) { + FutureSeed = getNonPhiOperand(Root, P); + if (!FutureSeed) + return false; } - return nullptr; + // Do not collect CmpInst or InsertElementInst/InsertValueInst as their + // analysis is done separately. + if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed)) + PostponedInsts.push_back(FutureSeed); + return true; }; + while (!Stack.empty()) { Instruction *Inst; unsigned Level; @@ -13197,37 +14349,19 @@ bool SLPVectorizerPass::vectorizeHorReduction( // iteration while stack was populated before that happened. if (R.isDeleted(Inst)) continue; - Value *B0 = nullptr, *B1 = nullptr; - if (Value *V = TryToReduce(Inst, B0, B1)) { + if (Value *VectorizedV = TryToReduce(Inst)) { Res = true; - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - if (auto *I = dyn_cast<Instruction>(V)) { + if (auto *I = dyn_cast<Instruction>(VectorizedV)) { // Try to find another reduction. Stack.emplace(I, Level); continue; } } else { - bool IsBinop = B0 && B1; - if (P && IsBinop) { - Inst = dyn_cast<Instruction>(B0); - if (Inst == P) - Inst = dyn_cast<Instruction>(B1); - if (!Inst) { - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - continue; - } + // We could not vectorize `Inst` so try to use it as a future seed. + if (!TryAppendToPostponedInsts(Inst)) { + assert(Stack.empty() && "Expected empty stack"); + break; } - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - // Do not collect CmpInst or InsertElementInst/InsertValueInst as their - // analysis is done separately. 
- if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst)) - PostponedInsts.push_back(Inst); } // Try to vectorize operands. @@ -13246,11 +14380,11 @@ bool SLPVectorizerPass::vectorizeHorReduction( return Res; } -bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, +bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { SmallVector<WeakTrackingVH> PostponedInsts; - bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts); + bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts); Res |= tryToVectorize(PostponedInsts, R); return Res; } @@ -13297,13 +14431,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, } template <typename T> -static bool -tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, - function_ref<unsigned(T *)> Limit, - function_ref<bool(T *, T *)> Comparator, - function_ref<bool(T *, T *)> AreCompatible, - function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, - bool LimitForRegisterSize) { +static bool tryToVectorizeSequence( + SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, + function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, + bool MaxVFOnly, BoUpSLP &R) { bool Changed = false; // Sort by type, parent, operands. stable_sort(Incoming, Comparator); @@ -13331,21 +14463,29 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // same/alternate ops only, this may result in some extra final // vectorization. if (NumElts > 1 && - TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) { // Success start over because instructions might have been changed. Changed = true; - } else if (NumElts < Limit(*IncIt) && - (Candidates.empty() || - Candidates.front()->getType() == (*IncIt)->getType())) { - Candidates.append(IncIt, std::next(IncIt, NumElts)); + } else { + /// \Returns the minimum number of elements that we will attempt to + /// vectorize. + auto GetMinNumElements = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + if (NumElts < GetMinNumElements(*IncIt) && + (Candidates.empty() || + Candidates.front()->getType() == (*IncIt)->getType())) { + Candidates.append(IncIt, std::next(IncIt, NumElts)); + } } // Final attempt to vectorize instructions with the same types. if (Candidates.size() > 1 && (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) { + if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) { // Success start over because instructions might have been changed. Changed = true; - } else if (LimitForRegisterSize) { + } else if (MaxVFOnly) { // Try to vectorize using small vectors. for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;) { @@ -13353,9 +14493,8 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) ++SameTypeIt; unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && - TryToVectorizeHelper(ArrayRef(It, NumElts), - /*LimitForRegisterSize=*/false)) + if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts), + /*MaxVFOnly=*/false)) Changed = true; It = SameTypeIt; } @@ -13378,11 +14517,12 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, /// of the second cmp instruction. 
template <bool IsCompatibility> static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, - function_ref<bool(Instruction *)> IsDeleted) { + const DominatorTree &DT) { + assert(isValidElementType(V->getType()) && + isValidElementType(V2->getType()) && + "Expected valid element types only."); auto *CI1 = cast<CmpInst>(V); auto *CI2 = cast<CmpInst>(V2); - if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; if (CI1->getOperand(0)->getType()->getTypeID() < CI2->getOperand(0)->getType()->getTypeID()) return !IsCompatibility; @@ -13411,31 +14551,102 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, return false; if (auto *I1 = dyn_cast<Instruction>(Op1)) if (auto *I2 = dyn_cast<Instruction>(Op2)) { - if (I1->getParent() != I2->getParent()) - return false; + if (IsCompatibility) { + if (I1->getParent() != I2->getParent()) + return false; + } else { + // Try to compare nodes with same parent. + DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent()); + DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent()); + if (!NodeI1) + return NodeI2 != nullptr; + if (!NodeI2) + return false; + assert((NodeI1 == NodeI2) == + (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeI1 != NodeI2) + return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); + } InstructionsState S = getSameOpcode({I1, I2}, TLI); - if (S.getOpcode()) + if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle())) continue; - return false; + return !IsCompatibility && I1->getOpcode() < I2->getOpcode(); } } return IsCompatibility; } -bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions, - BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { +template <typename ItT> +bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, + BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + // Try to find reductions first. + for (CmpInst *I : CmpInsts) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + if (auto *RootOp = dyn_cast<Instruction>(Op)) + Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (CmpInst *I : CmpInsts) { + if (R.isDeleted(I)) + continue; + Changed |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + auto CompareSorter = [&](Value *V, Value *V2) { + if (V == V2) + return false; + return compareCmp<false>(V, V2, *TLI, *DT); + }; + + auto AreCompatibleCompares = [&](Value *V1, Value *V2) { + if (V1 == V2) + return true; + return compareCmp<true>(V1, V2, *TLI, *DT); + }; + + SmallVector<Value *> Vals; + for (Instruction *V : CmpInsts) + if (!R.isDeleted(V) && isValidElementType(V->getType())) + Vals.push_back(V); + if (Vals.size() <= 1) + return Changed; + Changed |= tryToVectorizeSequence<Value>( + Vals, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { + // Exclude possible reductions from other blocks. 
+ bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + auto *Select = dyn_cast<SelectInst>(U); + return Select && + Select->getParent() != cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, MaxVFOnly); + }, + /*MaxVFOnly=*/true, R); + return Changed; +} + +bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, + BasicBlock *BB, BoUpSLP &R) { + assert(all_of(Instructions, + [](auto *I) { + return isa<InsertElementInst, InsertValueInst>(I); + }) && + "This function only accepts Insert instructions"); bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; SmallVector<WeakTrackingVH> PostponedInsts; // pass1 - try to vectorize reductions only for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (isa<CmpInst>(I)) { - PostponedCmps.push_back(I); - continue; - } OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts); } // pass2 - try to match and vectorize a buildvector sequence. @@ -13451,63 +14662,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions, // Now try to vectorize postponed instructions. OpsChanged |= tryToVectorize(PostponedInsts, R); - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - // Try to vectorize list of compares. - // Sort by type, compare predicate, etc. - auto CompareSorter = [&](Value *V, Value *V2) { - return compareCmp<false>(V, V2, *TLI, - [&R](Instruction *I) { return R.isDeleted(I); }); - }; - - auto AreCompatibleCompares = [&](Value *V1, Value *V2) { - if (V1 == V2) - return true; - return compareCmp<true>(V1, V2, *TLI, - [&R](Instruction *I) { return R.isDeleted(I); }); - }; - auto Limit = [&R](Value *V) { - unsigned EltSize = R.getVectorElementSize(V); - return std::max(2U, R.getMaxVecRegSize() / EltSize); - }; - - SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); - OpsChanged |= tryToVectorizeSequence<Value>( - Vals, Limit, CompareSorter, AreCompatibleCompares, - [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { - // Exclude possible reductions from other blocks. - bool ArePossiblyReducedInOtherBlock = - any_of(Candidates, [](Value *V) { - return any_of(V->users(), [V](User *U) { - return isa<SelectInst>(U) && - cast<SelectInst>(U)->getParent() != - cast<Instruction>(V)->getParent(); - }); - }); - if (ArePossiblyReducedInOtherBlock) - return false; - return tryToVectorizeList(Candidates, R, LimitForRegisterSize); - }, - /*LimitForRegisterSize=*/true); - Instructions.clear(); - } else { - Instructions.clear(); - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. 
- Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend()); - } + Instructions.clear(); return OpsChanged; } @@ -13603,10 +14758,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return true; }; - auto Limit = [&R](Value *V) { - unsigned EltSize = R.getVectorElementSize(V); - return std::max(2U, R.getMaxVecRegSize() / EltSize); - }; bool HaveVectorizedPhiNodes = false; do { @@ -13648,19 +14799,44 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>( - Incoming, Limit, PHICompare, AreCompatiblePHIs, - [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { - return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + Incoming, PHICompare, AreCompatiblePHIs, + [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { + return tryToVectorizeList(Candidates, R, MaxVFOnly); }, - /*LimitForRegisterSize=*/true); + /*MaxVFOnly=*/true, R); Changed |= HaveVectorizedPhiNodes; VisitedInstrs.insert(Incoming.begin(), Incoming.end()); } while (HaveVectorizedPhiNodes); VisitedInstrs.clear(); - InstSetVector PostProcessInstructions; - SmallDenseSet<Instruction *, 4> KeyNodes; + InstSetVector PostProcessInserts; + SmallSetVector<CmpInst *, 8> PostProcessCmps; + // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true + // also vectorizes `PostProcessCmps`. + auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { + bool Changed = vectorizeInserts(PostProcessInserts, BB, R); + if (VectorizeCmps) { + Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R); + PostProcessCmps.clear(); + } + PostProcessInserts.clear(); + return Changed; + }; + // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`. + auto IsInPostProcessInstrs = [&](Instruction *I) { + if (auto *Cmp = dyn_cast<CmpInst>(I)) + return PostProcessCmps.contains(Cmp); + return isa<InsertElementInst, InsertValueInst>(I) && + PostProcessInserts.contains(I); + }; + // Returns true if `I` is an instruction without users, like terminator, or + // function call with ignored return value, store. Ignore unused instructions + // (basing on instruction type, except for CallInst and InvokeInst). + auto HasNoUsers = [](Instruction *I) { + return I->use_empty() && + (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I)); + }; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // Skip instructions with scalable type. The num of elements is unknown at // compile-time for scalable type. @@ -13672,9 +14848,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { - if (it->use_empty() && KeyNodes.contains(&*it) && - vectorizeSimpleInstructions(PostProcessInstructions, BB, R, - it->isTerminator())) { + if (HasNoUsers(&*it) && + VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; @@ -13692,8 +14867,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. 
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { + Instruction *Root = getReductionInstr(DT, P, BB, LI); + if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -13714,19 +14889,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Postponed instructions should not be vectorized here, delay their // vectorization. if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I)); - PI && !PostProcessInstructions.contains(PI)) - Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + PI && !IsInPostProcessInstrs(PI)) + Changed |= vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R, TTI); } continue; } - // Ran into an instruction without users, like terminator, or function call - // with ignored return value, store. Ignore unused instructions (basing on - // instruction type, except for CallInst and InvokeInst). - if (it->use_empty() && - (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) { - KeyNodes.insert(&*it); + if (HasNoUsers(&*it)) { bool OpsChanged = false; auto *SI = dyn_cast<StoreInst>(it); bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; @@ -13746,16 +14916,16 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Postponed instructions should not be vectorized here, delay their // vectorization. if (auto *VI = dyn_cast<Instruction>(V); - VI && !PostProcessInstructions.contains(VI)) + VI && !IsInPostProcessInstrs(VI)) // Try to match and vectorize a horizontal reduction. - OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); + OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI); } } // Start vectorization of post-process list of instructions from the // top-tree instructions to try to vectorize as many instructions as // possible. - OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R, - it->isTerminator()); + OpsChanged |= + VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator()); if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. @@ -13766,8 +14936,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } } - if (isa<CmpInst, InsertElementInst, InsertValueInst>(it)) - PostProcessInstructions.insert(&*it); + if (isa<InsertElementInst, InsertValueInst>(it)) + PostProcessInserts.insert(&*it); + else if (isa<CmpInst>(it)) + PostProcessCmps.insert(cast<CmpInst>(&*it)); } return Changed; @@ -13928,10 +15100,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; - auto Limit = [&R, this](StoreInst *SI) { - unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); - return R.getMinVF(EltSize); - }; // Attempt to sort and vectorize each of the store-groups. 
for (auto &Pair : Stores) { @@ -13945,28 +15113,11 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { continue; Changed |= tryToVectorizeSequence<StoreInst>( - Pair.second, Limit, StoreSorter, AreCompatibleStores, + Pair.second, StoreSorter, AreCompatibleStores, [this, &R](ArrayRef<StoreInst *> Candidates, bool) { return vectorizeStores(Candidates, R); }, - /*LimitForRegisterSize=*/false); + /*MaxVFOnly=*/false, R); } return Changed; } - -char SLPVectorizer::ID = 0; - -static const char lv_name[] = "SLP Vectorizer"; - -INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) -INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) - -Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 733d2e1c667b..1271d1424c03 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -95,7 +95,7 @@ class VPRecipeBuilder { /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same /// decision from \p Range.Start to \p Range.End. VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range) const; + VFRange &Range, VPlanPtr &Plan); /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe /// if it can. The function should only be called if the cost-model indicates @@ -136,11 +136,11 @@ public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. - VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan); + VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); /// A helper function that computes the predicate of the edge between SRC /// and DST. - VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan); /// Mark given ingredient for recording its recipe once one is created for /// it. @@ -159,19 +159,11 @@ public: return Ingredient2Recipe[I]; } - /// Create a replicating region for \p PredRecipe. - VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, - VPlanPtr &Plan); - - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it - /// is predicated. \return \p VPBB augmented with this new recipe if \p I is - /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new - /// Region. Update the packing decision of predicated instructions if they - /// feed \p I. Range.End may be decreased to ensure same recipe behavior from - /// \p Range.Start to \p Range.End. - VPBasicBlock *handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - VPlanPtr &Plan); + /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as + /// last operand. Range.End may be decreased to ensure same recipe behavior + /// from \p Range.Start to \p Range.End. 
+ VPRecipeOrVPValueTy handleReplication(Instruction *I, VFRange &Range, + VPlan &Plan); /// Add the incoming values from the backedge to reduction & first-order /// recurrence cross-iteration phis. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d554f438c804..e81b88fd8099 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" @@ -46,7 +47,10 @@ #include <vector> using namespace llvm; + +namespace llvm { extern cl::opt<bool> EnableVPlanNativePath; +} #define DEBUG_TYPE "vplan" @@ -160,8 +164,9 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() { } void VPBlockBase::setPlan(VPlan *ParentPlan) { - assert(ParentPlan->getEntry() == this && - "Can only set plan on its entry block."); + assert( + (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) && + "Can only set plan on its entry or preheader block."); Plan = ParentPlan; } @@ -209,7 +214,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { } Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { - if (!Def->hasDefiningRecipe()) + if (Def->isLiveIn()) return Def->getLiveInIRValue(); if (hasScalarValue(Def, Instance)) { @@ -243,11 +248,19 @@ void VPTransformState::addNewMetadata(Instruction *To, } void VPTransformState::addMetadata(Instruction *To, Instruction *From) { + // No source instruction to transfer metadata from? + if (!From) + return; + propagateMetadata(To, From); addNewMetadata(To, From); } void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { + // No source instruction to transfer metadata from? + if (!From) + return; + for (Value *V : To) { if (Instruction *I = dyn_cast<Instruction>(V)) addMetadata(I, From); @@ -265,7 +278,7 @@ void VPTransformState::setDebugLocFromInst(const Value *V) { // When a FSDiscriminator is enabled, we don't need to add the multiply // factors to the discriminators. if (DIL && Inst->getFunction()->shouldEmitDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + !Inst->isDebugOrPseudoInst() && !EnableFSDiscriminator) { // FIXME: For scalable vectors, assume vscale=1. 
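// [Editor's note: illustrative sketch, not part of this patch.]
// The duplication factor applied to the debug location below is UF times the
// known minimum number of vector elements; per the FIXME above, scalable VFs
// are treated as if vscale were 1. A worked example with hypothetical values:
#include "llvm/Support/TypeSize.h"
static unsigned debugLocDupFactorSketch() {
  unsigned UF = 2;
  llvm::ElementCount VF = llvm::ElementCount::getScalable(4); // <vscale x 4 x ...>
  return UF * VF.getKnownMinValue();                          // 2 * 4 = 8
}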
auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); @@ -577,7 +590,9 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, #endif VPlan::~VPlan() { - clearLiveOuts(); + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); if (Entry) { VPValue DummyValue; @@ -585,15 +600,23 @@ VPlan::~VPlan() { Block->dropAllReferences(&DummyValue); VPBlockBase::deleteCFG(Entry); + + Preheader->dropAllReferences(&DummyValue); + delete Preheader; } - for (VPValue *VPV : VPValuesToFree) + for (VPValue *VPV : VPLiveInsToFree) delete VPV; - if (TripCount) - delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; - for (auto &P : VPExternalDefs) - delete P.second; +} + +VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { + VPBasicBlock *Preheader = new VPBasicBlock("ph"); + VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader); + Plan->TripCount = + vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE); + return Plan; } VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { @@ -609,13 +632,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, bool IsEpilogueVectorization) { - - // Check if the trip count is needed, and if so build it. - if (TripCount && TripCount->getNumUsers()) { - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(TripCount, TripCountV, Part); - } - // Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); @@ -636,7 +652,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // needs to be changed from zero to the value after the main vector loop. // FIXME: Improve modeling for canonical IV start values in the epilogue loop. 
if (CanonicalIVStartValue) { - VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue); + VPValue *VPV = getVPValueOrAddLiveIn(CanonicalIVStartValue); auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { @@ -650,8 +666,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, VPInstruction::CanonicalIVIncrementNUW; }) && "the canonical IV should only be used by its increments or " - "ScalarIVSteps when " - "resetting the start value"); + "ScalarIVSteps when resetting the start value"); IV->setOperand(0, VPV); } } @@ -748,13 +763,25 @@ void VPlan::print(raw_ostream &O) const { if (VectorTripCount.getNumUsers() > 0) { O << "\nLive-in "; VectorTripCount.printAsOperand(O, SlotTracker); - O << " = vector-trip-count\n"; + O << " = vector-trip-count"; } if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { O << "\nLive-in "; BackedgeTakenCount->printAsOperand(O, SlotTracker); - O << " = backedge-taken count\n"; + O << " = backedge-taken count"; + } + + O << "\n"; + if (TripCount->isLiveIn()) + O << "Live-in "; + TripCount->printAsOperand(O, SlotTracker); + O << " = original trip-count"; + O << "\n"; + + if (!getPreheader()->empty()) { + O << "\n"; + getPreheader()->print(O, "", SlotTracker); } for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) { @@ -765,11 +792,7 @@ void VPlan::print(raw_ostream &O) const { if (!LiveOuts.empty()) O << "\n"; for (const auto &KV : LiveOuts) { - O << "Live-out "; - KV.second->getPhi()->printAsOperand(O); - O << " = "; - KV.second->getOperand(0)->printAsOperand(O, SlotTracker); - O << "\n"; + KV.second->print(O, SlotTracker); } O << "}\n"; @@ -882,6 +905,8 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; + dumpBlock(Plan.getPreheader()); + for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry())) dumpBlock(Block); @@ -1086,26 +1111,27 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, } void VPSlotTracker::assignSlot(const VPValue *V) { - assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); + assert(!Slots.contains(V) && "VPValue already has a slot!"); Slots[V] = NextSlot++; } void VPSlotTracker::assignSlots(const VPlan &Plan) { - - for (const auto &P : Plan.VPExternalDefs) - assignSlot(P.second); - assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); + assignSlots(Plan.getPreheader()); ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>> RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); for (const VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT)) - for (const VPRecipeBase &Recipe : *VPBB) - for (VPValue *Def : Recipe.definedValues()) - assignSlot(Def); + assignSlots(VPBB); +} + +void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { + for (const VPRecipeBase &Recipe : *VPBB) + for (VPValue *Def : Recipe.definedValues()) + assignSlot(Def); } bool vputils::onlyFirstLaneUsed(VPValue *Def) { @@ -1115,13 +1141,17 @@ bool vputils::onlyFirstLaneUsed(VPValue *Def) { VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE) { + if (auto *Expanded = Plan.getSCEVExpansion(Expr)) + return Expanded; + VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - return Plan.getOrAddExternalDef(E->getValue()); - if (auto *E = dyn_cast<SCEVUnknown>(Expr)) - return Plan.getOrAddExternalDef(E->getValue()); - - VPBasicBlock *Preheader = 
Plan.getEntry()->getEntryBasicBlock(); - VPExpandSCEVRecipe *Step = new VPExpandSCEVRecipe(Expr, SE); - Preheader->appendRecipe(Step); - return Step; + Expanded = Plan.getVPValueOrAddLiveIn(E->getValue()); + else if (auto *E = dyn_cast<SCEVUnknown>(Expr)) + Expanded = Plan.getVPValueOrAddLiveIn(E->getValue()); + else { + Expanded = new VPExpandSCEVRecipe(Expr, SE); + Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe()); + } + Plan.addSCEVExpansion(Expr, Expanded); + return Expanded; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 986faaf99664..73313465adea 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -25,7 +25,6 @@ #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -33,11 +32,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/IR/Operator.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -47,11 +47,9 @@ namespace llvm { class BasicBlock; class DominatorTree; -class InductionDescriptor; class InnerLoopVectorizer; class IRBuilderBase; class LoopInfo; -class PredicateScalarEvolution; class raw_ostream; class RecurrenceDescriptor; class SCEV; @@ -62,6 +60,7 @@ class VPlan; class VPReplicateRecipe; class VPlanSlp; class Value; +class LoopVersioning; namespace Intrinsic { typedef unsigned ID; @@ -76,16 +75,17 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE); +const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, + Loop *CurLoop = nullptr); /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: -/// [1, 9) = {1, 2, 4, 8} +/// [1, 16) = {1, 2, 4, 8} struct VFRange { // A power of 2. const ElementCount Start; - // Need not be a power of 2. If End <= Start range is empty. + // A power of 2. If End <= Start range is empty. ElementCount End; bool isEmpty() const { @@ -98,6 +98,33 @@ struct VFRange { "Both Start and End should have the same scalable flag"); assert(isPowerOf2_32(Start.getKnownMinValue()) && "Expected Start to be a power of 2"); + assert(isPowerOf2_32(End.getKnownMinValue()) && + "Expected End to be a power of 2"); + } + + /// Iterator to iterate over vectorization factors in a VFRange. 
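// [Editor's note: illustrative sketch, not part of this patch.]
// With Start and End now both required to be powers of two, the iterator added
// just below walks the factors in [Start, End) by doubling, e.g. [1, 16)
// visits {1, 2, 4, 8}. A plain-integer analogue (hypothetical helper):
#include <vector>
static std::vector<unsigned> vfsInRangeSketch(unsigned Start, unsigned End) {
  std::vector<unsigned> VFs;
  for (unsigned VF = Start; VF < End; VF *= 2) // operator++ multiplies VF by 2
    VFs.push_back(VF);
  return VFs;
}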
+ class iterator + : public iterator_facade_base<iterator, std::forward_iterator_tag, + ElementCount> { + ElementCount VF; + + public: + iterator(ElementCount VF) : VF(VF) {} + + bool operator==(const iterator &Other) const { return VF == Other.VF; } + + ElementCount operator*() const { return VF; } + + iterator &operator++() { + VF *= 2; + return *this; + } + }; + + iterator begin() { return iterator(Start); } + iterator end() { + assert(isPowerOf2_32(End.getKnownMinValue())); + return iterator(End); } }; @@ -248,7 +275,7 @@ struct VPTransformState { } bool hasAnyVectorValue(VPValue *Def) const { - return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end(); + return Data.PerPartOutput.contains(Def); } bool hasScalarValue(VPValue *Def, VPIteration Instance) { @@ -370,10 +397,6 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; - /// Holds recipes that may generate a poison value that is used after - /// vectorization, even when their operands are not poison. - SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; - /// The loop object for the current parent region, or nullptr. Loop *CurrentVectorLoop = nullptr; @@ -382,7 +405,11 @@ struct VPTransformState { /// /// This is currently only used to add no-alias metadata based on the /// memchecks. The actually versioning is performed manually. - std::unique_ptr<LoopVersioning> LVer; + LoopVersioning *LVer = nullptr; + + /// Map SCEVs to their expanded values. Populated when executing + /// VPExpandSCEVRecipes. + DenseMap<const SCEV *, Value *> ExpandedSCEVs; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -639,6 +666,10 @@ public: VPLiveOut(PHINode *Phi, VPValue *Op) : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + static inline bool classof(const VPUser *U) { + return U->getVPUserID() == VPUser::VPUserID::LiveOut; + } + /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply /// means we need to add the appropriate incoming value from the middle /// block as exiting edges from the scalar epilogue loop (if present) are @@ -654,6 +685,11 @@ public: } PHINode *getPhi() const { return Phi; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the VPLiveOut to \p O. + void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; +#endif }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -790,6 +826,7 @@ public: SLPLoad, SLPStore, ActiveLaneMask, + CalculateTripCountMinusVF, CanonicalIVIncrement, CanonicalIVIncrementNUW, // The next two are similar to the above, but instead increment the @@ -810,8 +847,10 @@ private: const std::string Name; /// Utility method serving execute(): generates a single instance of the - /// modeled instruction. - void generateInstruction(VPTransformState &State, unsigned Part); + /// modeled instruction. \returns the generated value for \p Part. + /// In some cases an existing value is returned rather than a generated + /// one. + Value *generateInstruction(VPTransformState &State, unsigned Part); protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } @@ -892,6 +931,7 @@ public: default: return false; case VPInstruction::ActiveLaneMask: + case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: case VPInstruction::CanonicalIVIncrementForPart: @@ -903,14 +943,169 @@ public: } }; +/// Class to record LLVM IR flag for a recipe along with it. 
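// [Editor's note: illustrative sketch, not part of this patch.]
// The VPRecipeWithIRFlags class declared just below records IR flags on the
// recipe itself so poison-generating flags can be dropped at planning time and
// the remaining flags re-applied to the widened instruction later. A minimal
// stand-alone analogue of the wrap-flag case (hypothetical type):
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
struct WrapFlagsSketch {
  bool HasNUW = false, HasNSW = false;
  void captureFrom(const llvm::OverflowingBinaryOperator &Op) {
    HasNUW = Op.hasNoUnsignedWrap();
    HasNSW = Op.hasNoSignedWrap();
  }
  void dropPoisonGenerating() { HasNUW = HasNSW = false; }
  // Only valid for add/sub/mul/shl, which carry nuw/nsw flags.
  void applyTo(llvm::Instruction &I) const {
    I.setHasNoUnsignedWrap(HasNUW);
    I.setHasNoSignedWrap(HasNSW);
  }
};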
+class VPRecipeWithIRFlags : public VPRecipeBase { + enum class OperationType : unsigned char { + OverflowingBinOp, + PossiblyExactOp, + GEPOp, + FPMathOp, + Other + }; + struct WrapFlagsTy { + char HasNUW : 1; + char HasNSW : 1; + }; + struct ExactFlagsTy { + char IsExact : 1; + }; + struct GEPFlagsTy { + char IsInBounds : 1; + }; + struct FastMathFlagsTy { + char AllowReassoc : 1; + char NoNaNs : 1; + char NoInfs : 1; + char NoSignedZeros : 1; + char AllowReciprocal : 1; + char AllowContract : 1; + char ApproxFunc : 1; + }; + + OperationType OpType; + + union { + WrapFlagsTy WrapFlags; + ExactFlagsTy ExactFlags; + GEPFlagsTy GEPFlags; + FastMathFlagsTy FMFs; + unsigned char AllFlags; + }; + +public: + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands) + : VPRecipeBase(SC, Operands) { + OpType = OperationType::Other; + AllFlags = 0; + } + + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands, + Instruction &I) + : VPRecipeWithIRFlags(SC, Operands) { + if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) { + OpType = OperationType::OverflowingBinOp; + WrapFlags.HasNUW = Op->hasNoUnsignedWrap(); + WrapFlags.HasNSW = Op->hasNoSignedWrap(); + } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) { + OpType = OperationType::PossiblyExactOp; + ExactFlags.IsExact = Op->isExact(); + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + OpType = OperationType::GEPOp; + GEPFlags.IsInBounds = GEP->isInBounds(); + } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) { + OpType = OperationType::FPMathOp; + FastMathFlags FMF = Op->getFastMathFlags(); + FMFs.AllowReassoc = FMF.allowReassoc(); + FMFs.NoNaNs = FMF.noNaNs(); + FMFs.NoInfs = FMF.noInfs(); + FMFs.NoSignedZeros = FMF.noSignedZeros(); + FMFs.AllowReciprocal = FMF.allowReciprocal(); + FMFs.AllowContract = FMF.allowContract(); + FMFs.ApproxFunc = FMF.approxFunc(); + } + } + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenSC || + R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + R->getVPDefID() == VPRecipeBase::VPReplicateSC; + } + + /// Drop all poison-generating flags. + void dropPoisonGeneratingFlags() { + // NOTE: This needs to be kept in-sync with + // Instruction::dropPoisonGeneratingFlags. + switch (OpType) { + case OperationType::OverflowingBinOp: + WrapFlags.HasNUW = false; + WrapFlags.HasNSW = false; + break; + case OperationType::PossiblyExactOp: + ExactFlags.IsExact = false; + break; + case OperationType::GEPOp: + GEPFlags.IsInBounds = false; + break; + case OperationType::FPMathOp: + FMFs.NoNaNs = false; + FMFs.NoInfs = false; + break; + case OperationType::Other: + break; + } + } + + /// Set the IR flags for \p I. 
+ void setFlags(Instruction *I) const { + switch (OpType) { + case OperationType::OverflowingBinOp: + I->setHasNoUnsignedWrap(WrapFlags.HasNUW); + I->setHasNoSignedWrap(WrapFlags.HasNSW); + break; + case OperationType::PossiblyExactOp: + I->setIsExact(ExactFlags.IsExact); + break; + case OperationType::GEPOp: + cast<GetElementPtrInst>(I)->setIsInBounds(GEPFlags.IsInBounds); + break; + case OperationType::FPMathOp: + I->setHasAllowReassoc(FMFs.AllowReassoc); + I->setHasNoNaNs(FMFs.NoNaNs); + I->setHasNoInfs(FMFs.NoInfs); + I->setHasNoSignedZeros(FMFs.NoSignedZeros); + I->setHasAllowReciprocal(FMFs.AllowReciprocal); + I->setHasAllowContract(FMFs.AllowContract); + I->setHasApproxFunc(FMFs.ApproxFunc); + break; + case OperationType::Other: + break; + } + } + + bool isInBounds() const { + assert(OpType == OperationType::GEPOp && + "recipe doesn't have inbounds flag"); + return GEPFlags.IsInBounds; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + FastMathFlags getFastMathFlags() const { + FastMathFlags Res; + Res.setAllowReassoc(FMFs.AllowReassoc); + Res.setNoNaNs(FMFs.NoNaNs); + Res.setNoInfs(FMFs.NoInfs); + Res.setNoSignedZeros(FMFs.NoSignedZeros); + Res.setAllowReciprocal(FMFs.AllowReciprocal); + Res.setAllowContract(FMFs.AllowContract); + Res.setApproxFunc(FMFs.ApproxFunc); + return Res; + } + + void printFlags(raw_ostream &O) const; +#endif +}; + /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase, public VPValue { +class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { + public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPDef::VPWidenSC, Operands), VPValue(this, &I) {} + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I) {} ~VPWidenRecipe() override = default; @@ -926,18 +1121,62 @@ public: #endif }; +/// VPWidenCastRecipe is a recipe to create vector cast instructions. +class VPWidenCastRecipe : public VPRecipeBase, public VPValue { + /// Cast instruction opcode. + Instruction::CastOps Opcode; + + /// Result type for the cast. + Type *ResultTy; + +public: + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + CastInst *UI = nullptr) + : VPRecipeBase(VPDef::VPWidenCastSC, Op), VPValue(this, UI), + Opcode(Opcode), ResultTy(ResultTy) { + assert((!UI || UI->getOpcode() == Opcode) && + "opcode of underlying cast doesn't match"); + assert((!UI || UI->getType() == ResultTy) && + "result type of underlying cast doesn't match"); + } + + ~VPWidenCastRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) + + /// Produce widened copies of the cast. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + Instruction::CastOps getOpcode() const { return Opcode; } + + /// Returns the result type of the cast. + Type *getResultType() const { return ResultTy; } +}; + /// A recipe for widening Call instructions. class VPWidenCallRecipe : public VPRecipeBase, public VPValue { /// ID of the vector intrinsic to call when widening the call. If set the /// Intrinsic::not_intrinsic, a library call will be used instead. 
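// [Editor's note: illustrative sketch, not part of this patch.]
// A widened call now resolves either to a vector intrinsic (VectorIntrinsicID)
// or to a library function chosen at planning time (the Variant member added
// just below, one per VF); execute() then simply calls whichever was recorded.
// Simplified, hypothetical helper for the library-call case:
#include "llvm/IR/IRBuilder.h"
#include <cassert>
static llvm::CallInst *emitWidenedLibCallSketch(llvm::IRBuilderBase &B,
                                                llvm::Function *Variant,
                                                llvm::ArrayRef<llvm::Value *> Args) {
  assert(Variant && "vector library variant must be chosen before execution");
  return B.CreateCall(Variant, Args);
}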
Intrinsic::ID VectorIntrinsicID; + /// If this recipe represents a library call, Variant stores a pointer to + /// the chosen function. There is a 1:1 mapping between a given VF and the + /// chosen vectorized variant, so there will be a different vplan for each + /// VF with a valid variant. + Function *Variant; public: template <typename IterT> VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments, - Intrinsic::ID VectorIntrinsicID) + Intrinsic::ID VectorIntrinsicID, + Function *Variant = nullptr) : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I), - VectorIntrinsicID(VectorIntrinsicID) {} + VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {} ~VPWidenCallRecipe() override = default; @@ -954,17 +1193,10 @@ public: }; /// A recipe for widening select instructions. -class VPWidenSelectRecipe : public VPRecipeBase, public VPValue { - - /// Is the condition of the select loop invariant? - bool InvariantCond; - -public: +struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue { template <typename IterT> - VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, - bool InvariantCond) - : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I), - InvariantCond(InvariantCond) {} + VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) + : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I) {} ~VPWidenSelectRecipe() override = default; @@ -978,29 +1210,38 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + VPValue *getCond() const { + return getOperand(0); + } + + bool isInvariantCond() const { + return getCond()->isDefinedOutsideVectorRegions(); + } }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeBase, public VPValue { - bool IsPtrLoopInvariant; - SmallBitVector IsIndexLoopInvariant; +class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { + bool isPointerLoopInvariant() const { + return getOperand(0)->isDefinedOutsideVectorRegions(); + } + + bool isIndexLoopInvariant(unsigned I) const { + return getOperand(I + 1)->isDefinedOutsideVectorRegions(); + } + + bool areAllOperandsInvariant() const { + return all_of(operands(), [](VPValue *Op) { + return Op->isDefinedOutsideVectorRegions(); + }); + } public: template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) - : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP), - IsIndexLoopInvariant(GEP->getNumIndices(), false) {} + : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP), + VPValue(this, GEP) {} - template <typename IterT> - VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, - Loop *OrigLoop) - : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP), - IsIndexLoopInvariant(GEP->getNumIndices(), false) { - IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); - for (auto Index : enumerate(GEP->indices())) - IsIndexLoopInvariant[Index.index()] = - OrigLoop->isLoopInvariant(Index.value().get()); - } ~VPWidenGEPRecipe() override = default; VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) @@ -1015,78 +1256,6 @@ public: #endif }; -/// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { - PHINode *IV; - const InductionDescriptor &IndDesc; - bool NeedsVectorIV; - -public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc, - bool NeedsVectorIV) - : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), - VPValue(this, IV), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} - - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc, - TruncInst *Trunc, bool NeedsVectorIV) - : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), - VPValue(this, Trunc), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} - - ~VPWidenIntOrFpInductionRecipe() override = default; - - VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) - - /// Generate the vectorized and scalarized versions of the phi node as - /// needed by their users. - void execute(VPTransformState &State) override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - - /// Returns the start value of the induction. - VPValue *getStartValue() { return getOperand(0); } - const VPValue *getStartValue() const { return getOperand(0); } - - /// Returns the step value of the induction. - VPValue *getStepValue() { return getOperand(1); } - const VPValue *getStepValue() const { return getOperand(1); } - - /// Returns the first defined value as TruncInst, if it is one or nullptr - /// otherwise. - TruncInst *getTruncInst() { - return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); - } - const TruncInst *getTruncInst() const { - return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); - } - - PHINode *getPHINode() { return IV; } - - /// Returns the induction descriptor for the recipe. - const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } - - /// Returns true if the induction is canonical, i.e. starting at 0 and - /// incremented by UF * VF (= the original IV is incremented by 1). - bool isCanonical() const; - - /// Returns the scalar type of the induction. - const Type *getScalarType() const { - const TruncInst *TruncI = getTruncInst(); - return TruncI ? TruncI->getType() : IV->getType(); - } - - /// Returns true if a vector phi needs to be created for the induction. - bool needsVectorIV() const { return NeedsVectorIV; } -}; - /// A pure virtual base class for all recipes modeling header phis, including /// phis for first order recurrences, pointer inductions and reductions. The /// start value is the first operand of the recipe and the incoming value from @@ -1112,9 +1281,9 @@ public: /// per-lane based on the canonical induction. class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { protected: - VPHeaderPHIRecipe(unsigned char VPDefID, PHINode *Phi, + VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, VPValue *Start = nullptr) - : VPRecipeBase(VPDefID, {}), VPValue(this, Phi) { + : VPRecipeBase(VPDefID, {}), VPValue(this, UnderlyingInstr) { if (Start) addOperand(Start); } @@ -1125,12 +1294,12 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. 
static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC && - B->getVPDefID() <= VPDef::VPLastPHISC; + B->getVPDefID() <= VPDef::VPLastHeaderPHISC; } static inline bool classof(const VPValue *V) { auto *B = V->getDefiningRecipe(); return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC && - B->getVPDefID() <= VPRecipeBase::VPLastPHISC; + B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC; } /// Generate the phi nodes. @@ -1154,17 +1323,92 @@ public: void setStartValue(VPValue *V) { setOperand(0, V); } /// Returns the incoming value from the loop backedge. - VPValue *getBackedgeValue() { + virtual VPValue *getBackedgeValue() { return getOperand(1); } /// Returns the backedge value as a recipe. The backedge value is guaranteed /// to be a recipe. - VPRecipeBase &getBackedgeRecipe() { + virtual VPRecipeBase &getBackedgeRecipe() { return *getBackedgeValue()->getDefiningRecipe(); } }; +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their vector values. +class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { + PHINode *IV; + TruncInst *Trunc; + const InductionDescriptor &IndDesc; + +public: + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, + const InductionDescriptor &IndDesc) + : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), + Trunc(nullptr), IndDesc(IndDesc) { + addOperand(Step); + } + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), + IV(IV), Trunc(Trunc), IndDesc(IndDesc) { + addOperand(Step); + } + + ~VPWidenIntOrFpInductionRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) + + /// Generate the vectorized and scalarized versions of the phi node as + /// needed by their users. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + VPValue *getBackedgeValue() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. + llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } + + VPRecipeBase &getBackedgeRecipe() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. + llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } + + /// Returns the step value of the induction. + VPValue *getStepValue() { return getOperand(1); } + const VPValue *getStepValue() const { return getOperand(1); } + + /// Returns the first defined value as TruncInst, if it is one or nullptr + /// otherwise. + TruncInst *getTruncInst() { return Trunc; } + const TruncInst *getTruncInst() const { return Trunc; } + + PHINode *getPHINode() { return IV; } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } + + /// Returns true if the induction is canonical, i.e. starting at 0 and + /// incremented by UF * VF (= the original IV is incremented by 1). + bool isCanonical() const; + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + return Trunc ? 
Trunc->getType() : IV->getType(); + } +}; + class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { const InductionDescriptor &IndDesc; @@ -1374,12 +1618,20 @@ public: class VPInterleaveRecipe : public VPRecipeBase { const InterleaveGroup<Instruction> *IG; + /// Indicates if the interleave group is in a conditional block and requires a + /// mask. bool HasMask = false; + /// Indicates if gaps between members of the group need to be masked out or if + /// unusued gaps can be loaded speculatively. + bool NeedsMaskForGaps = false; + public: VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask) - : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG) { + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps) + : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG), + NeedsMaskForGaps(NeedsMaskForGaps) { for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) { if (I->getType()->isVoidTy()) @@ -1490,28 +1742,21 @@ public: /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase, public VPValue { +class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue { /// Indicator if only a single replica per lane is needed. bool IsUniform; /// Indicator if the replicas are also predicated. bool IsPredicated; - /// Indicator if the scalar values should also be packed into a vector. - bool AlsoPack; - public: template <typename IterT> VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, - bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPDef::VPReplicateSC, Operands), VPValue(this, I), - IsUniform(IsUniform), IsPredicated(IsPredicated) { - // Retain the previous behavior of predicateInstructions(), where an - // insert-element of a predicated instruction got hoisted into the - // predicated basic block iff it was its only user. This is achieved by - // having predicated instructions also pack their values into a vector by - // default unless they have a replicated user which uses their scalar value. - AlsoPack = IsPredicated && !I->use_empty(); + bool IsUniform, VPValue *Mask = nullptr) + : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), + VPValue(this, I), IsUniform(IsUniform), IsPredicated(Mask) { + if (Mask) + addOperand(Mask); } ~VPReplicateRecipe() override = default; @@ -1523,8 +1768,6 @@ public: /// the \p State. void execute(VPTransformState &State) override; - void setAlsoPack(bool Pack) { AlsoPack = Pack; } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1533,8 +1776,6 @@ public: bool isUniform() const { return IsUniform; } - bool isPacked() const { return AlsoPack; } - bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. @@ -1550,6 +1791,17 @@ public: "Op must be an operand of the recipe"); return true; } + + /// Returns true if the recipe is used by a widened recipe via an intervening + /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed + /// in a vector. + bool shouldPack() const; + + /// Return the mask of a predicated VPReplicateRecipe. 
+ VPValue *getMask() { + assert(isPredicated() && "Trying to get the mask of a unpredicated recipe"); + return getOperand(getNumOperands() - 1); + } }; /// A recipe for generating conditional branches on the bits of a mask. @@ -1791,9 +2043,11 @@ public: return true; } - /// Check if the induction described by \p ID is canonical, i.e. has the same - /// start, step (of 1), and type as the canonical IV. - bool isCanonical(const InductionDescriptor &ID, Type *Ty) const; + /// Check if the induction described by \p Kind, /p Start and \p Step is + /// canonical, i.e. has the same start, step (of 1), and type as the + /// canonical IV. + bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start, + VPValue *Step, Type *Ty) const; }; /// A recipe for generating the active lane mask for the vector loop that is @@ -2156,13 +2410,19 @@ public: /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a /// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry -/// VPBlock. +/// VPBasicBlock. class VPlan { friend class VPlanPrinter; friend class VPSlotTracker; - /// Hold the single entry to the Hierarchical CFG of the VPlan. - VPBlockBase *Entry; + /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the + /// preheader of the vector loop. + VPBasicBlock *Entry; + + /// VPBasicBlock corresponding to the original preheader. Used to place + /// VPExpandSCEV recipes for expressions used during skeleton creation and the + /// rest of VPlan execution. + VPBasicBlock *Preheader; /// Holds the VFs applicable to this VPlan. SmallSetVector<ElementCount, 2> VFs; @@ -2174,10 +2434,6 @@ class VPlan { /// Holds the name of the VPlan, for printing. std::string Name; - /// Holds all the external definitions created for this VPlan. External - /// definitions must be immutable and hold a pointer to their underlying IR. - DenseMap<Value *, VPValue *> VPExternalDefs; - /// Represents the trip count of the original loop, for folding /// the tail. VPValue *TripCount = nullptr; @@ -2193,9 +2449,9 @@ class VPlan { /// VPlan. Value2VPValueTy Value2VPValue; - /// Contains all VPValues that been allocated by addVPValue directly and need - /// to be free when the plan's destructor is called. - SmallVector<VPValue *, 16> VPValuesToFree; + /// Contains all the external definitions created for this VPlan. External + /// definitions are VPValues that hold a pointer to their underlying IR. + SmallVector<VPValue *, 16> VPLiveInsToFree; /// Indicates whether it is safe use the Value2VPValue mapping or if the /// mapping cannot be used any longer, because it is stale. @@ -2204,14 +2460,41 @@ class VPlan { /// Values used outside the plan. MapVector<PHINode *, VPLiveOut *> LiveOuts; + /// Mapping from SCEVs to the VPValues representing their expansions. + /// NOTE: This mapping is temporary and will be removed once all users have + /// been modeled in VPlan directly. + DenseMap<const SCEV *, VPValue *> SCEVToExpansion; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { - if (Entry) - Entry->setPlan(this); + /// Construct a VPlan with original preheader \p Preheader, trip count \p TC + /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to + /// be disconnected, as the bypass blocks between them are not yet modeled in + /// VPlan. 
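// [Editor's note: illustrative sketch, not part of this patch.]
// The constructors below take an original-preheader block and an entry block
// that must still be disconnected; createInitialVPlan (defined in VPlan.cpp
// earlier in this patch) builds exactly this shape. Sketch using the
// declarations from this header (hypothetical helper name):
static VPlanPtr makeSkeletonSketch() {
  VPBasicBlock *Ph    = new VPBasicBlock("ph");        // holds VPExpandSCEVRecipes
  VPBasicBlock *VecPh = new VPBasicBlock("vector.ph"); // entry of the plan
  return std::make_unique<VPlan>(Ph, VecPh);           // both blocks now owned by the plan
}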
+ VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry) + : VPlan(Preheader, Entry) { + TripCount = TC; + } + + /// Construct a VPlan with original preheader \p Preheader and \p Entry to + /// the plan. At the moment, \p Preheader and \p Entry need to be + /// disconnected, as the bypass blocks between them are not yet modeled in + /// VPlan. + VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry) + : Entry(Entry), Preheader(Preheader) { + Entry->setPlan(this); + Preheader->setPlan(this); + assert(Preheader->getNumSuccessors() == 0 && + Preheader->getNumPredecessors() == 0 && + "preheader must be disconnected"); } ~VPlan(); + /// Create an initial VPlan with preheader and entry blocks. Creates a + /// VPExpandSCEVRecipe for \p TripCount and uses it as plan's trip count. + static VPlanPtr createInitialVPlan(const SCEV *TripCount, + ScalarEvolution &PSE); + /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State, @@ -2220,19 +2503,12 @@ public: /// Generate the IR code for this VPlan. void execute(VPTransformState *State); - VPBlockBase *getEntry() { return Entry; } - const VPBlockBase *getEntry() const { return Entry; } - - VPBlockBase *setEntry(VPBlockBase *Block) { - Entry = Block; - Block->setPlan(this); - return Entry; - } + VPBasicBlock *getEntry() { return Entry; } + const VPBasicBlock *getEntry() const { return Entry; } /// The trip count of the original loop. - VPValue *getOrCreateTripCount() { - if (!TripCount) - TripCount = new VPValue(); + VPValue *getTripCount() const { + assert(TripCount && "trip count needs to be set before accessing it"); return TripCount; } @@ -2275,50 +2551,35 @@ public: void setName(const Twine &newName) { Name = newName.str(); } - /// Get the existing or add a new external definition for \p V. - VPValue *getOrAddExternalDef(Value *V) { - auto I = VPExternalDefs.insert({V, nullptr}); - if (I.second) - I.first->second = new VPValue(V); - return I.first->second; - } - - void addVPValue(Value *V) { - assert(Value2VPValueEnabled && - "IR value to VPValue mapping may be out of date!"); - assert(V && "Trying to add a null Value to VPlan"); - assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - VPValue *VPV = new VPValue(V); - Value2VPValue[V] = VPV; - VPValuesToFree.push_back(VPV); - } - void addVPValue(Value *V, VPValue *VPV) { - assert(Value2VPValueEnabled && "Value2VPValue mapping may be out of date!"); + assert((Value2VPValueEnabled || VPV->isLiveIn()) && + "Value2VPValue mapping may be out of date!"); assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); Value2VPValue[V] = VPV; } /// Returns the VPValue for \p V. \p OverrideAllowed can be used to disable - /// checking whether it is safe to query VPValues using IR Values. + /// /// checking whether it is safe to query VPValues using IR Values. VPValue *getVPValue(Value *V, bool OverrideAllowed = false) { - assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) && - "Value2VPValue mapping may be out of date!"); assert(V && "Trying to get the VPValue of a null Value"); assert(Value2VPValue.count(V) && "Value does not exist in VPlan"); + assert((Value2VPValueEnabled || OverrideAllowed || + Value2VPValue[V]->isLiveIn()) && + "Value2VPValue mapping may be out of date!"); return Value2VPValue[V]; } - /// Gets the VPValue or adds a new one (if none exists yet) for \p V. 
\p - /// OverrideAllowed can be used to disable checking whether it is safe to - /// query VPValues using IR Values. - VPValue *getOrAddVPValue(Value *V, bool OverrideAllowed = false) { - assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) && - "Value2VPValue mapping may be out of date!"); + /// Gets the VPValue for \p V or adds a new live-in (if none exists yet) for + /// \p V. + VPValue *getVPValueOrAddLiveIn(Value *V) { assert(V && "Trying to get or add the VPValue of a null Value"); - if (!Value2VPValue.count(V)) - addVPValue(V); + if (!Value2VPValue.count(V)) { + VPValue *VPV = new VPValue(V); + VPLiveInsToFree.push_back(VPV); + addVPValue(V, VPV); + } + return getVPValue(V); } @@ -2344,7 +2605,7 @@ public: iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> mapToVPValues(User::op_range Operands) { std::function<VPValue *(Value *)> Fn = [this](Value *Op) { - return getOrAddVPValue(Op); + return getVPValueOrAddLiveIn(Op); }; return map_range(Operands, Fn); } @@ -2373,12 +2634,6 @@ public: void addLiveOut(PHINode *PN, VPValue *V); - void clearLiveOuts() { - for (auto &KV : LiveOuts) - delete KV.second; - LiveOuts.clear(); - } - void removeLiveOut(PHINode *PN) { delete LiveOuts[PN]; LiveOuts.erase(PN); @@ -2388,6 +2643,19 @@ public: return LiveOuts; } + VPValue *getSCEVExpansion(const SCEV *S) const { + return SCEVToExpansion.lookup(S); + } + + void addSCEVExpansion(const SCEV *S, VPValue *V) { + assert(!SCEVToExpansion.contains(S) && "SCEV already expanded"); + SCEVToExpansion[S] = V; + } + + /// \return The block corresponding to the original preheader. + VPBasicBlock *getPreheader() { return Preheader; } + const VPBasicBlock *getPreheader() const { return Preheader; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2709,6 +2977,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { assert(Def && "Must have definition for value defined inside vector region"); if (auto Rep = dyn_cast<VPReplicateRecipe>(Def)) return Rep->isUniform(); + if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def)) + return all_of(GEP->operands(), isUniformAfterVectorization); return false; } } // end namespace vputils diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h index f790f7e73e11..89e2e7514dac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h +++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h @@ -13,6 +13,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H #include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 952ce72e36c1..f6e3a2a16db8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -73,9 +73,8 @@ public: PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) : TheLoop(Lp), LI(LI), Plan(P) {} - /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected - /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. - VPBasicBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop and connects it to Plan's entry. 
+ void buildPlainCFG(); }; } // anonymous namespace @@ -196,7 +195,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) { // A and B: Create VPValue and add it to the pool of external definitions and // to the Value->VPValue map. - VPValue *NewVPVal = Plan.getOrAddExternalDef(IRVal); + VPValue *NewVPVal = Plan.getVPValueOrAddLiveIn(IRVal); IRDef2VPValue[IRVal] = NewVPVal; return NewVPVal; } @@ -254,7 +253,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } // Main interface to build the plain CFG. -VPBasicBlock *PlainCFGBuilder::buildPlainCFG() { +void PlainCFGBuilder::buildPlainCFG() { // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for // each BB and link it to its successor and predecessor VPBBs. Note that @@ -267,12 +266,13 @@ VPBasicBlock *PlainCFGBuilder::buildPlainCFG() { BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && "Unexpected loop preheader"); - VPBasicBlock *ThePreheaderVPBB = getOrCreateVPBB(ThePreheaderBB); + VPBasicBlock *ThePreheaderVPBB = Plan.getEntry(); + BB2VPBB[ThePreheaderBB] = ThePreheaderVPBB; ThePreheaderVPBB->setName("vector.ph"); for (auto &I : *ThePreheaderBB) { if (I.getType()->isVoidTy()) continue; - IRDef2VPValue[&I] = Plan.getOrAddExternalDef(&I); + IRDef2VPValue[&I] = Plan.getVPValueOrAddLiveIn(&I); } // Create empty VPBB for Loop H so that we can link PH->H. VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader()); @@ -371,20 +371,17 @@ VPBasicBlock *PlainCFGBuilder::buildPlainCFG() { // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding // VPlan operands. fixPhiNodes(); - - return ThePreheaderVPBB; } -VPBasicBlock *VPlanHCFGBuilder::buildPlainCFG() { +void VPlanHCFGBuilder::buildPlainCFG() { PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan); - return PCFGBuilder.buildPlainCFG(); + PCFGBuilder.buildPlainCFG(); } // Public interface to build a H-CFG. void VPlanHCFGBuilder::buildHierarchicalCFG() { - // Build Top Region enclosing the plain CFG and set it as VPlan entry. - VPBasicBlock *EntryVPBB = buildPlainCFG(); - Plan.setEntry(EntryVPBB); + // Build Top Region enclosing the plain CFG. + buildPlainCFG(); LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 2d52990af268..299ae36155cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -57,9 +57,8 @@ private: // are introduced. VPDominatorTree VPDomTree; - /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected - /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. - VPBasicBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop and connects it to Plan's entry. 
+ void buildPlainCFG(); public: VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4e9be35001ad..26c309eed800 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -34,7 +34,9 @@ using namespace llvm; using VectorParts = SmallVector<Value *, 2>; +namespace llvm { extern cl::opt<bool> EnableVPlanNativePath; +} #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -50,14 +52,16 @@ bool VPRecipeBase::mayWriteToMemory() const { ->mayWriteToMemory(); case VPBranchOnMaskSC: case VPScalarIVStepsSC: + case VPPredInstPHISC: return false; - case VPWidenIntOrFpInductionSC: + case VPBlendSC: + case VPReductionSC: case VPWidenCanonicalIVSC: + case VPWidenCastSC: + case VPWidenGEPSC: + case VPWidenIntOrFpInductionSC: case VPWidenPHISC: - case VPBlendSC: case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -82,14 +86,16 @@ bool VPRecipeBase::mayReadFromMemory() const { ->mayReadFromMemory(); case VPBranchOnMaskSC: case VPScalarIVStepsSC: + case VPPredInstPHISC: return false; - case VPWidenIntOrFpInductionSC: + case VPBlendSC: + case VPReductionSC: case VPWidenCanonicalIVSC: + case VPWidenCastSC: + case VPWidenGEPSC: + case VPWidenIntOrFpInductionSC: case VPWidenPHISC: - case VPBlendSC: case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -108,16 +114,20 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPPredInstPHISC: return false; - case VPWidenIntOrFpInductionSC: - case VPWidenPointerInductionSC: + case VPWidenCallSC: + return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) + ->mayHaveSideEffects(); + case VPBlendSC: + case VPReductionSC: + case VPScalarIVStepsSC: case VPWidenCanonicalIVSC: + case VPWidenCastSC: + case VPWidenGEPSC: + case VPWidenIntOrFpInductionSC: case VPWidenPHISC: - case VPBlendSC: + case VPWidenPointerInductionSC: case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: - case VPScalarIVStepsSC: { + case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); (void)I; @@ -125,6 +135,13 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPWidenMemoryInstructionSC: + assert(cast<VPWidenMemoryInstructionRecipe>(this) + ->getIngredient() + .mayHaveSideEffects() == mayWriteToMemory() && + "mayHaveSideffects result for ingredient differs from this " + "implementation"); + return mayWriteToMemory(); case VPReplicateSC: { auto *R = cast<VPReplicateRecipe>(this); return R->getUnderlyingInstr()->mayHaveSideEffects(); @@ -143,6 +160,16 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { State.Builder.GetInsertBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { + O << "Live-out "; + getPhi()->printAsOperand(O); + O << " = "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; +} +#endif + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -189,55 +216,44 @@ void 
VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } -void VPInstruction::generateInstruction(VPTransformState &State, - unsigned Part) { +Value *VPInstruction::generateInstruction(VPTransformState &State, + unsigned Part) { IRBuilderBase &Builder = State.Builder; Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - Value *V = - Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); - State.set(this, V, Part); - return; + return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); } switch (getOpcode()) { case VPInstruction::Not: { Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A, Name); - State.set(this, V, Part); - break; + return Builder.CreateNot(A, Name); } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC, Name); - State.set(this, V, Part); - break; + return Builder.CreateICmpULE(IV, TC, Name); } case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); - State.set(this, V, Part); - break; + return Builder.CreateSelect(Cond, Op1, Op2, Name); } case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), Part); + Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); auto *PredTy = VectorType::get(Int1Ty, State.VF); - Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, Name); - State.set(this, Call, Part); - break; + return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, Name); } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. @@ -255,18 +271,22 @@ void VPInstruction::generateInstruction(VPTransformState &State, // For the first part, use the recurrence phi (v1), otherwise v2. auto *V1 = State.get(getOperand(0), 0); Value *PartMinus1 = Part == 0 ? 
V1 : State.get(getOperand(1), Part - 1); - if (!PartMinus1->getType()->isVectorTy()) { - State.set(this, PartMinus1, Part); - } else { - Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), - Part); - } - break; + if (!PartMinus1->getType()->isVectorTy()) + return PartMinus1; + Value *V2 = State.get(getOperand(1), Part); + return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); + } + case VPInstruction::CalculateTripCountMinusVF: { + Value *ScalarTC = State.get(getOperand(0), {0, 0}); + Value *Step = + createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); + Value *Sub = Builder.CreateSub(ScalarTC, Step); + Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); + Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); + return Builder.CreateSelect(Cmp, Sub, Zero); } case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: { - Value *Next = nullptr; if (Part == 0) { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; auto *Phi = State.get(getOperand(0), 0); @@ -274,34 +294,26 @@ void VPInstruction::generateInstruction(VPTransformState &State, // elements) times the unroll factor (num of SIMD instructions). Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); - } else { - Next = State.get(this, 0); + return Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } - - State.set(this, Next, Part); - break; + return State.get(this, 0); } case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::CanonicalIVIncrementForPartNUW: { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; auto *IV = State.get(getOperand(0), VPIteration(0, 0)); - if (Part == 0) { - State.set(this, IV, Part); - break; - } + if (Part == 0) + return IV; // The canonical IV is incremented by the vectorization factor (num of SIMD // elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); - State.set(this, Next, Part); - break; + return Builder.CreateAdd(IV, Step, Name, IsNUW, false); } case VPInstruction::BranchOnCond: { if (Part != 0) - break; + return nullptr; Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); VPRegionBlock *ParentRegion = getParent()->getParent(); @@ -318,11 +330,11 @@ void VPInstruction::generateInstruction(VPTransformState &State, CondBr->setSuccessor(0, nullptr); Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - break; + return CondBr; } case VPInstruction::BranchOnCount: { if (Part != 0) - break; + return nullptr; // First create the compare. 
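// [Editor's note: illustrative sketch, not part of this patch.]
// Scalar semantics of the CalculateTripCountMinusVF sequence generated above
// (printed later in this patch as "TC > VF ? TC - VF : 0"), where Step is
// VF * UF (hypothetical helper):
#include <cstdint>
static uint64_t tripCountMinusVFSketch(uint64_t TC, uint64_t Step) {
  return TC > Step ? TC - Step : 0; // matches the icmp ugt + sub + select above
}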
Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); @@ -342,7 +354,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.CFG.VPBB2IRBB[Header]); CondBr->setSuccessor(0, nullptr); Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - break; + return CondBr; } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -353,8 +365,13 @@ void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(FMF); - for (unsigned Part = 0; Part < State.UF; ++Part) - generateInstruction(State, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *GeneratedValue = generateInstruction(State, Part); + if (!hasResult()) + continue; + assert(GeneratedValue && "generateInstruction must produce a value"); + State.set(this, GeneratedValue, Part); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -400,6 +417,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CalculateTripCountMinusVF: + O << "TC > VF ? TC - VF : 0"; + break; case VPInstruction::CanonicalIVIncrementForPart: O << "VF * Part + "; break; @@ -438,18 +458,19 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { } void VPWidenCallRecipe::execute(VPTransformState &State) { + assert(State.VF.isVector() && "not widening"); auto &CI = *cast<CallInst>(getUnderlyingInstr()); assert(!isa<DbgInfoIntrinsic>(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); State.setDebugLocFromInst(&CI); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI.args()) - Tys.push_back( - ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue())); - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector<Type *, 2> TysForDecl = {CI.getType()}; + SmallVector<Type *, 2> TysForDecl; + // Add return type if intrinsic is overloaded on it. + if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) { + TysForDecl.push_back( + VectorType::get(CI.getType()->getScalarType(), State.VF)); + } SmallVector<Value *, 4> Args; for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a @@ -468,21 +489,16 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { Function *VectorF; if (VectorIntrinsicID != Intrinsic::not_intrinsic) { // Use vector version of the intrinsic. - if (State.VF.isVector()) - TysForDecl[0] = - VectorType::get(CI.getType()->getScalarType(), State.VF); Module *M = State.Builder.GetInsertBlock()->getModule(); VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { - // Use vector version of the function call. 
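// [Editor's note: illustrative sketch, not part of this patch.]
// For the intrinsic path above, the widened declaration is obtained by
// substituting the vector type for the overloaded return type, e.g. turning
// llvm.fabs.f32 into llvm.fabs.v4f32 (fabs chosen purely as an example;
// hypothetical helper):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
static llvm::Function *widenedFabsDeclSketch(llvm::Module &M) {
  llvm::Type *VecTy =
      llvm::VectorType::get(llvm::Type::getFloatTy(M.getContext()), 4,
                            /*Scalable=*/false);
  return llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::fabs, {VecTy});
}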
- const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/); #ifndef NDEBUG - assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr && - "Can't create vector function."); + assert(Variant != nullptr && "Can't create vector function."); #endif - VectorF = VFDatabase(CI).getVectorizedFunction(Shape); + VectorF = Variant; } + SmallVector<OperandBundleDef, 1> OpBundles; CI.getOperandBundlesAsDefs(OpBundles); CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); @@ -514,8 +530,12 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, if (VectorIntrinsicID) O << " (using vector intrinsic)"; - else - O << " (using library function)"; + else { + O << " (using library function"; + if (Variant->hasName()) + O << ": " << Variant->getName(); + O << ")"; + } } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, @@ -528,7 +548,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(1)->printAsOperand(O, SlotTracker); O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? " (condition is loop invariant)" : ""); + O << (isInvariantCond() ? " (condition is loop invariant)" : ""); } #endif @@ -541,10 +561,10 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. auto *InvarCond = - InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); + Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part); Value *Op0 = State.get(getOperand(1), Part); Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); @@ -553,6 +573,33 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { + switch (OpType) { + case OperationType::PossiblyExactOp: + if (ExactFlags.IsExact) + O << " exact"; + break; + case OperationType::OverflowingBinOp: + if (WrapFlags.HasNUW) + O << " nuw"; + if (WrapFlags.HasNSW) + O << " nsw"; + break; + case OperationType::FPMathOp: + getFastMathFlags().print(O); + break; + case OperationType::GEPOp: + if (GEPFlags.IsInBounds) + O << " inbounds"; + break; + case OperationType::Other: + break; + } + O << " "; +} +#endif + void VPWidenRecipe::execute(VPTransformState &State) { auto &I = *cast<Instruction>(getUnderlyingValue()); auto &Builder = State.Builder; @@ -592,17 +639,8 @@ void VPWidenRecipe::execute(VPTransformState &State) { Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - if (auto *VecOp = dyn_cast<Instruction>(V)) { - VecOp->copyIRFlags(&I); - - // If the instruction is vectorized and was in a basic block that needed - // predication, we can't propagate poison-generating flags (nuw/nsw, - // exact, etc.). The control flow has been linearized and the - // instruction is no longer guarded by the predicate, which could make - // the flag properties to no longer hold. - if (State.MayGeneratePoisonRecipes.contains(this)) - VecOp->dropPoisonGeneratingFlags(); - } + if (auto *VecOp = dyn_cast<Instruction>(V)) + setFlags(VecOp); // Use this vector value for all users of the original instruction. 
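// Illustration (not part of the patch): the flag groups that printFlags()
// distinguishes map onto the usual llvm::Instruction setters. A hypothetical
// helper showing how such preserved flags would be re-applied to a freshly
// created instruction (this is not the VPRecipeWithIRFlags API itself):
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

void applyPreservedFlags(Instruction *I, bool NUW, bool NSW, bool Exact,
                         FastMathFlags FMF, bool InBounds) {
  if (isa<OverflowingBinaryOperator>(I)) {
    I->setHasNoUnsignedWrap(NUW);
    I->setHasNoSignedWrap(NSW);
  } else if (isa<PossiblyExactOperator>(I)) {
    I->setIsExact(Exact);
  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
    GEP->setIsInBounds(InBounds);
  } else if (isa<FPMathOperator>(I)) {
    I->setFastMathFlags(FMF);
  }
}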
State.set(this, V, Part); @@ -646,35 +684,6 @@ void VPWidenRecipe::execute(VPTransformState &State) { break; } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - State.setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = (State.VF.isScalar()) - ? CI->getType() - : VectorType::get(CI->getType(), State.VF); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(this, Cast, Part); - State.addMetadata(Cast, &I); - } - break; - } default: // This instruction is not vectorized by simple widening. LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); @@ -687,10 +696,39 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); const Instruction *UI = getUnderlyingInstr(); - O << " = " << UI->getOpcodeName() << " "; + O << " = " << UI->getOpcodeName(); + printFlags(O); if (auto *Cmp = dyn_cast<CmpInst>(UI)) - O << CmpInst::getPredicateName(Cmp->getPredicate()) << " "; + O << Cmp->getPredicate() << " "; + printOperands(O, SlotTracker); +} +#endif + +void VPWidenCastRecipe::execute(VPTransformState &State) { + auto *I = cast_or_null<Instruction>(getUnderlyingValue()); + if (I) + State.setDebugLocFromInst(I); + auto &Builder = State.Builder; + /// Vectorize casts. + assert(State.VF.isVector() && "Not vectorizing?"); + Type *DestTy = VectorType::get(getResultType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); + State.set(this, Cast, Part); + State.addMetadata(Cast, I); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-CAST "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(Opcode) << " "; printOperands(O, SlotTracker); + O << " to " << *getResultType(); } void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, @@ -710,8 +748,13 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + // The step may be defined by a recipe in the preheader (e.g. if it requires + // SCEV expansion), but for the canonical induction the step is required to be + // 1, which is represented as live-in. 
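// Illustration (not part of the patch): VPWidenCastRecipe now owns what used to
// be the cast arm of VPWidenRecipe; per part it emits one vector cast to the
// widened result type. A standalone sketch of that shape (helper name is
// illustrative):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *widenCast(IRBuilderBase &B, Instruction::CastOps Opcode, Value *VecSrc,
                 Type *ScalarResultTy, ElementCount VF) {
  Type *DestTy = VectorType::get(ScalarResultTy, VF);
  return B.CreateCast(Opcode, VecSrc, DestTy);
}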
+ if (getStepValue()->getDefiningRecipe()) + return false; + auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); return StartC && StartC->isZero() && StepC && StepC->isOne(); } @@ -743,6 +786,7 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenGEPRecipe::execute(VPTransformState &State) { + assert(State.VF.isVector() && "not widening"); auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -750,7 +794,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (areAllOperandsInvariant()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -763,9 +807,15 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // required. We would add the scalarization decision to // collectLoopScalars() and teach getVectorValue() to broadcast // the lane-zero scalar value. - auto *Clone = State.Builder.Insert(GEP->clone()); + SmallVector<Value *> Ops; + for (unsigned I = 0, E = getNumOperands(); I != E; I++) + Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); + + auto *NewGEP = + State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], + ArrayRef(Ops).drop_front(), "", isInBounds()); for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); State.set(this, EntryPart, Part); State.addMetadata(EntryPart, GEP); } @@ -780,7 +830,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant + auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPIteration(0, 0)) : State.get(getOperand(0), Part); @@ -789,24 +839,16 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { SmallVector<Value *, 4> Indices; for (unsigned I = 1, E = getNumOperands(); I < E; I++) { VPValue *Operand = getOperand(I); - if (IsIndexLoopInvariant[I - 1]) + if (isIndexLoopInvariant(I - 1)) Indices.push_back(State.get(Operand, VPIteration(0, 0))); else Indices.push_back(State.get(Operand, Part)); } - // If the GEP instruction is vectorized and was in a basic block that - // needed predication, we can't propagate the poison-generating 'inbounds' - // flag. The control flow has been linearized and the GEP is no longer - // guarded by the predicate, which could make the 'inbounds' properties to - // no longer hold. - bool IsInBounds = - GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. 
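// Illustration (not part of the patch): when every GEP operand is loop invariant,
// the recipe now rebuilds one scalar GEP from lane-0 operand values and broadcasts
// it, instead of cloning the original instruction. A standalone sketch of that
// shape (helper name is illustrative):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *buildInvariantWideGEP(IRBuilderBase &B, Type *SrcElemTy, Value *Ptr,
                             ArrayRef<Value *> Indices, ElementCount VF,
                             bool InBounds) {
  Value *ScalarGEP = B.CreateGEP(SrcElemTy, Ptr, Indices, "", InBounds);
  return B.CreateVectorSplat(VF, ScalarGEP);
}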
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, - Indices, "", IsInBounds); + Indices, "", isInBounds()); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP, Part); @@ -819,14 +861,14 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; - O << (IsPtrLoopInvariant ? "Inv" : "Var"); - size_t IndicesNumber = IsIndexLoopInvariant.size(); - for (size_t I = 0; I < IndicesNumber; ++I) - O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; + O << (isPointerLoopInvariant() ? "Inv" : "Var"); + for (size_t I = 0; I < getNumOperands() - 1; ++I) + O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; O << " "; printAsOperand(O, SlotTracker); - O << " = getelementptr "; + O << " = getelementptr"; + printFlags(O); printOperands(O, SlotTracker); } #endif @@ -911,7 +953,21 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << " (with final reduction value stored in invariant address sank " "outside of loop)"; } +#endif + +bool VPReplicateRecipe::shouldPack() const { + // Find if the recipe is used by a widened recipe via an intervening + // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. + return any_of(users(), [](const VPUser *U) { + if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) + return any_of(PredR->users(), [PredR](const VPUser *U) { + return !U->usesScalars(PredR); + }); + return false; + }); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); @@ -921,18 +977,21 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, O << " = "; } if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { - O << "call @" << CB->getCalledFunction()->getName() << "("; + O << "call"; + printFlags(O); + O << "@" << CB->getCalledFunction()->getName() << "("; interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), O, [&O, &SlotTracker](VPValue *Op) { Op->printAsOperand(O, SlotTracker); }); O << ")"; } else { - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); + printFlags(O); printOperands(O, SlotTracker); } - if (AlsoPack) + if (shouldPack()) O << " (S->V)"; } #endif @@ -1053,20 +1112,22 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -bool VPCanonicalIVPHIRecipe::isCanonical(const InductionDescriptor &ID, - Type *Ty) const { - if (Ty != getScalarType()) +bool VPCanonicalIVPHIRecipe::isCanonical( + InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step, + Type *Ty) const { + // The types must match and it must be an integer induction. + if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction) return false; - // The start value of ID must match the start value of this canonical - // induction. - if (getStartValue()->getLiveInIRValue() != ID.getStartValue()) + // Start must match the start value of this canonical induction. + if (Start != getStartValue()) return false; - ConstantInt *Step = ID.getConstIntStepValue(); - // ID must also be incremented by one. IK_IntInduction always increment the - // induction by Step, but the binary op may not be set. 
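// Illustration (not part of the patch) of the nested any_of test behind
// VPReplicateRecipe::shouldPack(), using hypothetical stand-in types (Node,
// PhiHop, UsesScalars) since the VPlan classes are internal:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

namespace {
struct Node {
  llvm::SmallVector<Node *, 4> Users;
  bool PhiHop = false;     // stands in for isa<VPPredInstPHIRecipe>
  bool UsesScalars = true; // stands in for VPUser::usesScalars()
};

bool shouldPack(const Node &Def) {
  // Pack if some phi-hop user in turn has a user that needs the vector form.
  return llvm::any_of(Def.Users, [](const Node *U) {
    return U->PhiHop && llvm::any_of(U->Users, [](const Node *UU) {
             return !UU->UsesScalars;
           });
  });
}
} // namespace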
- return ID.getKind() == InductionDescriptor::IK_IntInduction && Step && - Step->isOne(); + // If the step is defined by a recipe, it is not a ConstantInt. + if (Step->getDefiningRecipe()) + return false; + + ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); + return StepC && StepC->isOne(); } bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { @@ -1092,9 +1153,11 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) { Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), &*State.Builder.GetInsertPoint()); - + assert(!State.ExpandedSCEVs.contains(Expr) && + "Same SCEV expanded multiple times"); + State.ExpandedSCEVs[Expr] = Res; for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, Res, Part); + State.set(this, Res, {Part, 0}); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cbf111b00e3d..83bfdfd09d19 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "VPlanTransforms.h" +#include "VPlanDominatorTree.h" +#include "VPRecipeBuilder.h" #include "VPlanCFG.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" @@ -22,11 +24,10 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, + VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE, - const TargetLibraryInfo &TLI) { + ScalarEvolution &SE, const TargetLibraryInfo &TLI) { ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( Plan->getEntry()); @@ -39,22 +40,15 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *VPV = Ingredient.getVPSingleValue(); Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue()); - if (DeadInstructions.count(Inst)) { - VPValue DummyValue; - VPV->replaceAllUsesWith(&DummyValue); - Ingredient.eraseFromParent(); - continue; - } VPRecipeBase *NewRecipe = nullptr; if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { - VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = - new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -66,28 +60,25 @@ void VPlanTransforms::VPInstructionsToVPRecipes( // Create VPWidenMemoryInstructionRecipe for loads and stores. 
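// Illustration (not part of the patch): State.ExpandedSCEVs records each
// expansion so later consumers (and the assert above) can rely on a single
// materialisation per expression. A memoising sketch of the same idea with a
// caller-owned cache and SCEVExpander (helper name is illustrative):
#include "llvm/ADT/DenseMap.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

Value *expandOnce(DenseMap<const SCEV *, Value *> &Cache, SCEVExpander &Exp,
                  const SCEV *Expr, Instruction *InsertPt) {
  auto It = Cache.find(Expr);
  if (It != Cache.end())
    return It->second;
  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), InsertPt);
  Cache[Expr] = Res;
  return Res;
}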
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( - *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + *Load, Ingredient.getOperand(0), nullptr /*Mask*/, + false /*Consecutive*/, false /*Reverse*/); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( - *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), - Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/); + *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), + nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { - NewRecipe = new VPWidenGEPRecipe( - GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); + NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { NewRecipe = - new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()), + new VPWidenCallRecipe(*CI, drop_end(Ingredient.operands()), getVectorIntrinsicIDForCall(CI, &TLI)); } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) { - bool InvariantCond = - SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop); - NewRecipe = new VPWidenSelectRecipe( - *SI, Plan->mapToVPValues(SI->operands()), InvariantCond); + NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); + } else if (auto *CI = dyn_cast<CastInst>(Inst)) { + NewRecipe = new VPWidenCastRecipe( + CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI); } else { - NewRecipe = - new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); + NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); } } @@ -98,15 +89,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes( assert(NewRecipe->getNumDefinedValues() == 0 && "Only recpies with zero or one defined values expected"); Ingredient.eraseFromParent(); - Plan->removeVPValueFor(Inst); - for (auto *Def : NewRecipe->definedValues()) { - Plan->addVPValue(Inst, Def); - } } } } -bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { +static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); bool Changed = false; // First, collect the operands of all recipes in replicate blocks as seeds for @@ -167,8 +154,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { continue; Instruction *I = cast<Instruction>( cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue()); - auto *Clone = - new VPReplicateRecipe(I, SinkCandidate->operands(), true, false); + auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true); // TODO: add ".cloned" suffix to name of Clone's VPValue. Clone->insertBefore(SinkCandidate); @@ -224,7 +210,10 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { return nullptr; } -bool VPlanTransforms::mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { +// Merge replicate regions in their successor region, if a replicate region +// is connected to a successor replicate region with the same predicate by a +// single, empty VPBasicBlock. 
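// Illustration (not part of the patch): the recipe choice above is a plain
// dyn_cast dispatch over the underlying instruction. Classification only,
// since the recipe classes themselves are VPlan-internal:
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void classifyIngredient(const Instruction *Inst) {
  if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
    errs() << "widen-memory recipe (operands taken from the ingredient)\n";
  else if (isa<GetElementPtrInst>(Inst))
    errs() << "widen-GEP recipe\n";
  else if (isa<CallInst>(Inst))
    errs() << "widen-call recipe (callee operand dropped via drop_end)\n";
  else if (isa<SelectInst>(Inst))
    errs() << "widen-select recipe\n";
  else if (isa<CastInst>(Inst))
    errs() << "widen-cast recipe (keeps opcode and result type)\n";
  else
    errs() << "generic widen recipe\n";
}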
+static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { SetVector<VPRegionBlock *> DeletedRegions; // Collect replicate regions followed by an empty block, followed by another @@ -312,6 +301,81 @@ bool VPlanTransforms::mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { return !DeletedRegions.empty(); } +static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, + VPlan &Plan) { + Instruction *Instr = PredRecipe->getUnderlyingInstr(); + // Build the triangular if-then region. + std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); + assert(Instr->getParent() && "Predicated instruction not in any basic block"); + auto *BlockInMask = PredRecipe->getMask(); + auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); + auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + + // Replace predicated replicate recipe with a replicate recipe without a + // mask but in the replicate region. + auto *RecipeWithoutMask = new VPReplicateRecipe( + PredRecipe->getUnderlyingInstr(), + make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), + PredRecipe->isUniform()); + auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); + + VPPredInstPHIRecipe *PHIRecipe = nullptr; + if (PredRecipe->getNumUsers() != 0) { + PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask); + PredRecipe->replaceAllUsesWith(PHIRecipe); + PHIRecipe->setOperand(0, RecipeWithoutMask); + } + PredRecipe->eraseFromParent(); + auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); + + // Note: first set Entry as region entry and then connect successors starting + // from it in order, to propagate the "parent" of each VPBasicBlock. + VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); + VPBlockUtils::connectBlocks(Pred, Exiting); + + return Region; +} + +static void addReplicateRegions(VPlan &Plan) { + SmallVector<VPReplicateRecipe *> WorkList; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) + if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) { + if (RepR->isPredicated()) + WorkList.push_back(RepR); + } + } + + unsigned BBNum = 0; + for (VPReplicateRecipe *RepR : WorkList) { + VPBasicBlock *CurrentBlock = RepR->getParent(); + VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator()); + + BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent(); + SplitBlock->setName( + OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : ""); + // Record predicated instructions for above packing optimizations. + VPBlockBase *Region = createReplicateRegion(RepR, Plan); + Region->setParent(CurrentBlock->getParent()); + VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock); + VPBlockUtils::connectBlocks(CurrentBlock, Region); + VPBlockUtils::connectBlocks(Region, SplitBlock); + } +} + +void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) { + // Convert masked VPReplicateRecipes to if-then region blocks. 
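// Illustration (not part of the patch): a replicate region is the VPlan analogue
// of guarding one scalar instruction with an if-then block. The plain-IR
// equivalent, using SplitBlockAndInsertIfThen (helper name is illustrative):
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

void guardWithMaskBit(Instruction *I, Value *MaskBit) {
  // Splits I's block, branches on MaskBit into a new ".if"-style block and
  // falls through to the continue block; I is then moved under the guard.
  Instruction *ThenTerm =
      SplitBlockAndInsertIfThen(MaskBit, I, /*Unreachable=*/false);
  I->moveBefore(ThenTerm);
}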
+ addReplicateRegions(Plan); + + bool ShouldSimplify = true; + while (ShouldSimplify) { + ShouldSimplify = sinkScalarOperands(Plan); + ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan); + ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan); + } +} bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) { SmallVector<VPBasicBlock *> WorkList; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -395,7 +459,10 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) { // everything WidenNewIV's users need. That is, WidenOriginalIV will // generate a vector phi or all users of WidenNewIV demand the first lane // only. - if (WidenOriginalIV->needsVectorIV() || + if (any_of(WidenOriginalIV->users(), + [WidenOriginalIV](VPUser *U) { + return !U->usesScalars(WidenOriginalIV); + }) || vputils::onlyFirstLaneUsed(WidenNewIV)) { WidenNewIV->replaceAllUsesWith(WidenOriginalIV); WidenNewIV->eraseFromParent(); @@ -440,10 +507,10 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { if (Instruction *TruncI = WideIV->getTruncInst()) ResultTy = TruncI->getType(); const InductionDescriptor &ID = WideIV->getInductionDescriptor(); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE); + VPValue *Step = WideIV->getStepValue(); VPValue *BaseIV = CanonicalIV; - if (!CanonicalIV->isCanonical(ID, ResultTy)) { + if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step, + ResultTy)) { BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV, Step, ResultTy); HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP); @@ -522,9 +589,9 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, return; LLVMContext &Ctx = SE.getContext(); - auto *BOC = - new VPInstruction(VPInstruction::BranchOnCond, - {Plan.getOrAddExternalDef(ConstantInt::getTrue(Ctx))}); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))}); Term->eraseFromParent(); ExitingVPBB->appendRecipe(BOC); Plan.setVF(BestVF); @@ -533,3 +600,181 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, // 1. Replace inductions with constants. // 2. Replace vector loop region with VPBasicBlock. 
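// Illustration (not part of the patch): the BranchOnCond-on-true recipe added in
// optimizeForVFAndUF amounts to folding the vector latch so the body runs exactly
// once when the trip count is known to fit a single VF*UF iteration. In plain IR
// terms (helper name is illustrative):
#include <cassert>
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void foldLatchToSingleIteration(BranchInst *LatchBr) {
  assert(LatchBr->isConditional() && "expected a conditional vector latch");
  // With a constant-true condition, successor 0 (the exit edge) is always taken.
  LatchBr->setCondition(ConstantInt::getTrue(LatchBr->getContext()));
}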
} + +#ifndef NDEBUG +static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) { + auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + if (Region && Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && + Region->getNumPredecessors() == 1 && "Expected SESE region!"); + assert(R->getParent()->size() == 1 && + "A recipe in an original replicator region must be the only " + "recipe in its block"); + return Region; + } + return nullptr; +} +#endif + +static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B, + VPDominatorTree &VPDT) { + if (A == B) + return false; + + auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { + for (auto &R : *A->getParent()) { + if (&R == A) + return true; + if (&R == B) + return false; + } + llvm_unreachable("recipe not found"); + }; + const VPBlockBase *ParentA = A->getParent(); + const VPBlockBase *ParentB = B->getParent(); + if (ParentA == ParentB) + return LocalComesBefore(A, B); + + assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) && + "No replicate regions expected at this point"); + assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) && + "No replicate regions expected at this point"); + return VPDT.properlyDominates(ParentA, ParentB); +} + +/// Sink users of \p FOR after the recipe defining the previous value \p +/// Previous of the recurrence. \returns true if all users of \p FOR could be +/// re-arranged as needed or false if it is not possible. +static bool +sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, + VPRecipeBase *Previous, + VPDominatorTree &VPDT) { + // Collect recipes that need sinking. + SmallVector<VPRecipeBase *> WorkList; + SmallPtrSet<VPRecipeBase *, 8> Seen; + Seen.insert(Previous); + auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { + // The previous value must not depend on the users of the recurrence phi. In + // that case, FOR is not a fixed order recurrence. + if (SinkCandidate == Previous) + return false; + + if (isa<VPHeaderPHIRecipe>(SinkCandidate) || + !Seen.insert(SinkCandidate).second || + properlyDominates(Previous, SinkCandidate, VPDT)) + return true; + + if (SinkCandidate->mayHaveSideEffects()) + return false; + + WorkList.push_back(SinkCandidate); + return true; + }; + + // Recursively sink users of FOR after Previous. + WorkList.push_back(FOR); + for (unsigned I = 0; I != WorkList.size(); ++I) { + VPRecipeBase *Current = WorkList[I]; + assert(Current->getNumDefinedValues() == 1 && + "only recipes with a single defined value expected"); + + for (VPUser *User : Current->getVPSingleValue()->users()) { + if (auto *R = dyn_cast<VPRecipeBase>(User)) + if (!TryToPushSinkCandidate(R)) + return false; + } + } + + // Keep recipes to sink ordered by dominance so earlier instructions are + // processed first. 
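// Illustration (not part of the patch): VPlan recipes carry no cached local
// numbering, hence the linear block scan in LocalComesBefore above. For LLVM IR
// instructions the same query is Instruction::comesBefore plus the dominator
// tree (helper name is illustrative):
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

bool properlyDominatesIR(const Instruction *A, const Instruction *B,
                         const DominatorTree &DT) {
  if (A == B)
    return false;
  if (A->getParent() == B->getParent())
    return A->comesBefore(B);
  return DT.properlyDominates(A->getParent(), B->getParent());
}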
+ sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { + return properlyDominates(A, B, VPDT); + }); + + for (VPRecipeBase *SinkCandidate : WorkList) { + if (SinkCandidate == FOR) + continue; + + SinkCandidate->moveAfter(Previous); + Previous = SinkCandidate; + } + return true; +} + +bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, + VPBuilder &Builder) { + VPDominatorTree VPDT; + VPDT.recalculate(Plan); + + SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis; + for (VPRecipeBase &R : + Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) + if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + RecurrencePhis.push_back(FOR); + + for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { + SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis; + VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); + // Fixed-order recurrences do not contain cycles, so this loop is guaranteed + // to terminate. + while (auto *PrevPhi = + dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) { + assert(PrevPhi->getParent() == FOR->getParent()); + assert(SeenPhis.insert(PrevPhi).second); + Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe(); + } + + if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT)) + return false; + + // Introduce a recipe to combine the incoming and previous values of a + // fixed-order recurrence. + VPBasicBlock *InsertBlock = Previous->getParent(); + if (isa<VPHeaderPHIRecipe>(Previous)) + Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); + else + Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator())); + + auto *RecurSplice = cast<VPInstruction>( + Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, + {FOR, FOR->getBackedgeValue()})); + + FOR->replaceAllUsesWith(RecurSplice); + // Set the first operand of RecurSplice to FOR again, after replacing + // all users. + RecurSplice->setOperand(0, FOR); + } + return true; +} + +void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { + for (VPRecipeBase &R : + Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR) + continue; + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurKind::Add && RK != RecurKind::Mul) + continue; + + SmallSetVector<VPValue *, 8> Worklist; + Worklist.insert(PhiR); + + for (unsigned I = 0; I != Worklist.size(); ++I) { + VPValue *Cur = Worklist[I]; + if (auto *RecWithFlags = + dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) { + RecWithFlags->dropPoisonGeneratingFlags(); + } + + for (VPUser *U : Cur->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + for (VPValue *V : UserRecipe->definedValues()) + Worklist.insert(V); + } + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index be0d8e76d809..3eccf6e9600d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -25,23 +25,23 @@ class ScalarEvolution; class Loop; class PredicatedScalarEvolution; class TargetLibraryInfo; +class VPBuilder; +class VPRecipeBuilder; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. 
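// Illustration (not part of the patch): clearReductionWrapFlags walks forward
// from the reduction phi and strips poison-generating flags, since the
// vectorized reduction may reassociate. An IR-level sketch of that worklist
// (in the pass the walk is confined to the vector loop region; helper name is
// illustrative):
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void clearWrapFlagsFrom(PHINode *RdxPhi) {
  SmallSetVector<Instruction *, 8> Worklist;
  Worklist.insert(RdxPhi);
  for (unsigned I = 0; I != Worklist.size(); ++I) {
    Instruction *Cur = Worklist[I];
    Cur->dropPoisonGeneratingFlags(); // no-op where no such flags exist
    for (User *U : Cur->users())
      if (auto *UI = dyn_cast<Instruction>(U))
        Worklist.insert(UI);
  }
}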
static void - VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan, + VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE, const TargetLibraryInfo &TLI); - static bool sinkScalarOperands(VPlan &Plan); - - /// Merge replicate regions in their successor region, if a replicate region - /// is connected to a successor replicate region with the same predicate by a - /// single, empty VPBasicBlock. - static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan); + /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then + /// region block and remove the mask operand. Optimize the created regions by + /// iteratively sinking scalar operands into the region, followed by merging + /// regions until no improvements are remaining. + static void createAndOptimizeReplicateRegions(VPlan &Plan); /// Remove redundant VPBasicBlocks by merging them into their predecessor if /// the predecessor has a single successor. @@ -71,6 +71,19 @@ struct VPlanTransforms { /// them with already existing recipes expanding the same SCEV expression. static void removeRedundantExpandSCEVRecipes(VPlan &Plan); + /// Sink users of fixed-order recurrences after the recipe defining their + /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions + /// to combine the value from the recurrence phis and previous values. The + /// current implementation assumes all users can be sunk after the previous + /// value, which is enforced by earlier legality checks. + /// \returns true if all users of fixed-order recurrences could be re-arranged + /// as needed or false if it is not possible. In the latter case, \p Plan is + /// not valid. + static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + + /// Clear NSW/NUW flags from reduction instructions if necessary. + static void clearReductionWrapFlags(VPlan &Plan); + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 62ec65cbfe5d..ac110bb3b0ef 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -171,16 +171,19 @@ public: /// Returns true if this VPValue is defined by a recipe. bool hasDefiningRecipe() const { return getDefiningRecipe(); } + /// Returns true if this VPValue is a live-in, i.e. defined outside the VPlan. + bool isLiveIn() const { return !hasDefiningRecipe(); } + /// Returns the underlying IR value, if this VPValue is defined outside the /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef /// inside a VPlan. Value *getLiveInIRValue() { - assert(!hasDefiningRecipe() && + assert(isLiveIn() && "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); return getUnderlyingValue(); } const Value *getLiveInIRValue() const { - assert(!hasDefiningRecipe() && + assert(isLiveIn() && "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); return getUnderlyingValue(); } @@ -342,15 +345,16 @@ public: VPScalarIVStepsSC, VPWidenCallSC, VPWidenCanonicalIVSC, + VPWidenCastSC, VPWidenGEPSC, VPWidenMemoryInstructionSC, VPWidenSC, VPWidenSelectSC, - - // Phi-like recipes. Need to be kept together. + // START: Phi-like recipes. Need to be kept together. 
VPBlendSC, VPPredInstPHISC, - // Header-phi recipes. Need to be kept together. + // START: SubclassID for recipes that inherit VPHeaderPHIRecipe. + // VPHeaderPHIRecipe need to be kept together. VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, @@ -358,8 +362,11 @@ public: VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, VPReductionPHISC, + // END: SubclassID for recipes that inherit VPHeaderPHIRecipe + // END: Phi-like recipes VPFirstPHISC = VPBlendSC, VPFirstHeaderPHISC = VPCanonicalIVPHISC, + VPLastHeaderPHISC = VPReductionPHISC, VPLastPHISC = VPReductionPHISC, }; @@ -434,6 +441,7 @@ class VPSlotTracker { void assignSlot(const VPValue *V); void assignSlots(const VPlan &Plan); + void assignSlots(const VPBasicBlock *VPBB); public: VPSlotTracker(const VPlan *Plan = nullptr) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 18125cebed33..d6b81543dbc9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -15,6 +15,7 @@ #include "VPlanVerifier.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Support/CommandLine.h" @@ -189,9 +190,8 @@ static bool verifyPhiRecipes(const VPBasicBlock *VPBB) { return true; } -static bool -verifyVPBasicBlock(const VPBasicBlock *VPBB, - DenseMap<const VPBlockBase *, unsigned> &BlockNumbering) { +static bool verifyVPBasicBlock(const VPBasicBlock *VPBB, + VPDominatorTree &VPDT) { if (!verifyPhiRecipes(VPBB)) return false; @@ -206,7 +206,8 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB, for (const VPValue *V : R.definedValues()) { for (const VPUser *U : V->users()) { auto *UI = dyn_cast<VPRecipeBase>(U); - if (!UI || isa<VPHeaderPHIRecipe>(UI)) + // TODO: check dominance of incoming values for phis properly. + if (!UI || isa<VPHeaderPHIRecipe>(UI) || isa<VPPredInstPHIRecipe>(UI)) continue; // If the user is in the same block, check it comes after R in the @@ -219,27 +220,7 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB, continue; } - // Skip blocks outside any region for now and blocks outside - // replicate-regions. - auto *ParentR = VPBB->getParent(); - if (!ParentR || !ParentR->isReplicator()) - continue; - - // For replicators, verify that VPPRedInstPHIRecipe defs are only used - // in subsequent blocks. - if (isa<VPPredInstPHIRecipe>(&R)) { - auto I = BlockNumbering.find(UI->getParent()); - unsigned BlockNumber = I == BlockNumbering.end() ? std::numeric_limits<unsigned>::max() : I->second; - if (BlockNumber < BlockNumbering[ParentR]) { - errs() << "Use before def!\n"; - return false; - } - continue; - } - - // All non-VPPredInstPHIRecipe recipes in the block must be used in - // the replicate region only. 
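// Illustration (not part of the patch): the verifier's block-numbering scheme is
// replaced by a real dominance query. The same def-dominates-use check phrased
// for plain IR (helper name is illustrative):
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

bool verifyUsesDominated(const Instruction &Def, const DominatorTree &DT) {
  for (const Use &U : Def.uses()) {
    if (isa<PHINode>(U.getUser()))
      continue; // incoming values need a phi-specific check
    if (!DT.dominates(&Def, U))
      return false; // "Use before def!"
  }
  return true;
}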
- if (UI->getParent()->getParent() != ParentR) { + if (!VPDT.dominates(VPBB, UI->getParent())) { errs() << "Use before def!\n"; return false; } @@ -250,15 +231,13 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB, } bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { - DenseMap<const VPBlockBase *, unsigned> BlockNumbering; - unsigned Cnt = 0; + VPDominatorTree VPDT; + VPDT.recalculate(const_cast<VPlan &>(Plan)); + auto Iter = vp_depth_first_deep(Plan.getEntry()); - for (const VPBlockBase *VPB : Iter) { - BlockNumbering[VPB] = Cnt++; - auto *VPBB = dyn_cast<VPBasicBlock>(VPB); - if (!VPBB) - continue; - if (!verifyVPBasicBlock(VPBB, BlockNumbering)) + for (const VPBasicBlock *VPBB : + VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { + if (!verifyVPBasicBlock(VPBB, VPDT)) return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 2e489757ebc1..13464c9d3496 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -25,11 +25,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" #include <numeric> #define DEBUG_TYPE "vector-combine" @@ -247,7 +244,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // still need a shuffle to change the vector size. auto *Ty = cast<FixedVectorType>(I.getType()); unsigned OutputNumElts = Ty->getNumElements(); - SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); + SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem); assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); Mask[0] = OffsetEltIndex; if (OffsetEltIndex) @@ -460,9 +457,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // If we are extracting from 2 different indexes, then one operand must be // shuffled before performing the vector operation. The shuffle mask is - // undefined except for 1 lane that is being translated to the remaining + // poison except for 1 lane that is being translated to the remaining // extraction lane. Therefore, it is a splat shuffle. Ex: - // ShufMask = { undef, undef, 0, undef } + // ShufMask = { poison, poison, 0, poison } // TODO: The cost model has an option for a "broadcast" shuffle // (splat-from-element-0), but no option for a more general splat. NewCost += @@ -479,11 +476,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, /// to a new element location. static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilder<> &Builder) { - // The shuffle mask is undefined except for 1 lane that is being translated + // The shuffle mask is poison except for 1 lane that is being translated // to the new element index. 
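// Illustration (not part of the patch): with UndefMaskElem retired in favour of
// PoisonMaskElem, the "shift" shuffle is a mask of poison lanes with a single
// live lane. Standalone sketch mirroring createShiftShuffle:
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

Value *shiftLane(IRBuilder<> &Builder, Value *Vec, unsigned OldIndex,
                 unsigned NewIndex) {
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> Mask(VecTy->getNumElements(), PoisonMaskElem);
  Mask[NewIndex] = OldIndex; // every other lane stays poison
  return Builder.CreateShuffleVector(Vec, Mask, "shift");
}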
Example for OldIndex == 2 and NewIndex == 0: - // ShufMask = { 2, undef, undef, undef } + // ShufMask = { 2, poison, poison, poison } auto *VecTy = cast<FixedVectorType>(Vec->getType()); - SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); + SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); ShufMask[NewIndex] = OldIndex; return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); } @@ -917,7 +914,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); InstructionCost NewCost = TTI.getCmpSelInstrCost( CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); - SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); + SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); ShufMask[CheapIndex] = ExpensiveIndex; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, ShufMask); @@ -932,7 +929,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // Create a vector constant from the 2 scalar constants. SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), - UndefValue::get(VecTy->getElementType())); + PoisonValue::get(VecTy->getElementType())); CmpC[Index0] = C0; CmpC[Index1] = C1; Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC)); @@ -1565,7 +1562,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // Calculate our ReconstructMasks from the OrigReconstructMasks and the // modified order of the input shuffles. SmallVector<SmallVector<int>> ReconstructMasks; - for (auto Mask : OrigReconstructMasks) { + for (const auto &Mask : OrigReconstructMasks) { SmallVector<int> ReconstructMask; for (int M : Mask) { auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { @@ -1596,12 +1593,12 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); } while (V1A.size() < NumElts) { - V1A.push_back(UndefMaskElem); - V1B.push_back(UndefMaskElem); + V1A.push_back(PoisonMaskElem); + V1B.push_back(PoisonMaskElem); } while (V2A.size() < NumElts) { - V2A.push_back(UndefMaskElem); - V2B.push_back(UndefMaskElem); + V2A.push_back(PoisonMaskElem); + V2B.push_back(PoisonMaskElem); } auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { @@ -1660,16 +1657,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { return SSV->getOperand(Op); return SV->getOperand(Op); }; - Builder.SetInsertPoint(SVI0A->getNextNode()); + Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef()); Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), GetShuffleOperand(SVI0A, 1), V1A); - Builder.SetInsertPoint(SVI0B->getNextNode()); + Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef()); Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), GetShuffleOperand(SVI0B, 1), V1B); - Builder.SetInsertPoint(SVI1A->getNextNode()); + Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef()); Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), GetShuffleOperand(SVI1A, 1), V2A); - Builder.SetInsertPoint(SVI1B->getNextNode()); + Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef()); Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); @@ -1811,54 +1808,6 @@ bool VectorCombine::run() { return MadeChange; } -// Pass manager boilerplate 
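// Illustration (not part of the patch): the sparse compare-constant vector in
// foldExtractedCmps now uses poison (not undef) filler lanes; a standalone
// helper of that shape (helper name is illustrative):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
using namespace llvm;

Constant *makeSparseConstVec(Type *EltTy, unsigned NumElts, unsigned Idx0,
                             Constant *C0, unsigned Idx1, Constant *C1) {
  SmallVector<Constant *, 32> Elts(NumElts, PoisonValue::get(EltTy));
  Elts[Idx0] = C0;
  Elts[Idx1] = C1;
  return ConstantVector::get(Elts);
}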
below here. - -namespace { -class VectorCombineLegacyPass : public FunctionPass { -public: - static char ID; - VectorCombineLegacyPass() : FunctionPass(ID) { - initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.setPreservesCFG(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - FunctionPass::getAnalysisUsage(AU); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - VectorCombine Combiner(F, TTI, DT, AA, AC, false); - return Combiner.run(); - } -}; -} // namespace - -char VectorCombineLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine", - "Optimize scalar/vector ops", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine", - "Optimize scalar/vector ops", false, false) -Pass *llvm::createVectorCombinePass() { - return new VectorCombineLegacyPass(); -} - PreservedAnalyses VectorCombinePass::run(Function &F, FunctionAnalysisManager &FAM) { auto &AC = FAM.getResult<AssumptionAnalysis>(F); diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp index 208e5eeea864..2f5048d2a664 100644 --- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -12,10 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Vectorize.h" -#include "llvm-c/Initialization.h" -#include "llvm-c/Transforms/Vectorize.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" @@ -23,20 +19,5 @@ using namespace llvm; /// Initialize all passes linked into the Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { - initializeLoopVectorizePass(Registry); - initializeSLPVectorizerPass(Registry); initializeLoadStoreVectorizerLegacyPassPass(Registry); - initializeVectorCombineLegacyPassPass(Registry); -} - -void LLVMInitializeVectorization(LLVMPassRegistryRef R) { - initializeVectorization(*unwrap(R)); -} - -void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLoopVectorizePass()); -} - -void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createSLPVectorizerPass()); } |
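// Illustration (not part of the patch): with VectorCombineLegacyPass and the
// C-API vectorize entry points gone, the pass is reached through the new pass
// manager only, e.g. `opt -passes=vector-combine`. A minimal programmatic
// sketch (function name is illustrative):
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
using namespace llvm;

void runVectorCombine(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass());
  FPM.run(F, FAM);
}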