Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp       | 2049
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp |  188
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h    |   64
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 2346
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             | 4239
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h             |   24
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                     |  116
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                       |  680
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanCFG.h                    |    1
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp          |   25
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h            |    5
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp              |  385
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp           |  327
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h             |   29
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h                  |   18
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp             |   45
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp             |   85
-rw-r--r--  llvm/lib/Transforms/Vectorize/Vectorize.cpp                 |   19
18 files changed, 6349 insertions, 4296 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 0b7fc853dc1b..260d7889906b 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -37,13 +37,34 @@
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
+//
+// Overview of the algorithm and terminology in this pass:
+//
+// - Break up each basic block into pseudo-BBs, composed of instructions which
+// are guaranteed to transfer control to their successors.
+// - Within a single pseudo-BB, find all loads, and group them into
+// "equivalence classes" according to getUnderlyingObject() and loaded
+// element size. Do the same for stores.
+// - For each equivalence class, greedily build "chains". Each chain has a
+// leader instruction, and every other member of the chain has a known
+// constant offset from the first instr in the chain.
+// - Break up chains so that they contain only contiguous accesses of legal
+// size with no intervening may-alias instrs.
+// - Convert each chain to vector instructions.
+//
+// The O(n^2) behavior of this pass comes from initially building the chains.
+// In the worst case we have to compare each new instruction to all of those
+// that came before. To limit this, we only calculate the offset to the leaders
+// of the N most recently-used chains.
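[Editor's note: to make the steps above concrete, here is a rough sketch of the control flow they describe. This is an illustrative sketch only; `runOnPseudoBlock`, `groupIntoEquivalenceClasses`, `buildChains`, `splitByAliasContiguityAndAlignment`, and `emitVectorAccess` are hypothetical names, and the real entry points are the Vectorizer member functions introduced later in this diff.]

    // Illustrative sketch only -- hypothetical helper names standing in for
    // the real member functions added further down in this patch.
    bool runOnPseudoBlock(ArrayRef<Instruction *> PseudoBB) {
      bool Changed = false;
      // 1. Group loads/stores by (underlying object, addrspace, elem size, is-load).
      for (auto &[Key, MemOps] : groupIntoEquivalenceClasses(PseudoBB))
        // 2. Greedily build chains with known constant offsets from a leader.
        for (auto &C : buildChains(MemOps))
          // 3. Split on aliasing hazards, contiguity, and legal/aligned sizes,
          //    then emit one vector load or store per surviving sub-chain.
          for (auto &Sub : splitByAliasContiguityAndAlignment(C))
            Changed |= emitVectorAccess(Sub);
      return Changed;
    }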
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -57,6 +78,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -67,23 +89,33 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <optional>
#include <tuple>
+#include <type_traits>
#include <utility>
+#include <vector>
using namespace llvm;
@@ -92,21 +124,115 @@ using namespace llvm;
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+namespace {
+
+// Equivalence class key, the initial tuple by which we group loads/stores.
+// Loads/stores with different EqClassKeys are never merged.
+//
+// (We could in theory remove element-size from this tuple. We'd just need
+// to fix up the vector packing/unpacking code.)
+using EqClassKey =
+ std::tuple<const Value * /* result of getUnderlyingObject() */,
+ unsigned /* AddrSpace */,
+ unsigned /* Load/Store element size bits */,
+ char /* IsLoad; char b/c bool can't be a DenseMap key */
+ >;
+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const EqClassKey &K) {
+ const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
+ return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
+ << " of element size " << ElementSize << " bits in addrspace "
+ << AddrSpace;
+}
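[Editor's note: a quick illustration of how the key behaves. Sketch only; `Obj` is a stand-in for some getUnderlyingObject() result and the values are hypothetical.]

    // Two i32 accesses to the same underlying object in addrspace 0:
    // a load and a store get different keys, so they are never merged.
    const Value *Obj = /* result of getUnderlyingObject(...) */ nullptr;
    EqClassKey LoadKey{Obj, /*AddrSpace=*/0, /*ElementSizeBits=*/32, /*IsLoad=*/1};
    EqClassKey StoreKey{Obj, /*AddrSpace=*/0, /*ElementSizeBits=*/32, /*IsLoad=*/0};
    assert(LoadKey != StoreKey);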
+
+// A Chain is a set of instructions such that:
+// - All instructions have the same equivalence class, so in particular all are
+// loads, or all are stores.
+// - We know the address accessed by the i'th chain elem relative to the
+// chain's leader instruction, which is the first instr of the chain in BB
+// order.
+//
+// Chains have two canonical orderings:
+// - BB order, sorted by Instr->comesBefore.
+// - Offset order, sorted by OffsetFromLeader.
+// This pass switches back and forth between these orders.
+struct ChainElem {
+ Instruction *Inst;
+ APInt OffsetFromLeader;
+};
+using Chain = SmallVector<ChainElem, 1>;
+
+void sortChainInBBOrder(Chain &C) {
+ sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
+}
+
+void sortChainInOffsetOrder(Chain &C) {
+ sort(C, [](const auto &A, const auto &B) {
+ if (A.OffsetFromLeader != B.OffsetFromLeader)
+ return A.OffsetFromLeader.slt(B.OffsetFromLeader);
+ return A.Inst->comesBefore(B.Inst); // stable tiebreaker
+ });
+}
+
+[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
+ for (const auto &E : C) {
+ dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
+ }
+}
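[Editor's note: a minimal sketch of the two orderings. It assumes `LoadA` and `LoadB` are `Instruction *` for two adjacent i32 loads, with `LoadB` appearing first in the block; offsets are bytes relative to the chain leader.]

    Chain C;
    C.push_back(ChainElem{LoadB, APInt(/*numBits=*/64, /*val=*/4)});
    C.push_back(ChainElem{LoadA, APInt(/*numBits=*/64, /*val=*/0)});
    sortChainInOffsetOrder(C); // C[0] is now LoadA (offset 0), C[1] is LoadB (offset 4)
    sortChainInBBOrder(C);     // back to basic-block order: LoadB first, then LoadA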
+
+using EquivalenceClassMap =
+ MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
+
// FIXME: Assuming stack alignment of 4 is always good enough
-static const unsigned StackAdjustedAlignment = 4;
+constexpr unsigned StackAdjustedAlignment = 4;
-namespace {
+Instruction *propagateMetadata(Instruction *I, const Chain &C) {
+ SmallVector<Value *, 8> Values;
+ for (const ChainElem &E : C)
+ Values.push_back(E.Inst);
+ return propagateMetadata(I, Values);
+}
-/// ChainID is an arbitrary token that is allowed to be different only for the
-/// accesses that are guaranteed to be considered non-consecutive by
-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
-/// together and reducing the number of instructions the main search operates on
-/// at a time, i.e. this is to reduce compile time and nothing else as the main
-/// search has O(n^2) time complexity. The underlying type of ChainID should not
-/// be relied upon.
-using ChainID = const Value *;
-using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<ChainID, InstrList>;
+bool isInvariantLoad(const Instruction *I) {
+ const LoadInst *LI = dyn_cast<LoadInst>(I);
+ return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
+}
+
+/// Reorders the instructions that I depends on (the instructions defining its
+/// operands), to ensure they dominate I.
+void reorder(Instruction *I) {
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!IM->comesBefore(I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
+ Instruction *IM = &*(BBI++);
+ if (!InstructionsToMove.count(IM))
+ continue;
+ IM->moveBefore(I);
+ }
+}
class Vectorizer {
Function &F;
@@ -118,6 +244,12 @@ class Vectorizer {
const DataLayout &DL;
IRBuilder<> Builder;
+ // We could erase instrs right after vectorizing them, but that can mess up
+ // our BB iterators, and also can make the equivalence class keys point to
+ // freed memory. This is fixable, but it's simpler just to wait until we're
+ // done with the BB and erase all at once.
+ SmallVector<Instruction *, 128> ToErase;
+
public:
Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -127,70 +259,83 @@ public:
bool run();
private:
- unsigned getPointerAddressSpace(Value *I);
-
static const unsigned MaxDepth = 3;
- bool isConsecutiveAccess(Value *A, Value *B);
- bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth = 0) const;
- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth) const;
- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
- unsigned Depth) const;
-
- /// After vectorization, reorder the instructions that I depends on
- /// (the instructions defining its operands), to ensure they dominate I.
- void reorder(Instruction *I);
-
- /// Returns the first and the last instructions in Chain.
- std::pair<BasicBlock::iterator, BasicBlock::iterator>
- getBoundaryInstrs(ArrayRef<Instruction *> Chain);
-
- /// Erases the original instructions after vectorizing.
- void eraseInstructions(ArrayRef<Instruction *> Chain);
-
- /// "Legalize" the vector type that would be produced by combining \p
- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
- /// expected to have more than 4 elements.
- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
-
- /// Finds the largest prefix of Chain that's vectorizable, checking for
- /// intervening instructions which may affect the memory accessed by the
- /// instructions within Chain.
+ /// Runs the vectorizer on a "pseudo basic block", which is a range of
+ /// instructions [Begin, End) within one BB all of which have
+ /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
+ bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
+
+ /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
+ /// in the same BB with the same value for getUnderlyingObject() etc.
+ bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass);
+
+ /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
+ /// where all instructions access a known, constant offset from the first
+ /// instruction.
+ bool runOnChain(Chain &C);
+
+ /// Splits the chain into subchains of instructions which read/write a
+ /// contiguous block of memory. Discards any length-1 subchains (because
+ /// there's nothing to vectorize in there).
+ std::vector<Chain> splitChainByContiguity(Chain &C);
+
+ /// Splits the chain into subchains where it's safe to hoist loads up to the
+ /// beginning of the sub-chain and it's safe to sink loads up to the end of
+ /// the sub-chain. Discards any length-1 subchains.
+ std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
+
+ /// Splits the chain into subchains that make legal, aligned accesses.
+ /// Discards any length-1 subchains.
+ std::vector<Chain> splitChainByAlignment(Chain &C);
+
+ /// Converts the instrs in the chain into a single vectorized load or store.
+ /// Adds the old scalar loads/stores to ToErase.
+ bool vectorizeChain(Chain &C);
+
+ /// Tries to compute the offset in bytes PtrB - PtrA.
+ std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth = 0);
+ std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth);
+ std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth);
+
+ /// Gets the element type of the vector that the chain will load or store.
+ /// This is nontrivial because the chain may contain elements of different
+ /// types; e.g. it's legal to have a chain that contains both i32 and float.
+ Type *getChainElemTy(const Chain &C);
+
+ /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
+ /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
+ /// instructions.
+ ///
+ /// The map ChainElemOffsets must contain all of the elements in
+ /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
+ /// address. It's ok if it contains additional entries.
+ template <bool IsLoadChain>
+ bool isSafeToMove(
+ Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+
+ /// Collects loads and stores grouped by "equivalence class", where:
+ /// - all elements in an eq class are a load or all are a store,
+ /// - they all load/store the same element size (it's OK to have e.g. i8 and
+ /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
+ /// - they all have the same value for getUnderlyingObject().
+ EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
+ BasicBlock::iterator End);
+
+ /// Partitions Instrs into "chains" where every instruction has a known
+ /// constant offset from the first instr in the chain.
///
- /// The elements of \p Chain must be all loads or all stores and must be in
- /// address order.
- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
-
- /// Collects load and store instructions to vectorize.
- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
-
- /// Processes the collected instructions, the \p Map. The values of \p Map
- /// should be all loads or all stores.
- bool vectorizeChains(InstrListMap &Map);
-
- /// Finds the load/stores to consecutive memory addresses and vectorizes them.
- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
-
- /// Vectorizes the load instructions in Chain.
- bool
- vectorizeLoadChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Vectorizes the store instructions in Chain.
- bool
- vectorizeStoreChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Check if this load/store access is misaligned accesses.
- /// Returns a \p RelativeSpeed of an operation if allowed suitable to
- /// compare to another result for the same \p AddressSpace and potentially
- /// different \p Alignment and \p SzInBytes.
- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- Align Alignment, unsigned &RelativeSpeed);
+ /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
+ /// first instr in the chain is the leader, and an instr touches distance 0
+ /// from itself.
+ std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -198,7 +343,8 @@ public:
static char ID;
LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ initializeLoadStoreVectorizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -250,11 +396,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- Vectorizer V(F, AA, AC, DT, SE, TTI);
- return V.run();
+ return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}
-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return PreservedAnalyses::all();
@@ -265,125 +411,681 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- Vectorizer V(F, AA, AC, DT, SE, TTI);
- bool Changed = V.run();
+ bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return Changed ? PA : PreservedAnalyses::all();
}
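[Editor's note: for context on how the new-PM entry point above is typically driven, here is a minimal sketch using standard pass-manager boilerplate; none of this is part of the patch. The pass can also be invoked from the command line as `opt -passes=load-store-vectorizer`.]

    PassBuilder PB;
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM;
    // Wrap the function pass so it runs on every function in module M.
    MPM.addPass(createModuleToFunctionPassAdaptor(LoadStoreVectorizerPass()));
    MPM.run(M, MAM); // M : Module&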
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
- SmallVector<Value *, 8> VL(IL.begin(), IL.end());
- propagateMetadata(I, VL);
-}
-
-// Vectorizer Implementation
bool Vectorizer::run() {
bool Changed = false;
-
- // Scan the blocks in the function in post order.
+ // Break up the BB if there are any instrs which aren't guaranteed to transfer
+ // execution to their successor.
+ //
+ // Consider, for example:
+ //
+ // def assert_arr_len(int n) { if (n < 2) exit(); }
+ //
+ // load arr[0]
+ // call assert_arr_len(arr.length)
+ // load arr[1]
+ //
+ // Even though assert_arr_len does not read or write any memory, we can't
+ // speculate the second load before the call. More info at
+ // https://github.com/llvm/llvm-project/issues/52950.
for (BasicBlock *BB : post_order(&F)) {
- InstrListMap LoadRefs, StoreRefs;
- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
- Changed |= vectorizeChains(LoadRefs);
- Changed |= vectorizeChains(StoreRefs);
+ // BB must at least have a terminator.
+ assert(!BB->empty());
+
+ SmallVector<BasicBlock::iterator, 8> Barriers;
+ Barriers.push_back(BB->begin());
+ for (Instruction &I : *BB)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ Barriers.push_back(I.getIterator());
+ Barriers.push_back(BB->end());
+
+ for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
+ ++It)
+ Changed |= runOnPseudoBB(*It, *std::next(It));
+
+ for (Instruction *I : ToErase) {
+ auto *PtrOperand = getLoadStorePointerOperand(I);
+ if (I->use_empty())
+ I->eraseFromParent();
+ RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+ }
+ ToErase.clear();
}
return Changed;
}
-unsigned Vectorizer::getPointerAddressSpace(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
+bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
+ BasicBlock::iterator End) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
+ if (End != Begin->getParent()->end())
+ dbgs() << *End;
+ else
+ dbgs() << "<BB end>";
+ dbgs() << ")\n";
+ });
+
+ bool Changed = false;
+ for (const auto &[EqClassKey, EqClass] :
+ collectEquivalenceClasses(Begin, End))
+ Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+
+ return Changed;
}
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getLoadStorePointerOperand(A);
- Value *PtrB = getLoadStorePointerOperand(B);
- unsigned ASA = getPointerAddressSpace(A);
- unsigned ASB = getPointerAddressSpace(B);
+bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass) {
+ bool Changed = false;
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+ << " keyed on " << EqClassKey << ":\n";
+ for (Instruction *I : EqClass)
+ dbgs() << " " << *I << "\n";
+ });
- // Make sure that A and B are different pointers of the same size type.
- Type *PtrATy = getLoadStoreType(A);
- Type *PtrBTy = getLoadStoreType(B);
- if (PtrA == PtrB ||
- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
- DL.getTypeStoreSize(PtrATy->getScalarType()) !=
- DL.getTypeStoreSize(PtrBTy->getScalarType()))
- return false;
+ std::vector<Chain> Chains = gatherChains(EqClass);
+ LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
+ << " nontrivial chains.\n";);
+ for (Chain &C : Chains)
+ Changed |= runOnChain(C);
+ return Changed;
+}
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+bool Vectorizer::runOnChain(Chain &C) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
+ dumpChain(C);
+ });
- return areConsecutivePointers(PtrA, PtrB, Size);
+ // Split up the chain into increasingly smaller chains, until we can finally
+ // vectorize the chains.
+ //
+ // (Don't be scared by the depth of the loop nest here. These operations are
+ // all at worst O(n lg n) in the number of instructions, and splitting chains
+ // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
+ bool Changed = false;
+ for (auto &C : splitChainByMayAliasInstrs(C))
+ for (auto &C : splitChainByContiguity(C))
+ for (auto &C : splitChainByAlignment(C))
+ Changed |= vectorizeChain(C);
+ return Changed;
}
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
- APInt PtrDelta, unsigned Depth) const {
- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
- APInt OffsetA(PtrBitWidth, 0);
- APInt OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
+ if (C.empty())
+ return {};
- unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+ sortChainInBBOrder(C);
- if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
+ dumpChain(C);
+ });
+
+ // We know that elements in the chain with non-overlapping offsets can't
+ // alias, but AA may not be smart enough to figure this out. Use a
+ // hashtable.
+ DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
+ for (const auto &E : C)
+ ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
+
+ // Loads get hoisted up to the first load in the chain. Stores get sunk
+ // down to the last store in the chain. Our algorithm for loads is:
+ //
+ // - Take the first element of the chain. This is the start of a new chain.
+ // - Take the next element of `Chain` and check for may-alias instructions
+ // up to the start of NewChain. If no may-alias instrs, add it to
+ // NewChain. Otherwise, start a new NewChain.
+ //
+ // For stores it's the same except in the reverse direction.
+ //
+ // We expect IsLoad to be an std::bool_constant.
+ auto Impl = [&](auto IsLoad) {
+ // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
+ auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
+ if constexpr (IsLoad())
+ return std::make_pair(C.begin(), C.end());
+ else
+ return std::make_pair(C.rbegin(), C.rend());
+ }(IsLoad);
+ assert(ChainBegin != ChainEnd);
+
+ std::vector<Chain> Chains;
+ SmallVector<ChainElem, 1> NewChain;
+ NewChain.push_back(*ChainBegin);
+ for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
+ if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
+ ChainOffsets)) {
+ LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
+ << *ChainIt->Inst << " into " << *ChainBegin->Inst
+ << "\n");
+ NewChain.push_back(*ChainIt);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
+ << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
+ if (NewChain.size() > 1) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+ dumpChain(NewChain);
+ });
+ Chains.push_back(std::move(NewChain));
+ }
+
+ // Start a new chain.
+ NewChain = SmallVector<ChainElem, 1>({*ChainIt});
+ }
+ }
+ if (NewChain.size() > 1) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+ dumpChain(NewChain);
+ });
+ Chains.push_back(std::move(NewChain));
+ }
+ return Chains;
+ };
+
+ if (isa<LoadInst>(C[0].Inst))
+ return Impl(/*IsLoad=*/std::bool_constant<true>());
+
+ assert(isa<StoreInst>(C[0].Inst));
+ return Impl(/*IsLoad=*/std::bool_constant<false>());
+}
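[Editor's note: the `std::bool_constant` dispatch used above -- passing the direction as an argument so `if constexpr` can pick forward vs. reverse iteration without the MSVC capture issue mentioned in the comment -- can be shown in isolation. This is a self-contained sketch with no LLVM dependencies.]

    #include <cstdio>
    #include <type_traits>
    #include <vector>

    void printBothWays(const std::vector<int> &V) {
      // IsFwd arrives as a std::bool_constant argument, so `if constexpr` can
      // branch on it even though it is not a captured variable.
      auto Impl = [&](auto IsFwd) {
        if constexpr (IsFwd()) {
          for (auto It = V.begin(); It != V.end(); ++It)
            std::printf("%d ", *It);
        } else {
          for (auto It = V.rbegin(); It != V.rend(); ++It)
            std::printf("%d ", *It);
        }
        std::printf("\n");
      };
      Impl(std::bool_constant<true>());  // forward
      Impl(std::bool_constant<false>()); // reverse
    }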
+
+std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
+ if (C.empty())
+ return {};
+
+ sortChainInOffsetOrder(C);
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByContiguity considering chain:\n";
+ dumpChain(C);
+ });
+
+ std::vector<Chain> Ret;
+ Ret.push_back({C.front()});
+
+ for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
+ // Prev accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
+ auto &CurChain = Ret.back();
+ const ChainElem &Prev = CurChain.back();
+ unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+ assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
+ "collectEquivalenceClass");
+ APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
+
+ // Add this instruction to the end of the current chain, or start a new one.
+ bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
+ LLVM_DEBUG(dbgs() << "LSV: Instructions are "
+ << (AreContiguous ? "" : "not ") << "contiguous: "
+ << *Prev.Inst << " (ends at offset " << PrevReadEnd
+ << ") -> " << *It->Inst << " (starts at offset "
+ << It->OffsetFromLeader << ")\n");
+ if (AreContiguous)
+ CurChain.push_back(*It);
+ else
+ Ret.push_back({*It});
+ }
+
+ // Filter out length-1 chains, these are uninteresting.
+ llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
+ return Ret;
+}
+
+Type *Vectorizer::getChainElemTy(const Chain &C) {
+ assert(!C.empty());
+ // The rules are:
+ // - If there are any pointer types in the chain, use an integer type.
+ // - Prefer an integer type if it appears in the chain.
+ // - Otherwise, use the first type in the chain.
+ //
+ // The rule about pointer types is a simplification when we merge e.g. a load
+ // of a ptr and a double. There's no direct conversion from a ptr to a
+ // double; it requires a ptrtoint followed by a bitcast.
+ //
+ // It's unclear to me if the other rules have any practical effect, but we do
+ // it to match this pass's previous behavior.
+ if (any_of(C, [](const ChainElem &E) {
+ return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
+ })) {
+ return Type::getIntNTy(
+ F.getContext(),
+ DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
+ }
+
+ for (const ChainElem &E : C)
+ if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
+ return T;
+ return getLoadStoreType(C[0].Inst)->getScalarType();
+}
+
+std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
+ // We use a simple greedy algorithm.
+ // - Given a chain of length N, find all prefixes that
+ // (a) are not longer than the max register length, and
+ // (b) are a power of 2.
+ // - Starting from the longest prefix, try to create a vector of that length.
+ // - If one of them works, great. Repeat the algorithm on any remaining
+ // elements in the chain.
+ // - If none of them work, discard the first element and repeat on a chain
+ // of length N-1.
+ if (C.empty())
+ return {};
+
+ sortChainInOffsetOrder(C);
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByAlignment considering chain:\n";
+ dumpChain(C);
+ });
+
+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+ auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
+ unsigned ChainSizeBytes, VectorType *VecTy) {
+ return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
+ ChainSizeBytes, VecTy)
+ : TTI.getStoreVectorFactor(VF, LoadStoreSize,
+ ChainSizeBytes, VecTy);
+ };
+
+#ifndef NDEBUG
+ for (const auto &E : C) {
+ Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+ assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
+ "Should have filtered out non-power-of-two elements in "
+ "collectEquivalenceClasses.");
+ }
+#endif
+
+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+ unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+
+ std::vector<Chain> Ret;
+ for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
+ // Find candidate chains of size not greater than the largest vector reg.
+ // These chains are over the closed interval [CBegin, CEnd].
+ SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
+ CandidateChains;
+ for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
+ APInt Sz = C[CEnd].OffsetFromLeader +
+ DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
+ C[CBegin].OffsetFromLeader;
+ if (Sz.sgt(VecRegBytes))
+ break;
+ CandidateChains.push_back(
+ {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
+ }
+
+ // Consider the longest chain first.
+ for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
+ It != End; ++It) {
+ auto [CEnd, SizeBytes] = *It;
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
+ << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
+
+ Type *VecElemTy = getChainElemTy(C);
+ // Note, VecElemTy is a power of 2, but might be less than one byte. For
+ // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
+ // VecElemTy would be i4.
+ unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
+
+ // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
+ assert((8 * SizeBytes) % VecElemBits == 0);
+ unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
+ FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
+ unsigned VF = 8 * VecRegBytes / VecElemBits;
+
+ // Check that TTI is happy with this vectorization factor.
+ unsigned TargetVF = getVectorFactor(VF, VecElemBits,
+ VecElemBits * NumVecElems / 8, VecTy);
+ if (TargetVF != VF && TargetVF < NumVecElems) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+ "because TargetVF="
+ << TargetVF << " != VF=" << VF
+ << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
+ continue;
+ }
+
+ // Is a load/store with this alignment allowed by TTI and at least as fast
+ // as an unvectorized load/store?
+ //
+ // TTI and F are passed as explicit captures to work around an apparent MSVC misparse.
+ auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
+ &F = F](Align Alignment) {
+ if (Alignment.value() % SizeBytes == 0)
+ return true;
+ unsigned VectorizedSpeed = 0;
+ bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
+ F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
+ if (!AllowsMisaligned) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " is misaligned, and therefore can't be vectorized.\n");
+ return false;
+ }
+
+ unsigned ElementwiseSpeed = 0;
+ (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
+ Alignment, &ElementwiseSpeed);
+ if (VectorizedSpeed < ElementwiseSpeed) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " has relative speed " << VectorizedSpeed
+ << ", which is lower than the elementwise speed of "
+ << ElementwiseSpeed
+ << ". Therefore this access won't be vectorized.\n");
+ return false;
+ }
+ return true;
+ };
+
+ // If we're loading/storing from an alloca, align it if possible.
+ //
+ // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
+ // tells us this is beneficial. This feels a bit odd, but it matches
+ // existing tests. This isn't *so* bad, because at most we align to 4
+ // bytes (current value of StackAdjustedAlignment).
+ //
+ // FIXME: We will upgrade the alignment of the alloca even if it turns out
+ // we can't vectorize for some other reason.
+ Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
+ bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
+ isa<AllocaInst>(PtrOperand->stripPointerCasts());
+ Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
+ Align PrefAlign = Align(StackAdjustedAlignment);
+ if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
+ IsAllowedAndFast(PrefAlign)) {
+ Align NewAlign = getOrEnforceKnownAlignment(
+ PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
+ if (NewAlign >= Alignment) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: splitByChain upgrading alloca alignment from "
+ << Alignment.value() << " to " << NewAlign.value()
+ << "\n");
+ Alignment = NewAlign;
+ }
+ }
+
+ if (!IsAllowedAndFast(Alignment)) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+ "because its alignment is not AllowedAndFast: "
+ << Alignment.value() << "\n");
+ continue;
+ }
+
+ if ((IsLoadChain &&
+ !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
+ (!IsLoadChain &&
+ !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+ "because !isLegalToVectorizeLoad/StoreChain.");
+ continue;
+ }
+
+ // Hooray, we can vectorize this chain!
+ Chain &NewChain = Ret.emplace_back();
+ for (unsigned I = CBegin; I <= CEnd; ++I)
+ NewChain.push_back(C[I]);
+ CBegin = CEnd; // Skip over the instructions we've added to the chain.
+ break;
+ }
+ }
+ return Ret;
+}
+
+bool Vectorizer::vectorizeChain(Chain &C) {
+ if (C.size() < 2)
return false;
- // In case if we have to shrink the pointer
- // stripAndAccumulateInBoundsConstantOffsets should properly handle a
- // possible overflow and the value should fit into a smallest data type
- // used in the cast/gep chain.
- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
- OffsetB.getMinSignedBits() <= NewPtrBitWidth);
+ sortChainInOffsetOrder(C);
- OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
- OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
- PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+ LLVM_DEBUG({
+ dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
+ dumpChain(C);
+ });
- APInt OffsetDelta = OffsetB - OffsetA;
+ Type *VecElemTy = getChainElemTy(C);
+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+ unsigned ChainBytes = std::accumulate(
+ C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
+ return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+ });
+ assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
+ // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
+ // than 1 byte (e.g. VecTy == <32 x i1>).
+ Type *VecTy = FixedVectorType::get(
+ VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
+
+ Align Alignment = getLoadStoreAlignment(C[0].Inst);
+ // If this is a load/store of an alloca, we might have upgraded the alloca's
+ // alignment earlier. Get the new alignment.
+ if (AS == DL.getAllocaAddrSpace()) {
+ Alignment = std::max(
+ Alignment,
+ getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
+ MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
+ }
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == PtrDelta;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the pointer delta requested.
- APInt BaseDelta = PtrDelta - OffsetDelta;
-
- // Compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
- const SCEV *C = SE.getConstant(BaseDelta);
- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
- if (X == PtrSCEVB)
+ // All elements of the chain must have the same scalar-type size.
+#ifndef NDEBUG
+ for (const ChainElem &E : C)
+ assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
+ DL.getTypeStoreSize(VecElemTy));
+#endif
+
+ Instruction *VecInst;
+ if (IsLoadChain) {
+ // Loads get hoisted to the location of the first load in the chain. We may
+ // also need to hoist the (transitive) operands of the loads.
+ Builder.SetInsertPoint(
+ std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
+ return A.Inst->comesBefore(B.Inst);
+ })->Inst);
+
+ // Chain is in offset order, so C[0] is the instr with the lowest offset,
+ // i.e. the root of the vector.
+ Value *Bitcast = Builder.CreateBitCast(
+ getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));
+ VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+
+ unsigned VecIdx = 0;
+ for (const ChainElem &E : C) {
+ Instruction *I = E.Inst;
+ Value *V;
+ Type *T = getLoadStoreType(I);
+ if (auto *VT = dyn_cast<FixedVectorType>(T)) {
+ auto Mask = llvm::to_vector<8>(
+ llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
+ V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
+ VecIdx += VT->getNumElements();
+ } else {
+ V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
+ I->getName());
+ ++VecIdx;
+ }
+ if (V->getType() != I->getType())
+ V = Builder.CreateBitOrPointerCast(V, I->getType());
+ I->replaceAllUsesWith(V);
+ }
+
+ // Finally, we need to reorder the instrs in the BB so that the (transitive)
+ // operands of VecInst appear before it. To see why, suppose we have
+ // vectorized the following code:
+ //
+ // ptr1 = gep a, 1
+ // load1 = load i32 ptr1
+ // ptr0 = gep a, 0
+ // load0 = load i32 ptr0
+ //
+ // We will put the vectorized load at the location of the earliest load in
+ // the BB, i.e. load1. We get:
+ //
+ // ptr1 = gep a, 1
+ // loadv = load <2 x i32> ptr0
+ // load0 = extractelement loadv, 0
+ // load1 = extractelement loadv, 1
+ // ptr0 = gep a, 0
+ //
+ // Notice that loadv uses ptr0, which is defined *after* it!
+ reorder(VecInst);
+ } else {
+ // Stores get sunk to the location of the last store in the chain.
+ Builder.SetInsertPoint(
+ std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
+ return A.Inst->comesBefore(B.Inst);
+ })->Inst);
+
+ // Build the vector to store.
+ Value *Vec = PoisonValue::get(VecTy);
+ unsigned VecIdx = 0;
+ auto InsertElem = [&](Value *V) {
+ if (V->getType() != VecElemTy)
+ V = Builder.CreateBitOrPointerCast(V, VecElemTy);
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+ };
+ for (const ChainElem &E : C) {
+ auto I = cast<StoreInst>(E.Inst);
+ if (FixedVectorType *VT =
+ dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
+ for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
+ InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
+ Builder.getInt32(J)));
+ }
+ } else {
+ InsertElem(I->getValueOperand());
+ }
+ }
+
+ // Chain is in offset order, so C[0] is the instr with the lowest offset,
+ // i.e. the root of the vector.
+ VecInst = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),
+ VecTy->getPointerTo(AS)),
+ Alignment);
+ }
+
+ propagateMetadata(VecInst, C);
+
+ for (const ChainElem &E : C)
+ ToErase.push_back(E.Inst);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += C.size();
+ return true;
+}
+
+template <bool IsLoadChain>
+bool Vectorizer::isSafeToMove(
+ Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
+ LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
+ << *ChainBegin << ")\n");
+
+ assert(isa<LoadInst>(ChainElem) == IsLoadChain);
+ if (ChainElem == ChainBegin)
return true;
- // The above check will not catch the cases where one of the pointers is
- // factorized but the other one is not, such as (C + (S * (A + B))) vs
- // (AS + BS). Get the minus scev. That will allow re-combining the expresions
- // and getting the simplified difference.
- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
- if (C == Dist)
+ // Invariant loads can always be reordered; by definition they are not
+ // clobbered by stores.
+ if (isInvariantLoad(ChainElem))
return true;
- // Sometimes even this doesn't work, because SCEV can't always see through
- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
- // things the hard way.
- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+ auto BBIt = std::next([&] {
+ if constexpr (IsLoadChain)
+ return BasicBlock::reverse_iterator(ChainElem);
+ else
+ return BasicBlock::iterator(ChainElem);
+ }());
+ auto BBItEnd = std::next([&] {
+ if constexpr (IsLoadChain)
+ return BasicBlock::reverse_iterator(ChainBegin);
+ else
+ return BasicBlock::iterator(ChainBegin);
+ }());
+
+ const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
+ const unsigned ChainElemSize =
+ DL.getTypeStoreSize(getLoadStoreType(ChainElem));
+
+ for (; BBIt != BBItEnd; ++BBIt) {
+ Instruction *I = &*BBIt;
+
+ if (!I->mayReadOrWriteMemory())
+ continue;
+
+ // Loads can be reordered with other loads.
+ if (IsLoadChain && isa<LoadInst>(I))
+ continue;
+
+ // Stores can be sunk below invariant loads.
+ if (!IsLoadChain && isInvariantLoad(I))
+ continue;
+
+ // If I is in the chain, we can tell whether it aliases ChainElem by checking
+ // what offset ChainElem accesses. This may be better than AA is able to do.
+ //
+ // We should really only have duplicate offsets for stores (the duplicate
+ // loads should be CSE'ed), but in case we have a duplicate load, we'll
+ // split the chain so we don't have to handle this case specially.
+ if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
+ // I and ChainElem overlap if:
+ // - I and ChainElem have the same offset, OR
+ // - I's offset is less than ChainElem's, but I touches past the
+ // beginning of ChainElem, OR
+ // - ChainElem's offset is less than I's, but ChainElem touches past the
+ // beginning of I.
+ const APInt &IOffset = OffsetIt->second;
+ unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
+ if (IOffset == ChainElemOffset ||
+ (IOffset.sle(ChainElemOffset) &&
+ (IOffset + IElemSize).sgt(ChainElemOffset)) ||
+ (ChainElemOffset.sle(IOffset) &&
+ (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
+ LLVM_DEBUG({
+ // Double check that AA also sees this alias. If not, we probably
+ // have a bug.
+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+ assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
+ dbgs() << "LSV: Found alias in chain: " << *I << "\n";
+ });
+ return false; // We found an aliasing instruction; bail.
+ }
+
+ continue; // We're confident there's no alias.
+ }
+
+ LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+ if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
+ LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
+ << " Aliasing instruction:\n"
+ << " " << *I << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainElem << '\n'
+ << " " << *getLoadStorePointerOperand(ChainElem)
+ << '\n');
+
+ return false;
+ }
+ }
+ return true;
}
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
@@ -395,10 +1097,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) {
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
unsigned MatchingOpIdxA, Instruction *AddOpB,
unsigned MatchingOpIdxB, bool Signed) {
- // If both OpA and OpB is an add with NSW/NUW and with
- // one of the operands being the same, we can guarantee that the
- // transformation is safe if we can prove that OpA won't overflow when
- // IdxDiff added to the other operand of OpA.
+ LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
+ << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
+ << MatchingOpIdxA << ", AddOpB=" << *AddOpB
+ << ", MatchingOpIdxB=" << MatchingOpIdxB
+ << ", Signed=" << Signed << "\n");
+ // If both OpA and OpB are adds with NSW/NUW and with one of the operands
+ // being the same, we can guarantee that the transformation is safe if we can
+ // prove that OpA won't overflow when IdxDiff is added to the other operand of OpA.
// For example:
// %tmp7 = add nsw i32 %tmp2, %v0
// %tmp8 = sext i32 %tmp7 to i64
@@ -407,10 +1113,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
// %tmp12 = add nsw i32 %tmp2, %tmp11
// %tmp13 = sext i32 %tmp12 to i64
//
- // Both %tmp7 and %tmp2 has the nsw flag and the first operand
- // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
- // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
- // nsw flag.
+ // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
+ // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
+ // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
assert(AddOpA->getOpcode() == Instruction::Add &&
AddOpB->getOpcode() == Instruction::Add &&
checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
@@ -461,24 +1166,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
return false;
}
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
- APInt PtrDelta,
- unsigned Depth) const {
+std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
+ << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
+ << " Depth=" << Depth << "\n");
auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
if (!GEPA || !GEPB)
- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+ return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
// Look through GEPs after checking they're the same except for the last
// index.
if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
GEPA->getPointerOperand() != GEPB->getPointerOperand())
- return false;
+ return std::nullopt;
gep_type_iterator GTIA = gep_type_begin(GEPA);
gep_type_iterator GTIB = gep_type_begin(GEPB);
for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
if (GTIA.getOperand() != GTIB.getOperand())
- return false;
+ return std::nullopt;
++GTIA;
++GTIB;
}
@@ -487,23 +1194,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
- return false;
+ return std::nullopt;
- if (PtrDelta.isNegative()) {
- if (PtrDelta.isMinSignedValue())
- return false;
- PtrDelta.negate();
- std::swap(OpA, OpB);
- }
uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
- if (PtrDelta.urem(Stride) != 0)
- return false;
- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
- APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
- return false;
+ return std::nullopt;
bool Signed = isa<SExtInst>(OpA);
@@ -511,7 +1208,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
if (!OpB || ValA->getType() != OpB->getType())
- return false;
+ return std::nullopt;
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+ if (IdxDiffSCEV == SE.getCouldNotCompute())
+ return std::nullopt;
+
+ ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
+ if (!IdxDiffRange.isSingleElement())
+ return std::nullopt;
+ APInt IdxDiff = *IdxDiffRange.getSingleElement();
+
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
+ << "\n");
// Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
@@ -530,10 +1241,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
checkNoWrapFlags(OpB, Signed)) {
- // In the checks below a matching operand in OpA and OpB is
- // an operand which is the same in those two instructions.
- // Below we account for possible orders of the operands of
- // these add instructions.
+ // In the checks below a matching operand in OpA and OpB is an operand which
+ // is the same in those two instructions. Below we account for possible
+ // orders of the operands of these add instructions.
for (unsigned MatchingOpIdxA : {0, 1})
for (unsigned MatchingOpIdxB : {0, 1})
if (!Safe)
@@ -544,802 +1254,267 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Third attempt:
- // If all set bits of IdxDiff or any higher order bit other than the sign bit
- // are known to be zero in ValA, we can add Diff to it while guaranteeing no
- // overflow of any sort.
+ //
+ // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
+ // order bit other than the sign bit are known to be zero in ValA, we can add
+ // Diff to it while guaranteeing no overflow of any sort.
+ //
+ // If IdxDiff is negative, do the same, but swap ValA and ValB.
if (!Safe) {
+ // When computing known bits, use the GEPs as context instructions, since
+ // they likely are in the same BB as the load/store.
KnownBits Known(BitWidth);
- computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
+ computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
+ ContextInst, &DT);
APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
if (Signed)
BitsAllowedToBeSet.clearBit(BitWidth - 1);
- if (BitsAllowedToBeSet.ult(IdxDiff))
- return false;
+ if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
+ return std::nullopt;
+ Safe = true;
}
- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
- return X == OffsetSCEVB;
+ if (Safe)
+ return IdxDiff * Stride;
+ return std::nullopt;
}
-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
- const APInt &PtrDelta,
- unsigned Depth) const {
+std::optional<APInt> Vectorizer::getConstantOffsetSelects(
+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
if (Depth++ == MaxDepth)
- return false;
+ return std::nullopt;
if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
- return SelectA->getCondition() == SelectB->getCondition() &&
- areConsecutivePointers(SelectA->getTrueValue(),
- SelectB->getTrueValue(), PtrDelta, Depth) &&
- areConsecutivePointers(SelectA->getFalseValue(),
- SelectB->getFalseValue(), PtrDelta, Depth);
+ if (SelectA->getCondition() != SelectB->getCondition())
+ return std::nullopt;
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
+ << ", PtrB=" << *PtrB << ", ContextInst="
+ << *ContextInst << ", Depth=" << Depth << "\n");
+ std::optional<APInt> TrueDiff = getConstantOffset(
+ SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
+ if (!TrueDiff.has_value())
+ return std::nullopt;
+ std::optional<APInt> FalseDiff =
+ getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
+ ContextInst, Depth);
+ if (TrueDiff == FalseDiff)
+ return TrueDiff;
}
}
- return false;
+ return std::nullopt;
}
-void Vectorizer::reorder(Instruction *I) {
- SmallPtrSet<Instruction *, 16> InstructionsToMove;
- SmallVector<Instruction *, 16> Worklist;
-
- Worklist.push_back(I);
- while (!Worklist.empty()) {
- Instruction *IW = Worklist.pop_back_val();
- int NumOperands = IW->getNumOperands();
- for (int i = 0; i < NumOperands; i++) {
- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
- if (!IM || IM->getOpcode() == Instruction::PHI)
- continue;
-
- // If IM is in another BB, no need to move it, because this pass only
- // vectorizes instructions within one BB.
- if (IM->getParent() != I->getParent())
- continue;
-
- if (!IM->comesBefore(I)) {
- InstructionsToMove.insert(IM);
- Worklist.push_back(IM);
- }
+EquivalenceClassMap
+Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
+ BasicBlock::iterator End) {
+ EquivalenceClassMap Ret;
+
+ auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
+ const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+ // The selects themselves are distinct instructions even if they share
+ // the same condition and evaluate to consecutive pointers for the true and
+ // false values of the condition. Therefore, using the selects themselves
+ // for grouping instructions would put consecutive accesses into different
+ // lists; they would never even be checked for being consecutive, and so
+ // would not be vectorized.
+ return Sel->getCondition();
}
- }
+ return ObjPtr;
+ };
- // All instructions to move should follow I. Start from I, not from begin().
- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
- ++BBI) {
- if (!InstructionsToMove.count(&*BBI))
+ for (Instruction &I : make_range(Begin, End)) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!LI && !SI)
continue;
- Instruction *IM = &*BBI;
- --BBI;
- IM->removeFromParent();
- IM->insertBefore(I);
- }
-}
-
-std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
- Instruction *C0 = Chain[0];
- BasicBlock::iterator FirstInstr = C0->getIterator();
- BasicBlock::iterator LastInstr = C0->getIterator();
- BasicBlock *BB = C0->getParent();
- unsigned NumFound = 0;
- for (Instruction &I : *BB) {
- if (!is_contained(Chain, &I))
+ if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
continue;
- ++NumFound;
- if (NumFound == 1) {
- FirstInstr = I.getIterator();
- }
- if (NumFound == Chain.size()) {
- LastInstr = I.getIterator();
- break;
- }
- }
-
- // Range is [first, last).
- return std::make_pair(FirstInstr, ++LastInstr);
-}
-
-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
- SmallVector<Instruction *, 16> Instrs;
- for (Instruction *I : Chain) {
- Value *PtrOperand = getLoadStorePointerOperand(I);
- assert(PtrOperand && "Instruction must have a pointer operand.");
- Instrs.push_back(I);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
- Instrs.push_back(GEP);
- }
-
- // Erase instructions.
- for (Instruction *I : Instrs)
- if (I->use_empty())
- I->eraseFromParent();
-}
-
-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
- unsigned ElementSizeBits) {
- unsigned ElementSizeBytes = ElementSizeBits / 8;
- unsigned SizeBytes = ElementSizeBytes * Chain.size();
- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size()) {
- if ((NumLeft & 1) == 0)
- NumLeft /= 2; // Split even in half
- else
- --NumLeft; // Split off last element
- } else if (NumLeft == 0)
- NumLeft = 1;
- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
-}
-
-ArrayRef<Instruction *>
-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
- // These are in BB order, unlike Chain, which is in address order.
- SmallVector<Instruction *, 16> MemoryInstrs;
- SmallVector<Instruction *, 16> ChainInstrs;
-
- bool IsLoadChain = isa<LoadInst>(Chain[0]);
- LLVM_DEBUG({
- for (Instruction *I : Chain) {
- if (IsLoadChain)
- assert(isa<LoadInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- else
- assert(isa<StoreInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- }
- });
-
- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
- if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
- ChainInstrs.push_back(&I);
+ if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
+ (SI && !TTI.isLegalToVectorizeStore(SI)))
continue;
- }
- if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
- LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
- << I << '\n');
- break;
- }
- if (I.mayReadOrWriteMemory())
- MemoryInstrs.push_back(&I);
- }
-
- // Loop until we find an instruction in ChainInstrs that we can't vectorize.
- unsigned ChainInstrIdx = 0;
- Instruction *BarrierMemoryInstr = nullptr;
-
- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-
- // If a barrier memory instruction was found, chain instructions that follow
- // will not be added to the valid prefix.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
- break;
- // Check (in BB order) if any instruction prevents ChainInstr from being
- // vectorized. Find and store the first such "conflicting" instruction.
- for (Instruction *MemInstr : MemoryInstrs) {
- // If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
- break;
-
- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
- if (MemLoad && ChainLoad)
- continue;
-
- // We can ignore the alias if the we have a load store pair and the load
- // is known to be invariant. The load cannot be clobbered by the store.
- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- };
-
- if (IsLoadChain) {
- // We can ignore the alias as long as the load comes before the store,
- // because that means we won't be moving the load past the store to
- // vectorize it (the vectorized load is inserted at the location of the
- // first load in the chain).
- if (ChainInstr->comesBefore(MemInstr) ||
- (ChainLoad && IsInvariantLoad(ChainLoad)))
- continue;
- } else {
- // Same case, but in reverse.
- if (MemInstr->comesBefore(ChainInstr) ||
- (MemLoad && IsInvariantLoad(MemLoad)))
- continue;
- }
-
- ModRefInfo MR =
- AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
- if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
- LLVM_DEBUG({
- dbgs() << "LSV: Found alias:\n"
- " Aliasing instruction:\n"
- << " " << *MemInstr << '\n'
- << " Aliased instruction and pointer:\n"
- << " " << *ChainInstr << '\n'
- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
- });
- // Save this aliasing memory instruction as a barrier, but allow other
- // instructions that precede the barrier to be vectorized with this one.
- BarrierMemoryInstr = MemInstr;
- break;
- }
- }
- // Continue the search only for store chains, since vectorizing stores that
- // precede an aliasing load is valid. Conversely, vectorizing loads is valid
- // up to an aliasing store, but should not pull loads from further down in
- // the basic block.
- if (IsLoadChain && BarrierMemoryInstr) {
- // The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(BarrierMemoryInstr->comesBefore(ChainInstr));
- break;
- }
- }
-
- // Find the largest prefix of Chain whose elements are all in
- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
- // order.)
- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
- unsigned ChainIdx = 0;
- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
- break;
- }
- return Chain.slice(0, ChainIdx);
-}
-
-static ChainID getChainID(const Value *Ptr) {
- const Value *ObjPtr = getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The select's themselves are distinct instructions even if they share the
- // same condition and evaluate to consecutive pointers for true and false
- // values of the condition. Therefore using the select's themselves for
- // grouping instructions would put consecutive accesses into different lists
- // and they won't be even checked for being consecutive, and won't be
- // vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
-}
-
-std::pair<InstrListMap, InstrListMap>
-Vectorizer::collectInstructions(BasicBlock *BB) {
- InstrListMap LoadRefs;
- InstrListMap StoreRefs;
-
- for (Instruction &I : *BB) {
- if (!I.mayReadOrWriteMemory())
+ Type *Ty = getLoadStoreType(&I);
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeLoad(LI))
- continue;
-
- Type *Ty = LI->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- Value *Ptr = LI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Save the load locations.
- const ChainID ID = getChainID(Ptr);
- LoadRefs[ID].push_back(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeStore(SI))
- continue;
-
- Type *Ty = SI->getValueOperand()->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- Value *Ptr = SI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Save store location.
- const ChainID ID = getChainID(Ptr);
- StoreRefs[ID].push_back(SI);
- }
- }
-
- return {LoadRefs, StoreRefs};
-}
-
-bool Vectorizer::vectorizeChains(InstrListMap &Map) {
- bool Changed = false;
-
- for (const std::pair<ChainID, InstrList> &Chain : Map) {
- unsigned Size = Chain.second.size();
- if (Size < 2)
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
continue;
- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
-
- // Process the stores in chunks of 64.
- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
- unsigned Len = std::min<unsigned>(CE - CI, 64);
- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
- Changed |= vectorizeInstructions(Chunk);
- }
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
- << " instructions.\n");
- SmallVector<int, 16> Heads, Tails;
- int ConsecutiveChain[64];
-
- // Do a quadratic search on all of the given loads/stores and find all of the
- // pairs of loads/stores that follow each other.
- for (int i = 0, e = Instrs.size(); i < e; ++i) {
- ConsecutiveChain[i] = -1;
- for (int j = e - 1; j >= 0; --j) {
- if (i == j)
- continue;
-
- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
- if (ConsecutiveChain[i] != -1) {
- int CurDistance = std::abs(ConsecutiveChain[i] - i);
- int NewDistance = std::abs(ConsecutiveChain[i] - j);
- if (j < i || NewDistance > CurDistance)
- continue; // Should not insert.
- }
+    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+    // functions currently use an integer type for the vectorized load/store,
+    // and do not support casting between the integer type and a vector of
+    // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
- Tails.push_back(j);
- Heads.push_back(i);
- ConsecutiveChain[i] = j;
- }
- }
- }
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- bool Changed = false;
- SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
- for (int Head : Heads) {
- if (InstructionsProcessed.count(Instrs[Head]))
+ // Only handle power-of-two sized elements.
+ if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
+ (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
continue;
- bool LongerChainExists = false;
- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
- if (Head == Tails[TIt] &&
- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
- LongerChainExists = true;
- break;
- }
- if (LongerChainExists)
- continue;
-
- // We found an instr that starts a chain. Now follow the chain and try to
- // vectorize it.
- SmallVector<Instruction *, 16> Operands;
- int I = Head;
- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
- if (InstructionsProcessed.count(Instrs[I]))
- break;
-
- Operands.push_back(Instrs[I]);
- I = ConsecutiveChain[I];
- }
- bool Vectorized = false;
- if (isa<LoadInst>(*Operands.begin()))
- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
- else
- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
- Changed |= Vectorized;
+ Ret[{getUnderlyingObject(Ptr), AS,
+ DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
+ /*IsLoad=*/LI != nullptr}]
+ .push_back(&I);
}
- return Changed;
+ return Ret;
}
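As a rough standalone sketch of the grouping performed in the loop above, here is a toy version using standard containers. ToyAccess, the tuple key, and all other names are illustrative stand-ins, not the pass's own types; the point is only that accesses are bucketed by underlying object, address space, scalar element size, and load-vs-store before any pairwise comparison happens.

#include <cstdint>
#include <map>
#include <tuple>
#include <vector>

struct ToyAccess {
  const void *UnderlyingObj;   // stand-in for getUnderlyingObject()
  unsigned AddrSpace;
  unsigned ElemBits;           // scalar element size in bits
  bool IsLoad;
  int Id;                      // stands in for the Instruction*
};

using ToyEqClassKey = std::tuple<const void *, unsigned, unsigned, bool>;

static std::map<ToyEqClassKey, std::vector<int>>
collectToyEquivalenceClasses(const std::vector<ToyAccess> &Accesses) {
  std::map<ToyEqClassKey, std::vector<int>> Classes;
  for (const ToyAccess &A : Accesses)
    Classes[std::make_tuple(A.UnderlyingObj, A.AddrSpace, A.ElemBits, A.IsLoad)]
        .push_back(A.Id);
  return Classes;
}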
-bool Vectorizer::vectorizeStoreChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- StoreInst *S0 = cast<StoreInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole store.
- Type *StoreTy = nullptr;
- for (Instruction *I : Chain) {
- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
- if (StoreTy->isIntOrIntVectorTy())
- break;
-
- if (StoreTy->isPtrOrPtrVectorTy()) {
- StoreTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(StoreTy));
- break;
- }
- }
- assert(StoreTy && "Failed to find store type");
+std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
+ if (Instrs.empty())
+ return {};
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned AS = S0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = S0->getAlign();
+ unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
+ unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
+#ifndef NDEBUG
+ // Check that Instrs is in BB order and all have the same addr space.
+ for (size_t I = 1; I < Instrs.size(); ++I) {
+ assert(Instrs[I - 1]->comesBefore(Instrs[I]));
+ assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
}
+#endif
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
-
- FixedVectorType *VecTy;
- auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
- if (VecStoreTy)
- VecTy = FixedVectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
- else
- VecTy = FixedVectorType::get(StoreTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- bool Vectorized = false;
- Vectorized |=
- vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
- Vectorized |=
- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
- return Vectorized;
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Stores to vectorize:\n";
- for (Instruction *I : Chain)
- dbgs() << " " << *I << "\n";
- });
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the store is going to be misaligned, don't vectorize it.
- unsigned RelativeSpeed;
- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- unsigned SpeedBefore;
- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
- if (SpeedBefore > RelativeSpeed)
- return false;
-
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
- return Vectorized;
+ // Machinery to build an MRU-hashtable of Chains.
+ //
+ // (Ideally this could be done with MapVector, but as currently implemented,
+ // moving an element to the front of a MapVector is O(n).)
+ struct InstrListElem : ilist_node<InstrListElem>,
+ std::pair<Instruction *, Chain> {
+ explicit InstrListElem(Instruction *I)
+ : std::pair<Instruction *, Chain>(I, {}) {}
+ };
+ struct InstrListElemDenseMapInfo {
+ using PtrInfo = DenseMapInfo<InstrListElem *>;
+ using IInfo = DenseMapInfo<Instruction *>;
+ static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
+ static InstrListElem *getTombstoneKey() {
+ return PtrInfo::getTombstoneKey();
}
-
- Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, S0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
- return Vectorized;
- }
-
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*Last);
-
- Value *Vec = PoisonValue::get(VecTy);
-
- if (VecStoreTy) {
- unsigned VecWidth = VecStoreTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
- unsigned NewIdx = J + I * VecWidth;
- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
- Builder.getInt32(J));
- if (Extract->getType() != StoreTy->getScalarType())
- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
- Vec = Insert;
- }
+ static unsigned getHashValue(const InstrListElem *E) {
+ return IInfo::getHashValue(E->first);
}
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- Value *Extract = Store->getValueOperand();
- if (Extract->getType() != StoreTy->getScalarType())
- Extract =
- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
- Vec = Insert;
+ static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
+ if (A == getEmptyKey() || B == getEmptyKey())
+ return A == getEmptyKey() && B == getEmptyKey();
+ if (A == getTombstoneKey() || B == getTombstoneKey())
+ return A == getTombstoneKey() && B == getTombstoneKey();
+ return IInfo::isEqual(A->first, B->first);
}
- }
-
- StoreInst *SI = Builder.CreateAlignedStore(
- Vec,
- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
- Alignment);
- propagateMetadata(SI, Chain);
-
- eraseInstructions(Chain);
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::vectorizeLoadChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- LoadInst *L0 = cast<LoadInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole load.
- Type *LoadTy = nullptr;
- for (const auto &V : Chain) {
- LoadTy = cast<LoadInst>(V)->getType();
- if (LoadTy->isIntOrIntVectorTy())
- break;
-
- if (LoadTy->isPtrOrPtrVectorTy()) {
- LoadTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(LoadTy));
- break;
+ };
+ SpecificBumpPtrAllocator<InstrListElem> Allocator;
+ simple_ilist<InstrListElem> MRU;
+ DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
+
+  // Compare each instruction in `Instrs` to the leader of each of the N most
+  // recently used chains. This limits the O(n^2) behavior of this pass while
+  // still allowing us to build arbitrarily long chains.
+ for (Instruction *I : Instrs) {
+ constexpr int MaxChainsToTry = 64;
+
+ bool MatchFound = false;
+ auto ChainIter = MRU.begin();
+ for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
+ ++J, ++ChainIter) {
+ std::optional<APInt> Offset = getConstantOffset(
+ getLoadStorePointerOperand(ChainIter->first),
+ getLoadStorePointerOperand(I),
+ /*ContextInst=*/
+ (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
+ if (Offset.has_value()) {
+        // `Offset` might not have the expected number of bits if, e.g., the
+        // index width of the address space differs from the pointer width.
+ ChainIter->second.push_back(ChainElem{I, Offset.value()});
+ // Move ChainIter to the front of the MRU list.
+ MRU.remove(*ChainIter);
+ MRU.push_front(*ChainIter);
+ MatchFound = true;
+ break;
+ }
}
- }
- assert(LoadTy && "Can't determine LoadInst type from chain");
-
- unsigned Sz = DL.getTypeSizeInBits(LoadTy);
- unsigned AS = L0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = L0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
- VectorType *VecTy;
- auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
- if (VecLoadTy)
- VecTy = FixedVectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
- else
- VecTy = FixedVectorType::get(LoadTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- bool Vectorized = false;
- Vectorized |=
- vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
- Vectorized |=
- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
- return Vectorized;
- }
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the load is going to be misaligned, don't vectorize it.
- unsigned RelativeSpeed;
- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- unsigned SpeedBefore;
- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
- if (SpeedBefore > RelativeSpeed)
- return false;
-
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
- return Vectorized;
+ if (!MatchFound) {
+ APInt ZeroOffset(ASPtrBits, 0);
+ InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
+ E->second.push_back(ChainElem{I, ZeroOffset});
+ MRU.push_front(*E);
+ Chains.insert(E);
}
-
- Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, L0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
}
- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
- return Vectorized;
- }
+ std::vector<Chain> Ret;
+ Ret.reserve(Chains.size());
+ // Iterate over MRU rather than Chains so the order is deterministic.
+ for (auto &E : MRU)
+ if (E.second.size() > 1)
+ Ret.push_back(std::move(E.second));
+ return Ret;
+}
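A minimal standalone sketch of the most-recently-used limit described above, with toy types in place of instructions, APInt offsets, and the ilist/DenseSet machinery; all names here are invented for illustration. Each new access is compared only against the leaders of the most recently used chains, which bounds the otherwise quadratic search while still letting chains grow arbitrarily long.

#include <cstdint>
#include <list>
#include <optional>
#include <vector>

struct ToyAccess {
  int BaseId;     // stands in for the underlying object
  int64_t Addr;   // stands in for the pointer value
};

struct ToyChain {
  ToyAccess Leader;
  std::vector<int64_t> Offsets;   // member offsets relative to the leader
};

// Stand-in for getConstantOffset(): it may fail, modeled here as failing
// whenever the two accesses have different bases.
static std::optional<int64_t> toyOffset(const ToyAccess &A, const ToyAccess &B) {
  if (A.BaseId != B.BaseId)
    return std::nullopt;
  return B.Addr - A.Addr;
}

static std::vector<ToyChain>
gatherToyChains(const std::vector<ToyAccess> &Accesses,
                std::size_t MaxChainsToTry = 64) {
  std::list<ToyChain> MRU;   // most recently used chain first
  for (const ToyAccess &A : Accesses) {
    bool Matched = false;
    std::size_t Tried = 0;
    for (auto It = MRU.begin(); It != MRU.end() && Tried < MaxChainsToTry;
         ++It, ++Tried) {
      if (std::optional<int64_t> Off = toyOffset(It->Leader, A)) {
        It->Offsets.push_back(*Off);
        MRU.splice(MRU.begin(), MRU, It);   // move the hit chain to the front
        Matched = true;
        break;
      }
    }
    if (!Matched)
      MRU.push_front(ToyChain{A, {0}});   // A starts a new chain at offset 0
  }
  // (The real pass then keeps only chains with at least two members.)
  return std::vector<ToyChain>(MRU.begin(), MRU.end());
}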
- LLVM_DEBUG({
- dbgs() << "LSV: Loads to vectorize:\n";
- for (Instruction *I : Chain)
- I->dump();
- });
+std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth) {
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
+ << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
+ << ", Depth=" << Depth << "\n");
+ // We'll ultimately return a value of this bit width, even if computations
+ // happen in a different width.
+ unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(OrigBitWidth, 0);
+ APInt OffsetB(OrigBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ return std::nullopt;
- // getVectorizablePrefix already computed getBoundaryInstrs. The value of
- // Last may have changed since then, but the value of First won't have. If it
- // matters, we could compute getBoundaryInstrs only once and reuse it here.
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*First);
-
- Value *Bitcast =
- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI =
- Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
- propagateMetadata(LI, Chain);
-
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V;
- if (VecLoadTy) {
- // Extract a subvector using shufflevector.
- unsigned VecWidth = VecLoadTy->getNumElements();
- auto Mask =
- llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
- V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
- } else {
- V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- }
+ // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
+ // should properly handle a possible overflow and the value should fit into
+ // the smallest data type used in the cast/gep chain.
+ assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
+ OffsetB.getSignificantBits() <= NewPtrBitWidth);
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+ if (PtrA == PtrB)
+ return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
+
+ // Try to compute B - A.
+ const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
+ if (DistScev != SE.getCouldNotCompute()) {
+ LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
+ ConstantRange DistRange = SE.getSignedRange(DistScev);
+ if (DistRange.isSingleElement()) {
+ // Handle index width (the width of Dist) != pointer width (the width of
+ // the Offset*s at this point).
+ APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
+ return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
}
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
}
-
- // Since we might have opaque pointers we might end up using the pointer
- // operand of the first load (wrt. memory loaded) for the vector load. Since
- // this first load might not be the first in the block we potentially need to
- // reorder the pointer operand (and its operands). If we have a bitcast though
- // it might be before the load and should be the reorder start instruction.
- // "Might" because for opaque pointers the "bitcast" is just the first loads
- // pointer operand, as oppposed to something we inserted at the right position
- // ourselves.
- Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
- reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
-
- eraseInstructions(Chain);
-
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- Align Alignment, unsigned &RelativeSpeed) {
- RelativeSpeed = 0;
- if (Alignment.value() % SzInBytes == 0)
- return false;
-
- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
- SzInBytes * 8, AddressSpace,
- Alignment, &RelativeSpeed);
- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " with relative speed = " << RelativeSpeed << '\n';);
- return !Allows || !RelativeSpeed;
+ std::optional<APInt> Diff =
+ getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
+ if (Diff.has_value())
+ return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
+ .sextOrTrunc(OrigBitWidth);
+ return std::nullopt;
}
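A toy decomposition of the computation above, assuming plain 64-bit integers in place of APInt and a trivially simple stand-in for the SCEV query; the names are illustrative only. The shape is the same: strip the constant parts of both pointers first, then add whatever distance can be proven between the stripped bases.

#include <cstdint>
#include <optional>

// Each pointer is modeled as an opaque base plus a constant byte offset, i.e.
// roughly what stripAndAccumulateInBoundsConstantOffsets() peels off above.
struct ToyPtr {
  int BaseId;
  int64_t ConstOff;
};

// Stand-in for the query on the stripped bases; here it only succeeds when
// the bases are literally the same value.
static std::optional<int64_t> toyBaseDistance(const ToyPtr &A, const ToyPtr &B) {
  if (A.BaseId == B.BaseId)
    return 0;
  return std::nullopt;
}

static std::optional<int64_t> toyConstantOffset(const ToyPtr &A, const ToyPtr &B) {
  if (std::optional<int64_t> Dist = toyBaseDistance(A, B))
    return (B.ConstOff - A.ConstOff) + *Dist;
  return std::nullopt;
}

// toyConstantOffset({0, 8}, {0, 24}) == 16: B is 16 bytes past A, i.e. four
// i32 elements apart.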
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cd48c0d57eb3..f923f0be6621 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -37,6 +37,11 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
+static cl::opt<bool>
+AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
+ cl::desc("Enable recognition of non-constant strided "
+ "pointer induction variables."));
+
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -447,8 +452,12 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
- const ValueToValueMap &Strides =
- getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+ // FIXME: Currently, the set of symbolic strides is sometimes queried before
+ // it's collected. This happens from canVectorizeWithIfConvert, when the
+ // pointer is checked to reference consecutive elements suitable for a
+ // masked access.
+ const auto &Strides =
+ LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();
Function *F = TheLoop->getHeader()->getParent();
bool OptForSize = F->hasOptSize() ||
@@ -462,11 +471,135 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
return 0;
}
-bool LoopVectorizationLegality::isUniform(Value *V) const {
- return LAI->isUniform(V);
+bool LoopVectorizationLegality::isInvariant(Value *V) const {
+ return LAI->isInvariant(V);
+}
+
+namespace {
+/// A rewriter to build the SCEVs for each of the VF lanes in the expected
+/// vectorized loop, which can then be compared to detect their uniformity. This
+/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
+/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
+/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
+/// uniformity.
+class SCEVAddRecForUniformityRewriter
+ : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
+ /// Multiplier to be applied to the step of AddRecs in TheLoop.
+ unsigned StepMultiplier;
+
+ /// Offset to be added to the AddRecs in TheLoop.
+ unsigned Offset;
+
+  /// Loop for which to rewrite AddRecs.
+ Loop *TheLoop;
+
+  /// Is any sub-expression not analyzable w.r.t. uniformity?
+ bool CannotAnalyze = false;
+
+ bool canAnalyze() const { return !CannotAnalyze; }
+
+public:
+ SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
+ unsigned Offset, Loop *TheLoop)
+ : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
+ TheLoop(TheLoop) {}
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ assert(Expr->getLoop() == TheLoop &&
+ "addrec outside of TheLoop must be invariant and should have been "
+ "handled earlier");
+ // Build a new AddRec by multiplying the step by StepMultiplier and
+ // incrementing the start by Offset * step.
+ Type *Ty = Expr->getType();
+ auto *Step = Expr->getStepRecurrence(SE);
+ if (!SE.isLoopInvariant(Step, TheLoop)) {
+ CannotAnalyze = true;
+ return Expr;
+ }
+ auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
+ auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
+ auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
+ return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
+ }
+
+ const SCEV *visit(const SCEV *S) {
+ if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
+ return S;
+ return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
+ }
+
+ const SCEV *visitUnknown(const SCEVUnknown *S) {
+ if (SE.isLoopInvariant(S, TheLoop))
+ return S;
+ // The value could vary across iterations.
+ CannotAnalyze = true;
+ return S;
+ }
+
+ const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
+ // Could not analyze the expression.
+ CannotAnalyze = true;
+ return S;
+ }
+
+ static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
+ unsigned StepMultiplier, unsigned Offset,
+ Loop *TheLoop) {
+    /// Bail out if the expression does not contain a UDiv expression.
+    /// Uniform values which are not loop invariant require operations to strip
+    /// out the lowest bits. For now just look for UDivs and use this check to
+    /// avoid rewriting UDiv-free expressions for other lanes, limiting compile
+    /// time.
+ if (!SCEVExprContains(S,
+ [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
+ return SE.getCouldNotCompute();
+
+ SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
+ TheLoop);
+ const SCEV *Result = Rewriter.visit(S);
+
+ if (Rewriter.canAnalyze())
+ return Result;
+ return SE.getCouldNotCompute();
+ }
+};
+
+} // namespace
+
+bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
+ if (isInvariant(V))
+ return true;
+ if (VF.isScalable())
+ return false;
+ if (VF.isScalar())
+ return true;
+
+ // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
+ // never considered uniform.
+ auto *SE = PSE.getSE();
+ if (!SE->isSCEVable(V->getType()))
+ return false;
+ const SCEV *S = SE->getSCEV(V);
+
+ // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
+ // lane 0 matches the expressions for all other lanes.
+ unsigned FixedVF = VF.getKnownMinValue();
+ const SCEV *FirstLaneExpr =
+ SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
+ if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
+ return false;
+
+ // Make sure the expressions for lanes FixedVF-1..1 match the expression for
+  // lane 0. We check lanes in reverse order to improve compile time, as
+  // checking the last lane alone is frequently sufficient to rule out
+  // uniformity.
+ return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
+ const SCEV *IthLaneExpr =
+ SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
+ return FirstLaneExpr == IthLaneExpr;
+ });
}
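A standalone illustration (toy code, not the pass's API) of the property this check targets: a value derived from the induction variable can be identical across all lanes of a vector iteration without being loop-invariant. Purely for the toy, it assumes the vector loop steps the induction variable by VF starting at a multiple of VF.

#include <cassert>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  for (unsigned Base = 0; Base < 32; Base += VF) {   // one vector iteration
    unsigned Lane0Value = Base / VF;                 // the "uniform" value
    for (unsigned Lane = 1; Lane < VF; ++Lane)
      assert((Base + Lane) / VF == Lane0Value);      // identical in every lane
    std::printf("vector iteration at i=%u: value %u\n", Base, Lane0Value);
  }
  return 0;
}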
-bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
+bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
+ ElementCount VF) const {
Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
return false;
@@ -474,7 +607,7 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
// stores from being uniform. The current lowering simply doesn't handle
// it; in particular, the cost model distinguishes scatter/gather from
// scalar w/predication, and we currently rely on the scalar path.
- return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+ return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
@@ -700,6 +833,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
+      // We prevent matching non-constant strided pointer IVs to preserve
+ // historical vectorizer behavior after a generalization of the
+ // IVDescriptor code. The intent is to remove this check, but we
+ // have to fix issues around code quality for such loops first.
+ auto isDisallowedStridedPointerInduction =
+ [](const InductionDescriptor &ID) {
+ if (AllowStridedPointerIVs)
+ return false;
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ ID.getConstIntStepValue() == nullptr;
+ };
+
// TODO: Instead of recording the AllowedExit, it would be good to
// record the complementary set: NotAllowedExit. These include (but may
// not be limited to):
@@ -715,14 +860,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// By recording these, we can then reason about ways to vectorize each
// of these NotAllowedExit.
InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
+ !isDisallowedStridedPointerInduction(ID)) {
addInductionPhi(Phi, ID, AllowedExit);
Requirements->addExactFPMathInst(ID.getExactFPMathInst());
continue;
}
- if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
+ if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
AllowedExit.insert(Phi);
FixedOrderRecurrences.insert(Phi);
continue;
@@ -730,7 +875,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// As a last resort, coerce the PHI to a AddRec expression
// and re-try classifying it a an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
+ !isDisallowedStridedPointerInduction(ID)) {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
@@ -894,18 +1040,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
- // For fixed order recurrences, we use the previous value (incoming value from
- // the latch) to check if it dominates all users of the recurrence. Bail out
- // if we have to sink such an instruction for another recurrence, as the
- // dominance requirement may not hold after sinking.
- BasicBlock *LoopLatch = TheLoop->getLoopLatch();
- if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
- Instruction *V =
- cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
- return SinkAfter.find(V) != SinkAfter.end();
- }))
- return false;
-
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.
@@ -1124,6 +1258,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (isa<NoAliasScopeDeclInst>(&I))
continue;
+ // We can allow masked calls if there's at least one vector variant, even
+ // if we end up scalarizing due to the cost model calculations.
+ // TODO: Allow other calls if they have appropriate attributes... readonly
+ // and argmemonly?
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (VFDatabase::hasMaskedVariant(*CI)) {
+ MaskedOp.insert(CI);
+ continue;
+ }
+
// Loads are handled via masking (or speculated if safe to do so.)
if (auto *LI = dyn_cast<LoadInst>(&I)) {
if (!SafePtrs.count(LI->getPointerOperand()))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8990a65afdb4..13357cb06c55 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -25,6 +25,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
#include "VPlan.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/InstructionCost.h"
namespace llvm {
@@ -217,6 +218,16 @@ struct VectorizationFactor {
}
};
+/// ElementCountComparator creates a total ordering for ElementCount
+/// for the purposes of using it in a set structure.
+struct ElementCountComparator {
+ bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
+ return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+ std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
+ }
+};
+using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+
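A small standalone check (toy struct, not llvm::ElementCount) of the ordering the comparator above defines: fixed-width factors sort before scalable ones, and within each group they sort by the known minimum element count.

#include <algorithm>
#include <cassert>
#include <tuple>
#include <vector>

struct ToyEC {
  bool Scalable;
  unsigned MinVal;
};

int main() {
  std::vector<ToyEC> VFs = {{true, 4}, {false, 8}, {false, 2}, {true, 2}};
  std::sort(VFs.begin(), VFs.end(), [](const ToyEC &L, const ToyEC &R) {
    return std::make_tuple(L.Scalable, L.MinVal) <
           std::make_tuple(R.Scalable, R.MinVal);
  });
  // Sorted order: fixed 2, fixed 8, scalable 2, scalable 4.
  assert(!VFs[0].Scalable && VFs[0].MinVal == 2);
  assert(!VFs[1].Scalable && VFs[1].MinVal == 8);
  assert(VFs[2].Scalable && VFs[2].MinVal == 2);
  assert(VFs[3].Scalable && VFs[3].MinVal == 4);
  return 0;
}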
/// A class that represents two vectorization factors (initialized with 0 by
/// default). One for fixed-width vectorization and one for scalable
/// vectorization. This can be used by the vectorizer to choose from a range of
@@ -261,7 +272,7 @@ class LoopVectorizationPlanner {
const TargetLibraryInfo *TLI;
/// Target Transform Info.
- const TargetTransformInfo *TTI;
+ const TargetTransformInfo &TTI;
/// The legality analysis.
LoopVectorizationLegality *Legal;
@@ -280,12 +291,15 @@ class LoopVectorizationPlanner {
SmallVector<VPlanPtr, 4> VPlans;
+ /// Profitable vector factors.
+ SmallVector<VectorizationFactor, 8> ProfitableVFs;
+
/// A builder used to construct the current plan.
VPBuilder Builder;
public:
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
+ const TargetTransformInfo &TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
InterleavedAccessInfo &IAI,
@@ -311,16 +325,22 @@ public:
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
/// vectorization re-using plans for both the main and epilogue vector loops.
/// It should be removed once the re-use issue has been fixed.
- void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
- InnerLoopVectorizer &LB, DominatorTree *DT,
- bool IsEpilogueVectorization);
+  /// \p ExpandedSCEVs is passed during execution of the plan for the epilogue
+  /// loop to re-use expansion results generated during main plan execution. Returns
+ /// a mapping of SCEVs to their expanded IR values. Note that this is a
+ /// temporary workaround needed due to the current epilogue handling.
+ DenseMap<const SCEV *, Value *>
+ executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
+ InnerLoopVectorizer &LB, DominatorTree *DT,
+ bool IsEpilogueVectorization,
+ DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
#endif
- /// Look through the existing plans and return true if we have one with all
- /// the vectorization factors in question.
+ /// Look through the existing plans and return true if we have one with
+ /// vectorization factor \p VF.
bool hasPlanWithVF(ElementCount VF) const {
return any_of(VPlans,
[&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
@@ -333,8 +353,11 @@ public:
getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
VFRange &Range);
- /// Check if the number of runtime checks exceeds the threshold.
- bool requiresTooManyRuntimeChecks() const;
+ /// \return The most profitable vectorization factor and the cost of that VF
+ /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if
+ /// epilogue vectorization is not supported for the loop.
+ VectorizationFactor
+ selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
@@ -350,9 +373,12 @@ private:
/// Build a VPlan using VPRecipes according to the information gather by
/// Legal. This method is only used for the legacy inner loop vectorizer.
- VPlanPtr buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const MapVector<Instruction *, Instruction *> &SinkAfter);
+ /// \p Range's largest included VF is restricted to the maximum VF the
+ /// returned VPlan is valid for. If no VPlan can be built for the input range,
+ /// set the largest included VF to the maximum VF for which no plan could be
+ /// built.
+ std::optional<VPlanPtr> tryToBuildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
@@ -367,6 +393,20 @@ private:
void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan,
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
+
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every VF in \p CandidateVFs.
+ VectorizationFactor
+ selectVectorizationFactor(const ElementCountSet &CandidateVFs);
+
+ /// Returns true if the per-lane cost of VectorizationFactor A is lower than
+ /// that of B.
+ bool isMoreProfitable(const VectorizationFactor &A,
+ const VectorizationFactor &B) const;
+
+ /// Determines if we have the infrastructure to vectorize the loop and its
+ /// epilogue, assuming the main loop is vectorized by \p VF.
+ bool isCandidateForEpilogueVectorization(const ElementCount VF) const;
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a28099d8ba7d..d7e40e8ef978 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -98,6 +98,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
@@ -120,8 +121,6 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -231,6 +230,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefers tail-folding, don't attempt vectorization if "
"tail-folding fails.")));
+static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
+ "force-tail-folding-style", cl::desc("Force the tail folding style"),
+ cl::init(TailFoldingStyle::None),
+ cl::values(
+ clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
+ clEnumValN(
+ TailFoldingStyle::Data, "data",
+ "Create lane mask for data only, using active.lane.mask intrinsic"),
+ clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
+ "data-without-lane-mask",
+ "Create lane mask with compare/stepvector"),
+ clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
+ "Create lane mask using active.lane.mask intrinsic, and use "
+ "it for both data and control flow"),
+ clEnumValN(
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check")));
+
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -338,10 +356,12 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
+namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
- "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ "enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
+}
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
@@ -419,9 +439,42 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
return std::nullopt;
}
+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+ const Twine &Name) {
+ unsigned Factor = Vals.size();
+ assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+ VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+ for (Value *Val : Vals)
+ assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
+  // must use intrinsics to interleave.
+ if (VecTy->isScalableTy()) {
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ return Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
+ /*FMFSource=*/nullptr, Name);
+ }
+
+ // Fixed length. Start by concatenating all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, Vals);
+
+ // Interleave the elements into the wide vector.
+ const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+ return Builder.CreateShuffleVector(
+ WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
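A standalone sketch of the fixed-length path above: concatenate the inputs into one wide vector, then pick elements with an interleave mask laid out like the one llvm::createInterleaveMask produces (e.g. 0,4,1,5,2,6,3,7 for two 4-element vectors). Everything else here is a toy stand-in, not the vectorizer's API.

#include <cstddef>
#include <cstdio>
#include <vector>

static std::vector<int>
interleaveToy(const std::vector<std::vector<int>> &Vals) {
  const std::size_t Factor = Vals.size();
  const std::size_t NumElts = Vals[0].size();

  // "Concatenate" the inputs into one wide vector.
  std::vector<int> Wide;
  for (const std::vector<int> &V : Vals)
    Wide.insert(Wide.end(), V.begin(), V.end());

  // Apply the interleave mask: result element (I * Factor + J) is taken from
  // input J at position I, i.e. wide element (J * NumElts + I).
  std::vector<int> Result(Factor * NumElts);
  for (std::size_t I = 0; I < NumElts; ++I)
    for (std::size_t J = 0; J < Factor; ++J)
      Result[I * Factor + J] = Wide[J * NumElts + I];
  return Result;
}

int main() {
  for (int X : interleaveToy({{0, 1, 2, 3}, {10, 11, 12, 13}}))
    std::printf("%d ", X);   // prints: 0 10 1 11 2 12 3 13
  std::printf("\n");
  return 0;
}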
+
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
+
+using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace
namespace llvm {
@@ -477,8 +530,10 @@ public:
/// loop and the start value for the canonical induction, if it is != 0. The
/// latter is the case when vectorizing the epilogue loop. In the case of
/// epilogue vectorization, this function is overriden to handle the more
- /// complex control flow around the loops.
- virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
+ /// complex control flow around the loops. \p ExpandedSCEVs is used to
+ /// look up SCEV expansions for expressions needed during skeleton creation.
+ virtual std::pair<BasicBlock *, Value *>
+ createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -498,7 +553,7 @@ public:
/// Instr's operands.
void scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
- const VPIteration &Instance, bool IfPredicateInstr,
+ const VPIteration &Instance,
VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
@@ -513,7 +568,7 @@ public:
ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr,
ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask = nullptr);
+ VPValue *BlockInMask, bool NeedsMaskForGaps);
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
@@ -522,28 +577,30 @@ public:
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
- /// this is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- virtual Value *getBroadcastInstrs(Value *V);
-
// Returns the resume value (bc.merge.rdx) for a reduction as
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
- /// left off. In cases where the loop skeleton is more complicated (eg.
- /// epilogue vectorization) and the resume values can come from an additional
- /// bypass block, the \p AdditionalBypass pair provides information about the
- /// bypass block and the end value on the edge from bypass to this loop.
+ /// left off. \p Step is the SCEV-expanded induction step to use. In cases
+ /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
+ /// and the resume values can come from an additional bypass block, the \p
+ /// AdditionalBypass pair provides information about the bypass block and the
+ /// end value on the edge from bypass to this loop.
PHINode *createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &ID,
+ PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// Returns the original loop trip count.
+ Value *getTripCount() const { return TripCount; }
+
+ /// Used to set the trip count after ILV's construction and after the
+ /// preheader block has been executed. Note that this always holds the trip
+ /// count of the original loop for both main loop and epilogue vectorization.
+ void setTripCount(Value *TC) { TripCount = TC; }
+
protected:
friend class LoopVectorizationPlanner;
@@ -560,7 +617,7 @@ protected:
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
- VPlan &Plan);
+ VPlan &Plan, VPTransformState &State);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
@@ -573,10 +630,6 @@ protected:
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
- /// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
- VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -585,9 +638,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(BasicBlock *InsertBlock);
-
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
@@ -621,6 +671,7 @@ protected:
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
+ const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -758,9 +809,6 @@ public:
ElementCount::getFixed(1),
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
BFI, PSI, Check) {}
-
-private:
- Value *getBroadcastInstrs(Value *V) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -810,15 +858,16 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
- return createEpilogueVectorizedLoopSkeleton();
+ std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) final {
+ return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
virtual std::pair<BasicBlock *, Value *>
- createEpilogueVectorizedLoopSkeleton() = 0;
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -846,7 +895,8 @@ public:
EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -876,7 +926,8 @@ public:
}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
/// Emits an iteration count bypass check after the main vector loop has
@@ -953,35 +1004,21 @@ namespace llvm {
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
- Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
- return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+ return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
- Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
- return VF.isScalable() ? B.CreateVScale(EC) : EC;
+ return B.CreateElementCount(Ty, VF);
}
-const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
+const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
+ Loop *OrigLoop) {
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- return SE.getAddExpr(BackedgeTakenCount,
- SE.getOne(BackedgeTakenCount->getType()));
+ return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
@@ -1062,11 +1099,17 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
continue;
// This recipe contributes to the address computation of a widen
- // load/store. Collect recipe if its underlying instruction has
- // poison-generating flags.
- Instruction *Instr = CurRec->getUnderlyingInstr();
- if (Instr && Instr->hasPoisonGeneratingFlags())
- State.MayGeneratePoisonRecipes.insert(CurRec);
+ // load/store. If the underlying instruction has poison-generating flags,
+ // drop them directly.
+ if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
+ RecWithFlags->dropPoisonGeneratingFlags();
+ } else {
+ Instruction *Instr = CurRec->getUnderlyingInstr();
+ (void)Instr;
+ assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
+ "found instruction with poison generating flags not covered by "
+ "VPRecipeWithIRFlags");
+ }
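A hedged example of the kind of flag this drops, matching the rationale in the comment removed from scalarizeInstruction later in this patch (names and IR are illustrative only):

  // A recipe computing  %gep = getelementptr inbounds i32, ptr %base, i64 %i
  // that feeds the address of a widened, now-unconditional load would keep
  // `inbounds` poison semantics for lanes the original branch never executed;
  // dropPoisonGeneratingFlags() rewrites it to a plain getelementptr (likewise
  // nuw/nsw/exact are stripped from arithmetic feeding such addresses).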
// Add new definitions to the worklist.
for (VPValue *operand : CurRec->operands())
@@ -1143,15 +1186,7 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};
-/// ElementCountComparator creates a total ordering for ElementCount
-/// for the purposes of using it in a set structure.
-struct ElementCountComparator {
- bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
- return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
- std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
- }
-};
-using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
@@ -1184,17 +1219,6 @@ public:
/// otherwise.
bool runtimeChecksRequired();
- /// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
- /// then this vectorization factor will be selected if vectorization is
- /// possible.
- VectorizationFactor
- selectVectorizationFactor(const ElementCountSet &CandidateVFs);
-
- VectorizationFactor
- selectEpilogueVectorizationFactor(const ElementCount MaxVF,
- const LoopVectorizationPlanner &LVP);
-
/// Setup cost-based decisions for user vectorization factor.
/// \return true if the UserVF is a feasible VF to be chosen.
bool selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1278,11 +1302,17 @@ public:
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
- return Scalars->second.find(I) != Scalars->second.end();
+ return Scalars->second.contains(I);
}
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+    // A pseudo probe needs to be duplicated for each unrolled iteration and
+    // vector lane so that the profiled loop trip count can be accumulated
+    // accurately instead of being undercounted.
+ if (isa<PseudoProbeInst>(I))
+ return false;
+
if (VF.isScalar())
return true;
@@ -1316,7 +1346,7 @@ public:
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
- return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
+ return VF.isVector() && MinBWs.contains(I) &&
!isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
@@ -1379,7 +1409,7 @@ public:
InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ assert(WideningDecisions.contains(InstOnVF) &&
"The cost is not calculated");
return WideningDecisions[InstOnVF].second;
}
@@ -1419,7 +1449,7 @@ public:
/// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
- if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
+ if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
@@ -1442,8 +1472,7 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V,
- ElementCount VF = ElementCount::getFixed(1)) {
+ bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
@@ -1522,14 +1551,29 @@ public:
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue(ElementCount VF) const {
+ bool requiresScalarEpilogue(bool IsVectorizing) const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
- return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
+ return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
+ }
+
+ /// Returns true if we're required to use a scalar epilogue for at least
+ /// the final iteration of the original loop for all VFs in \p Range.
+ /// A scalar epilogue must either be required for all VFs in \p Range or for
+ /// none.
+ bool requiresScalarEpilogue(VFRange Range) const {
+ auto RequiresScalarEpilogue = [this](ElementCount VF) {
+ return requiresScalarEpilogue(VF.isVector());
+ };
+ bool IsRequired = all_of(Range, RequiresScalarEpilogue);
+ assert(
+ (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
+ "all VFs in range must agree on whether a scalar epilogue is required");
+ return IsRequired;
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -1538,14 +1582,21 @@ public:
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}
- /// Returns true if all loop blocks should be masked to fold tail loop.
- bool foldTailByMasking() const { return FoldTailByMasking; }
+ /// Returns the TailFoldingStyle that is best for the current loop.
+ TailFoldingStyle
+ getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ if (!CanFoldTailByMasking)
+ return TailFoldingStyle::None;
+
+ if (ForceTailFoldingStyle.getNumOccurrences())
+ return ForceTailFoldingStyle;
+
+ return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+ }
- /// Returns true if were tail-folding and want to use the active lane mask
- /// for vector loop control flow.
- bool useActiveLaneMaskForControlFlow() const {
- return FoldTailByMasking &&
- TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const {
+ return getTailFoldingStyle() != TailFoldingStyle::None;
}
/// Returns true if the instructions in this block requires predication
@@ -1582,12 +1633,8 @@ public:
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize) const;
-
- /// Returns true if the per-lane cost of VectorizationFactor A is lower than
- /// that of B.
- bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B) const;
+ Function **Variant,
+ bool *NeedsMask = nullptr) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1596,10 +1643,29 @@ public:
Scalars.clear();
}
- /// Convenience function that returns the value of vscale_range iff
- /// vscale_range.min == vscale_range.max or otherwise returns the value
- /// returned by the corresponding TLI method.
- std::optional<unsigned> getVScaleForTuning() const;
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
+ using VectorizationCostTy = std::pair<InstructionCost, bool>;
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width. If \p Invalid is not nullptr, this function
+ /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+ /// each instruction that has an Invalid cost for the given VF.
+ VectorizationCostTy
+ expectedCost(ElementCount VF,
+ SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
+
+ bool hasPredStores() const { return NumPredStores > 0; }
+
+ /// Returns true if epilogue vectorization is considered profitable, and
+ /// false otherwise.
+ /// \p VF is the vectorization factor chosen for the original loop.
+ bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
private:
unsigned NumPredStores = 0;
@@ -1626,24 +1692,6 @@ private:
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width. If \p Invalid is not nullptr, this function
- /// will add a pair(Instruction*, ElementCount) to \p Invalid for
- /// each instruction that has an Invalid cost for the given VF.
- using InstructionVFPair = std::pair<Instruction *, ElementCount>;
- VectorizationCostTy
- expectedCost(ElementCount VF,
- SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
-
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
@@ -1715,7 +1763,7 @@ private:
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
/// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool FoldTailByMasking = false;
+ bool CanFoldTailByMasking = false;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -1796,8 +1844,7 @@ private:
// the scalars are collected. That should be a safe assumption in most
// cases, because we check if the operands have vectorizable types
// beforehand in LoopVectorizationLegality.
- return Scalars.find(VF) == Scalars.end() ||
- !isScalarAfterVectorization(I, VF);
+ return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
};
/// Returns a range containing only operands needing to be extracted.
@@ -1807,16 +1854,6 @@ private:
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
- /// Determines if we have the infrastructure to vectorize loop \p L and its
- /// epilogue, assuming the main loop is vectorized by \p VF.
- bool isCandidateForEpilogueVectorization(const Loop &L,
- const ElementCount VF) const;
-
- /// Returns true if epilogue vectorization is considered profitable, and
- /// false otherwise.
- /// \p VF is the vectorization factor chosen for the original loop.
- bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
-
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -1862,9 +1899,6 @@ public:
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
-
- /// Profitable vector factors.
- SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm
@@ -2135,6 +2169,17 @@ public:
};
} // namespace
+static bool useActiveLaneMask(TailFoldingStyle Style) {
+ return Style == TailFoldingStyle::Data ||
+ Style == TailFoldingStyle::DataAndControlFlow ||
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
+static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
+ return Style == TailFoldingStyle::DataAndControlFlow ||
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
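As a quick reference, the mapping these two predicates encode, derived directly from the code above (every other TailFoldingStyle value returns false from both):

  // Style                                     useActiveLaneMask  ...ForControlFlow
  // None                                      false              false
  // Data                                      true               false
  // DataAndControlFlow                        true               true
  // DataAndControlFlowWithoutRuntimeCheck     true               true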
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2202,97 +2247,11 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
collectSupportedLoops(*InnerL, LI, ORE, V);
}
-namespace {
-
-/// The LoopVectorize Pass.
-struct LoopVectorize : public FunctionPass {
- /// Pass identification, replacement for typeid
- static char ID;
-
- LoopVectorizePass Impl;
-
- explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
- bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID),
- Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
- initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- return Impl
- .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
- .MadeAnyChange;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
-Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop,
- // but only if it's proven safe to do so. Else, broadcast will be inside
- // vector loop body.
- Instruction *Instr = dyn_cast<Instruction>(V);
- bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
- (!Instr ||
- DT->dominates(Instr->getParent(), LoopVectorPreHeader));
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
-
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
-}
-
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
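A plain-integer model of the documented sequence may help; this is a hypothetical stand-in using scalar ints rather than IR values:

  // Hypothetical scalar model of the step sequence described above.
  void stepVectorModel(int *Val, unsigned VF, int StartIdx, int Step) {
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Val[Lane] += (StartIdx + Lane) * Step;
    // e.g. VF = 4, StartIdx = 0, Step = 2: <a,a,a,a> becomes <a, a+2, a+4, a+6>.
  }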
@@ -2435,21 +2394,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
}
}
-// Generate code for the induction step. Note that induction steps are
-// required to be loop-invariant
-static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
- Instruction *InsertBefore,
- Loop *OrigLoop = nullptr) {
- const DataLayout &DL = SE.getDataLayout();
- assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
- "Induction step should be loop invariant");
- if (auto *E = dyn_cast<SCEVUnknown>(Step))
- return E->getValue();
-
- SCEVExpander Exp(SE, DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
-}
-
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
@@ -2514,9 +2458,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
- assert(isa<Constant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
+ return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
}
case InductionDescriptor::IK_FpInduction: {
assert(!isa<VectorType>(Index->getType()) &&
@@ -2538,6 +2480,50 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
llvm_unreachable("invalid enum");
}
+std::optional<unsigned> getMaxVScale(const Function &F,
+ const TargetTransformInfo &TTI) {
+ if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
+ return MaxVScale;
+
+ if (F.hasFnAttribute(Attribute::VScaleRange))
+ return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
+
+ return std::nullopt;
+}
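A brief illustration of the fallback path (the attribute value is assumed, not taken from the source):

  // A function declared with  vscale_range(1,16)  on a target whose TTI does
  // not report a maximum makes getMaxVScale() return 16, the value of
  // getVScaleRangeMax() on the vscale_range attribute.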
+
+/// For the given VF and UF and maximum trip count computed for the loop, return
+/// whether the induction variable might overflow in the vectorized loop. If not,
+/// then we know a runtime overflow check always evaluates to false and can be
+/// removed.
+static bool isIndvarOverflowCheckKnownFalse(
+ const LoopVectorizationCostModel *Cost,
+ ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
+ // Always be conservative if we don't know the exact unroll factor.
+ unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
+
+ Type *IdxTy = Cost->Legal->getWidestInductionType();
+ APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
+
+  // The runtime overflow check is known to be false iff the (max) trip-count
+ // is known and (max) trip-count + (VF * UF) does not overflow in the type of
+ // the vector loop induction variable.
+ if (unsigned TC =
+ Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
+ uint64_t MaxVF = VF.getKnownMinValue();
+ if (VF.isScalable()) {
+ std::optional<unsigned> MaxVScale =
+ getMaxVScale(*Cost->TheFunction, Cost->TTI);
+ if (!MaxVScale)
+ return false;
+ MaxVF *= *MaxVScale;
+ }
+
+ return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
+ }
+
+ return false;
+}
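To make the arithmetic concrete, a small self-contained sketch with made-up numbers (i32 induction type, a known maximum trip count of 1000, and VF * UF = 64):

  #include <cstdint>
  // Illustrative only; the real values come from SCEV and the cost model.
  constexpr uint64_t MaxUIntTripCount = 0xFFFFFFFFull; // mask of i32, 2^32 - 1
  constexpr uint64_t TC = 1000, MaxVFxUF = 64;
  static_assert(MaxUIntTripCount - TC > MaxVFxUF,
                "TC + VF * UF cannot wrap the induction type, so the runtime "
                "overflow check is known false");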
+
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
const VPIteration &Instance,
VPTransformState &State) {
@@ -2591,14 +2577,13 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
void InnerLoopVectorizer::vectorizeInterleaveGroup(
const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask) {
+ VPValue *BlockInMask, bool NeedsMaskForGaps) {
Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
// Prepare for the new pointers.
@@ -2609,14 +2594,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");
+ Value *Idx;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
- if (Group->isReverse())
- Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
+ if (Group->isReverse()) {
+ Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+ Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
+ Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
+ Idx = Builder.CreateNeg(Idx);
+ } else
+ Idx = Builder.getInt32(-Index);
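A worked instance of the reversed-group index with hypothetical fixed values (for scalable vectors RuntimeVF is a vscale-based runtime quantity instead):

  constexpr int RuntimeVF = 4, Factor = 2, MemberIndex = 0; // assumed values
  constexpr int Idx = -(((RuntimeVF - 1) * Factor) + MemberIndex);
  static_assert(Idx == -6, "the per-part pointer steps back 6 elements so the "
                           "wide access starts at the last interleave group");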
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
@@ -2637,8 +2629,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
- cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
+ AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
// Cast to the vector pointer type.
unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
@@ -2649,14 +2640,43 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
State.setDebugLocFromInst(Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
- Value *MaskForGaps = nullptr;
- if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
- MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
+ auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
+ unsigned Part, Value *MaskForGaps) -> Value * {
+ if (VF.isScalable()) {
+ assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
+ assert(InterleaveFactor == 2 &&
+ "Unsupported deinterleave factor for scalable vectors");
+ auto *BlockInMaskPart = State.get(BlockInMask, Part);
+ SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
+ auto *MaskTy =
+ VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
+ return Builder.CreateIntrinsic(
+ MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
+ /*FMFSource=*/nullptr, "interleaved.mask");
+ }
+
+ if (!BlockInMask)
+ return MaskForGaps;
+
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
+ return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ };
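To visualize the fixed-width branch of the lambda, a stand-alone model of the replicated mask shape (lane count and factor are assumed; the scalable branch instead widens the mask with the vector_interleave2 intrinsic as shown above):

  #include <vector>
  // Hypothetical model of createReplicatedMask(/*InterleaveFactor=*/2, /*VF=*/4).
  std::vector<int> replicatedMaskModel() {
    std::vector<int> Mask; // becomes {0, 0, 1, 1, 2, 2, 3, 3}
    for (int Lane = 0; Lane < 4; ++Lane)
      for (int Rep = 0; Rep < 2; ++Rep)
        Mask.push_back(Lane);
    return Mask; // a block mask <m0,m1,m2,m3> is widened to <m0,m0,m1,m1,m2,m2,m3,m3>
  }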
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
+ Value *MaskForGaps = nullptr;
+ if (NeedsMaskForGaps) {
+ MaskForGaps =
+ createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2664,18 +2684,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (BlockInMask || MaskForGaps) {
assert(useMaskedInterleavedAccesses(*TTI) &&
"masked interleaved groups are not allowed.");
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- GroupMask = MaskForGaps
- ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- }
+ Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
NewLoad =
Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
GroupMask, PoisonVec, "wide.masked.vec");
@@ -2687,6 +2696,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
NewLoads.push_back(NewLoad);
}
+ if (VecTy->isScalableTy()) {
+ assert(InterleaveFactor == 2 &&
+ "Unsupported deinterleave factor for scalable vectors");
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+ /*FMFSource=*/nullptr, "strided.vec");
+ unsigned J = 0;
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ if (!Member)
+ continue;
+
+ Value *StridedVec = Builder.CreateExtractValue(DI, I);
+ // If this member has different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+ }
+
+ if (Group->isReverse())
+ StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+
+ State.set(VPDefs[J], StridedVec, Part);
+ ++J;
+ }
+ }
+
+ return;
+ }
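The scalable load path above emits IR of roughly the following shape for factor 2 and i32 elements (sketched as comments; exact intrinsic mangling may differ):

  // %wide    = the single wide load of the whole group, <vscale x 8 x i32>
  // %strided = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
  //              @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
  // member 0 = extractvalue %strided, 0   ; even lanes
  // member 1 = extractvalue %strided, 1   ; odd lanes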
+
// For each member in the group, shuffle out the appropriate data from the
// wide loads.
unsigned J = 0;
@@ -2724,7 +2768,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
- MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ Value *MaskForGaps =
+ createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
"masked interleaved groups are not allowed.");
assert((!MaskForGaps || !VF.isScalable()) &&
@@ -2759,27 +2804,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
StoredVecs.push_back(StoredVec);
}
- // Concatenate all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
- // Interleave the elements in the wide vector.
- Value *IVec = Builder.CreateShuffleVector(
- WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
- "interleaved.vec");
-
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
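For the store path, the lane order produced by the interleave matches what the removed explicit shuffle produced; a hedged sketch for two fixed 4-lane operands A and B:

  // interleaveVectors({A, B}) lays the lanes out as <A0,B0,A1,B1,A2,B2,A3,B3>,
  // the same order as the old shuffle of the concatenation with
  // createInterleaveMask(4, 2) == <0,4,1,5,2,6,3,7>; for scalable vectors an
  // interleave intrinsic is used instead of a shufflevector.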
Instruction *NewStoreInstr;
if (BlockInMask || MaskForGaps) {
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
- ShuffledMask, MaskForGaps)
- : ShuffledMask;
- }
+ Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
Group->getAlign(), GroupMask);
} else
@@ -2793,7 +2822,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
- bool IfPredicateInstr,
VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
@@ -2810,14 +2838,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
- // If the scalarized instruction contributes to the address computation of a
- // widen masked load/store which was in a basic block that needed predication
- // and is not predicated after vectorization, we can't propagate
- // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
- // instruction could feed a poison value to the base address of the widen
- // load/store.
- if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
- Cloned->dropPoisonGeneratingFlags();
+ RepRecipe->setFlags(Cloned);
if (Instr->getDebugLoc())
State.setDebugLocFromInst(Instr);
@@ -2843,45 +2864,17 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
AC->registerAssumption(II);
// End if-block.
+ bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
-Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
- if (TripCount)
- return TripCount;
-
- assert(InsertBlock);
- IRBuilder<> Builder(InsertBlock->getTerminator());
- // Find the loop boundaries.
- Type *IdxTy = Legal->getWidestInductionType();
- assert(IdxTy && "No type for induction");
- const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
-
- const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-
- // Count holds the overall loop count (N).
- TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- InsertBlock->getTerminator());
-
- if (TripCount->getType()->isPointerTy())
- TripCount =
- CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- InsertBlock->getTerminator());
-
- return TripCount;
-}
-
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;
- Value *TC = getOrCreateTripCount(InsertBlock);
+ Value *TC = getTripCount();
IRBuilder<> Builder(InsertBlock->getTerminator());
Type *Ty = TC->getType();
@@ -2917,7 +2910,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
// the step does not evenly divide the trip count, no adjustment is necessary
// since there will already be scalar iterations. Note that the minimum
// iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF)) {
+ if (Cost->requiresScalarEpilogue(VF.isVector())) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -2930,10 +2923,10 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
- auto *DstFVTy = cast<FixedVectorType>(DstVTy);
- unsigned VF = DstFVTy->getNumElements();
- auto *SrcVecTy = cast<FixedVectorType>(V->getType());
- assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ auto *DstFVTy = cast<VectorType>(DstVTy);
+ auto VF = DstFVTy->getElementCount();
+ auto *SrcVecTy = cast<VectorType>(V->getType());
+ assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstFVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
@@ -2953,13 +2946,13 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
"Only one type should be a floating point type");
Type *IntTy =
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = FixedVectorType::get(IntTy, VF);
+ auto *VecIntTy = VectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -2970,8 +2963,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
@@ -2989,10 +2982,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
};
- if (!Cost->foldTailByMasking())
+ TailFoldingStyle Style = Cost->getTailFoldingStyle();
+ if (Style == TailFoldingStyle::None)
CheckMinIters =
Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
- else if (VF.isScalable()) {
+ else if (VF.isScalable() &&
+ !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
+ Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
// additional overflow check is required before entering the vector loop.
@@ -3017,7 +3013,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3044,7 +3040,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3097,7 +3093,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -3117,17 +3113,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
// branch from the middle block to the loop scalar preheader, and the
// exit block. completeLoopSkeleton will update the condition to use an
// iteration check, if required to decide whether to execute the remainder.
- BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
- BranchInst::Create(LoopScalarPreHeader) :
- BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
- Builder.getTrue());
+ BranchInst *BrInst =
+ Cost->requiresScalarEpilogue(VF.isVector())
+ ? BranchInst::Create(LoopScalarPreHeader)
+ : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
+ Builder.getTrue());
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
// Update dominator for loop exit. During skeleton creation, only the vector
// pre-header and the middle block are created. The vector loop is entirely
// created during VPlan execution.
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3135,7 +3132,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
}
PHINode *InnerLoopVectorizer::createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &II,
+ PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
@@ -3154,8 +3151,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
EndValue =
emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
EndValue->setName("ind.end");
@@ -3163,8 +3158,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
EndValueFromAdditionalBypass = emitTransformedIndex(
B, AdditionalBypass.second, II.getStartValue(), Step, II);
EndValueFromAdditionalBypass->setName("ind.end");
@@ -3193,7 +3186,22 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
return BCResumeVal;
}
+/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
+/// expansion results.
+static Value *getExpandedStep(const InductionDescriptor &ID,
+ const SCEV2ValueTy &ExpandedSCEVs) {
+ const SCEV *Step = ID.getStep();
+ if (auto *C = dyn_cast<SCEVConstant>(Step))
+ return C->getValue();
+ if (auto *U = dyn_cast<SCEVUnknown>(Step))
+ return U->getValue();
+ auto I = ExpandedSCEVs.find(Step);
+ assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
+ return I->second;
+}
+
void InnerLoopVectorizer::createInductionResumeValues(
+ const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
@@ -3209,14 +3217,15 @@ void InnerLoopVectorizer::createInductionResumeValues(
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
+ OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
+ AdditionalBypass);
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
}
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
@@ -3229,7 +3238,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can use the previous value for the condition (true).
// 3) Otherwise, construct a runtime check.
- if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
+ if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
+ !Cost->foldTailByMasking()) {
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
Count, VectorTripCount, "cmp.n",
LoopMiddleBlock->getTerminator());
@@ -3250,14 +3260,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
}
std::pair<BasicBlock *, Value *>
-InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+InnerLoopVectorizer::createVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- loop iteration number check.
- / |
+ [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
+ / | preheader are expanded here. Eventually all required SCEV
+ / | expansion should happen here.
/ v
| [ ] <-- vector loop bypass (may consist of multiple blocks).
| / |
@@ -3304,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
emitMemRuntimeChecks(LoopScalarPreHeader);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues();
+ createInductionResumeValues(ExpandedSCEVs);
return {completeLoopSkeleton(), nullptr};
}
@@ -3317,7 +3329,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock,
- BasicBlock *VectorHeader, VPlan &Plan) {
+ BasicBlock *VectorHeader, VPlan &Plan,
+ VPTransformState &State) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
@@ -3345,7 +3358,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
assert(isa<PHINode>(UI) && "Expected LCSSA form");
-
IRBuilder<> B(MiddleBlock->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
@@ -3355,8 +3367,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *CountMinusOne = B.CreateSub(
VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
CountMinusOne->setName("cmo");
- Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
- VectorHeader->getTerminator());
+
+ VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+ assert(StepVPV && "step must have been expanded during VPlan execution");
+ Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+ : State.get(StepVPV, {0, 0});
Value *Escape =
emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
@@ -3430,12 +3445,12 @@ static void cse(BasicBlock *BB) {
}
}
-InstructionCost
-LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize) const {
+InstructionCost LoopVectorizationCostModel::getVectorCallCost(
+ CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
+ bool MaskRequired = Legal->isMaskRequired(CI);
for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
@@ -3464,18 +3479,39 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
- NeedToScalarize = true;
- VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+ InstructionCost MaskCost = 0;
+ VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
+ if (NeedsMask)
+ *NeedsMask = MaskRequired;
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we want an unmasked vector function but can't find one matching the VF,
+  // maybe we can find a vector function that does use a mask and synthesize
+ // an all-true mask.
+ if (!VecFunc && !MaskRequired) {
+ Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+ VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we found one, add in the cost of creating a mask
+ if (VecFunc) {
+ if (NeedsMask)
+ *NeedsMask = true;
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(
+ IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
+ VF));
+ }
+ }
+ // We don't support masked function calls yet, but we can scalarize a
+ // masked call with branches (unless VF is scalable).
if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return Cost;
+ return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
if (VectorCallCost < Cost) {
- NeedToScalarize = false;
+ *Variant = VecFunc;
Cost = VectorCallCost;
}
return Cost;
@@ -3675,14 +3711,25 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
+ // After vectorization, the exit blocks of the original loop will have
+ // additional predecessors. Invalidate SCEVs for the exit phis in case SE
+ // looked through single-entry phis.
+ SmallVector<BasicBlock *> ExitBlocks;
+ OrigLoop->getExitBlocks(ExitBlocks);
+ for (BasicBlock *Exit : ExitBlocks)
+ for (PHINode &PN : Exit->phis())
+ PSE.getSE()->forgetValue(&PN);
+
VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
- if (Cost->requiresScalarEpilogue(VF)) {
+ if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
- Plan.clearLiveOuts();
} else {
+ // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
+ // the cost model.
+
// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.
@@ -3692,7 +3739,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
- VectorLoop->getHeader(), Plan);
+ VectorLoop->getHeader(), Plan, State);
}
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
@@ -3799,31 +3846,53 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Value *Incoming = State.get(PreviousDef, UF - 1);
auto *ExtractForScalar = Incoming;
auto *IdxTy = Builder.getInt32Ty();
+ Value *RuntimeVF = nullptr;
if (VF.isVector()) {
auto *One = ConstantInt::get(IdxTy, 1);
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
+ RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
- "vector.recur.extract");
- }
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the LoopMiddleBlock,
- // when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector()) {
- auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
- auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Idx, "vector.recur.extract.for.phi");
- } else if (UF > 1)
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
- // of `Incoming`. This is analogous to the vectorized case above: extracting
- // the second last element when VF > 1.
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
+ ExtractForScalar =
+ Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
+ }
+
+ auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
+ assert(PhiR->getNumUsers() == 1 &&
+ RecurSplice->getOpcode() ==
+ VPInstruction::FirstOrderRecurrenceSplice &&
+ "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
+ SmallVector<VPLiveOut *> LiveOuts;
+ for (VPUser *U : RecurSplice->users())
+ if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
+ LiveOuts.push_back(LiveOut);
+
+ if (!LiveOuts.empty()) {
+ // Extract the second last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the
+ // LoopMiddleBlock, when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF.isVector()) {
+ auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Idx, "vector.recur.extract.for.phi");
+ } else {
+ assert(UF > 1 && "VF and UF cannot both be 1");
+ // When loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
+ // value of `Incoming`. This is analogous to the vectorized case above:
+ // extracting the second last element when VF > 1.
+ ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
+ }
+
+ for (VPLiveOut *LiveOut : LiveOuts) {
+ assert(!Cost->requiresScalarEpilogue(VF.isVector()));
+ PHINode *LCSSAPhi = LiveOut->getPhi();
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ State.Plan->removeLiveOut(LCSSAPhi);
+ }
+ }
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
@@ -3837,22 +3906,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
Phi->setName("scalar.recur");
-
- // Finally, fix users of the recurrence outside the loop. The users will need
- // either the last value of the scalar recurrence or the last value of the
- // vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find all the phi nodes for the original scalar
- // recurrence in the exit block, and then add an edge for the middle block.
- // Note that LCSSA does not imply single entry when the original scalar loop
- // had multiple exiting edges (as we always run the last iteration in the
- // scalar epilogue); in that case, there is no edge from middle to exit and
- // and thus no phis which needed updated.
- if (!Cost->requiresScalarEpilogue(VF))
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- State.Plan->removeLiveOut(&LCSSAPhi);
- }
}
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
@@ -3872,9 +3925,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
- // Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(PhiR, State);
-
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
@@ -4036,7 +4086,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
// fixFixedOrderRecurrence for a more complete explanation of the logic.
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
@@ -4054,38 +4104,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
-void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
- VPTransformState &State) {
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RK != RecurKind::Add && RK != RecurKind::Mul)
- return;
-
- SmallVector<VPValue *, 8> Worklist;
- SmallPtrSet<VPValue *, 8> Visited;
- Worklist.push_back(PhiR);
- Visited.insert(PhiR);
-
- while (!Worklist.empty()) {
- VPValue *Cur = Worklist.pop_back_val();
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = State.get(Cur, Part);
- if (!isa<OverflowingBinaryOperator>(V))
- break;
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
- }
-
- for (VPUser *U : Cur->users()) {
- auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
- if (!UserRecipe)
- continue;
- for (VPValue *V : UserRecipe->definedValues())
- if (Visited.insert(V).second)
- Worklist.push_back(V);
- }
- }
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
@@ -4125,10 +4143,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
// We can't sink an instruction if it is a phi node, is not in the loop,
- // or may have side effects.
+ // may have side effects or may read from memory.
+      // TODO: Could do more granular checking to allow sinking a load past
+      // non-store instructions.
if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
- I->mayHaveSideEffects())
- continue;
+ I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
// If the instruction is already in PredBB, check if we can sink its
// operands. In that case, VPlan's sinkScalarOperands() succeeded in
@@ -4189,7 +4208,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
+ assert(VF.isVector() && !Scalars.contains(VF) &&
"This function should not be visited twice for the same VF");
// This avoids any chances of creating a REPLICATE recipe during planning
@@ -4382,6 +4401,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
switch(I->getOpcode()) {
default:
return true;
+ case Instruction::Call:
+ return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -4430,10 +4451,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// both speculation safety (which follows from the same argument as loads),
// but also must prove the value being stored is correct. The easiest
// form of the latter is to require that all values stored are the same.
- if (Legal->isUniformMemOp(*I) &&
- (isa<LoadInst>(I) ||
- (isa<StoreInst>(I) &&
- TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
+ if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
+ (isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
return true;
@@ -4445,6 +4466,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning
return !isSafeToSpeculativelyExecute(I);
+ case Instruction::Call:
+ return Legal->isMaskRequired(I);
}
}
@@ -4502,7 +4525,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+ Legal->isInvariant(Op2))
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
@@ -4614,7 +4638,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
- assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
+ assert(VF.isVector() && !Uniforms.contains(VF) &&
"This function should not be visited twice for the same VF");
// Visit the list of Uniforms. If we'll not find any uniform value, we'll
@@ -4663,10 +4687,18 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
+ auto PrevVF = VF.divideCoefficientBy(2);
// Return true if all lanes perform the same memory operation, and we can
 // thus choose to execute only one.
auto isUniformMemOpUse = [&](Instruction *I) {
- if (!Legal->isUniformMemOp(*I))
+ // If the value was already known to not be uniform for the previous
+ // (smaller VF), it cannot be uniform for the larger VF.
+ if (PrevVF.isVector()) {
+ auto Iter = Uniforms.find(PrevVF);
+ if (Iter != Uniforms.end() && !Iter->second.contains(I))
+ return false;
+ }
+ if (!Legal->isUniformMemOp(*I, VF))
return false;
if (isa<LoadInst>(I))
// Loading the same address always produces the same result - at least
@@ -4689,11 +4721,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
WideningDecision == CM_Interleave);
};
-
// Returns true if Ptr is the pointer operand of a memory access instruction
- // I, and I is known to not require scalarization.
+ // I, I is known to not require scalarization, and the pointer is not also
+ // stored.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
+ return false;
+ return getLoadStorePointerOperand(I) == Ptr &&
+ (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
};
// Holds a list of values which are known to have at least one uniform use.
@@ -4739,10 +4774,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (isUniformMemOpUse(&I))
addToWorklistIfAllowed(&I);
- if (isUniformDecision(&I, VF)) {
- assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
+ if (isVectorizedMemAccessUse(&I, Ptr))
HasUniformUse.insert(Ptr);
- }
}
// Add to the worklist any operands which have *only* uniform (e.g. lane 0
@@ -4906,12 +4939,11 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
// Limit MaxScalableVF by the maximum safe dependence distance.
- std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
- if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
- MaxVScale =
- TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
- MaxScalableVF =
- ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
+ if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
+ MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
+ else
+ MaxScalableVF = ElementCount::getScalable(0);
+
if (!MaxScalableVF)
reportVectorizationInfo(
"Max legal vector width too small, scalable vectorization "
@@ -4932,7 +4964,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeElements =
- PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
@@ -5105,16 +5137,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+
// Avoid tail folding if the trip count is known to be a multiple of any VF
- // we chose.
- // FIXME: The condition below pessimises the case for fixed-width vectors,
- // when scalable VFs are also candidates for vectorization.
- if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
- ElementCount MaxFixedVF = MaxFactors.FixedVF;
- assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+ // we choose.
+ std::optional<unsigned> MaxPowerOf2RuntimeVF =
+ MaxFactors.FixedVF.getFixedValue();
+ if (MaxFactors.ScalableVF) {
+ std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
+ if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
+ MaxPowerOf2RuntimeVF = std::max<unsigned>(
+ *MaxPowerOf2RuntimeVF,
+ *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
+ } else
+ MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
+ }
+
+ if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+ assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
- unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
- : MaxFixedVF.getFixedValue();
+ unsigned MaxVFtimesIC =
+ UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
const SCEV *ExitCount = SE->getAddExpr(
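A standalone sketch of the decision made in this hunk, using plain unsigned values in place of ElementCount and SCEV (both assumptions): the largest power-of-two runtime VF is taken from the fixed VF and, when vscale is known to be a power of two, from the scalable VF times the maximum vscale; tail folding can then be skipped when the trip count is a known multiple of that VF times the interleave count.

#include <algorithm>
#include <cstdio>
#include <optional>

int main() {
  const unsigned TC = 64, UserIC = 2;
  const unsigned MaxFixedVF = 8;
  const unsigned ScalableKnownMinVF = 4, MaxVScale = 4;
  const bool VScaleIsPowerOfTwo = true;

  // Largest power-of-two VF the loop might run with at runtime.
  std::optional<unsigned> MaxPow2RuntimeVF = MaxFixedVF;
  if (VScaleIsPowerOfTwo)
    MaxPow2RuntimeVF =
        std::max(*MaxPow2RuntimeVF, MaxVScale * ScalableKnownMinVF);
  else
    MaxPow2RuntimeVF.reset(); // vscale not known to be a power of two:
                              // stick with tail folding.

  if (MaxPow2RuntimeVF && TC % (*MaxPow2RuntimeVF * UserIC) == 0)
    std::printf("TC=%u is a multiple of VF*IC=%u: no tail needed\n", TC,
                *MaxPow2RuntimeVF * UserIC);
  else
    std::printf("tail folding (or a scalar epilogue) is still required\n");
  return 0;
}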
@@ -5134,7 +5176,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
- FoldTailByMasking = true;
+ CanFoldTailByMasking = true;
return MaxFactors;
}
@@ -5187,7 +5229,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
auto MaxVectorElementCount = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
+ llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
ComputeScalableMaxVF);
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
@@ -5207,6 +5249,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
auto Min = Attr.getVScaleRangeMin();
WidestRegisterMinEC *= Min;
}
+
+ // When a scalar epilogue is required, at least one iteration of the scalar
+ // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
+ // max VF that results in a dead vector loop.
+ if (ConstTripCount > 0 && requiresScalarEpilogue(true))
+ ConstTripCount -= 1;
+
if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
(!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
// If loop trip count (TC) is known at compile time there is no point in
@@ -5214,7 +5263,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// power of two which doesn't exceed TC.
// If MaxVectorElementCount is scalable, we only fall back on a fixed VF
// when the TC is less than or equal to the known number of lanes.
- auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
+ auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount);
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: "
<< ClampedConstTripCount << "\n");
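A worked sketch of the clamping above with plain unsigned values (an assumption): when a scalar epilogue must execute at least one iteration, that iteration is subtracted from the known trip count before rounding down to a power of two, so the max VF is not picked such that the vector loop never runs.

#include <bit>
#include <cstdio>

int main() {
  unsigned ConstTripCount = 16;
  const bool RequiresScalarEpilogue = true;

  // Reserve one iteration for the mandatory scalar epilogue.
  if (ConstTripCount > 0 && RequiresScalarEpilogue)
    ConstTripCount -= 1; // 15 iterations remain for the vector loop.

  // Equivalent of llvm::bit_floor: largest power of two <= the trip count.
  unsigned ClampedVF = std::bit_floor(ConstTripCount);
  std::printf("max VF clamped to %u (instead of 16)\n", ClampedVF); // 8
  return 0;
}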
@@ -5228,7 +5277,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
+ llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
@@ -5273,9 +5322,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}
-std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
- if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
- auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+/// Convenience function that returns the value of vscale_range if
+/// vscale_range.min == vscale_range.max, otherwise it returns the value
+/// returned by the corresponding TTI method.
+static std::optional<unsigned>
+getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
+ const Function *Fn = L->getHeader()->getParent();
+ if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max)
@@ -5285,31 +5339,39 @@ std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
return TTI.getVScaleForTuning();
}
-bool LoopVectorizationCostModel::isMoreProfitable(
+bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
- unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
-
- if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
- MaxTripCount) {
- // If we are folding the tail and the trip count is a known (possibly small)
- // constant, the trip count will be rounded up to an integer number of
- // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
- // which we compare directly. When not folding the tail, the total cost will
- // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
- // approximated with the per-lane cost below instead of using the tripcount
- // as here.
- auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
- auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
+ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+
+ if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+ // If the trip count is a known (possibly small) constant, the trip count
+ // will be rounded up to an integer number of iterations under
+ // FoldTailByMasking. The total cost in that case will be
+ // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+ // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+ // some extra overheads, but for the purpose of comparing the costs of
+ // different VFs we can use this to compare the total loop-body cost
+ // expected after vectorization.
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
+ return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
+ : VectorCost * (MaxTripCount / VF) +
+ ScalarCost * (MaxTripCount % VF);
+ };
+ auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
+ auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
+
return RTCostA < RTCostB;
}
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
- if (std::optional<unsigned> VScale = getVScaleForTuning()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
if (A.Width.isScalable())
EstimatedWidthA *= *VScale;
if (B.Width.isScalable())
@@ -5328,9 +5390,74 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
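A minimal sketch of the trip-count-based comparison introduced above, with plain unsigned costs standing in for InstructionCost (an assumption). With tail folding the vector body runs ceil(TC/VF) times; without it, floor(TC/VF) vector iterations plus TC % VF scalar remainder iterations are paid for.

#include <cstdio>

// Total loop-body cost for a fixed VF and a known (max) trip count.
static unsigned costForTripCount(unsigned TC, unsigned VF, unsigned VectorCost,
                                 unsigned ScalarCost, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return VectorCost * ((TC + VF - 1) / VF); // ceil(TC / VF) vector iterations
  return VectorCost * (TC / VF) +             // whole vector iterations
         ScalarCost * (TC % VF);              // scalar remainder iterations
}

int main() {
  const unsigned TC = 10, ScalarIterCost = 2;
  // Assume VF=4 costs 6 per vector iteration and VF=8 costs 11.
  unsigned CostA = costForTripCount(TC, 4, 6, ScalarIterCost, false);  // 16
  unsigned CostB = costForTripCount(TC, 8, 11, ScalarIterCost, false); // 15
  std::printf("VF=4 total %u, VF=8 total %u -> VF=8 is more profitable\n",
              CostA, CostB);
  return 0;
}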
-VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
+static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
+ OptimizationRemarkEmitter *ORE,
+ Loop *TheLoop) {
+ if (InvalidCosts.empty())
+ return;
+
+ // Emit a report of VFs with invalid costs in the loop.
+
+ // Group the remarks per instruction, keeping the instruction order from
+ // InvalidCosts.
+ std::map<Instruction *, unsigned> Numbering;
+ unsigned I = 0;
+ for (auto &Pair : InvalidCosts)
+ if (!Numbering.count(Pair.first))
+ Numbering[Pair.first] = I++;
+
+ // Sort the list, first on instruction(number) then on VF.
+ sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+ if (Numbering[A.first] != Numbering[B.first])
+ return Numbering[A.first] < Numbering[B.first];
+ ElementCountComparator ECC;
+ return ECC(A.second, B.second);
+ });
+
+ // For a list of ordered instruction-vf pairs:
+ // [(load, vf1), (load, vf2), (store, vf1)]
+ // Group the instructions together to emit separate remarks for:
+ // load (vf1, vf2)
+ // store (vf1)
+ auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
+ auto Subset = ArrayRef<InstructionVFPair>();
+ do {
+ if (Subset.empty())
+ Subset = Tail.take_front(1);
+
+ Instruction *I = Subset.front().first;
+
+ // If the next instruction is different, or if there are no other pairs,
+ // emit a remark for the collated subset. e.g.
+ // [(load, vf1), (load, vf2)]
+ // to emit:
+ // remark: invalid costs for 'load' at VF=(vf1, vf2)
+ if (Subset == Tail || Tail[Subset.size()].first != I) {
+ std::string OutString;
+ raw_string_ostream OS(OutString);
+ assert(!Subset.empty() && "Unexpected empty range");
+ OS << "Instruction with invalid costs prevented vectorization at VF=(";
+ for (const auto &Pair : Subset)
+ OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
+ OS << "):";
+ if (auto *CI = dyn_cast<CallInst>(I))
+ OS << " call to " << CI->getCalledFunction()->getName();
+ else
+ OS << " " << I->getOpcodeName();
+ OS.flush();
+ reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+ Tail = Tail.drop_front(Subset.size());
+ Subset = {};
+ } else
+ // Grow the subset by one element
+ Subset = Tail.take_front(Subset.size() + 1);
+ } while (!Tail.empty());
+}
+
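A self-contained sketch of just the grouping step in emitInvalidCostRemarks, with std::string in place of Instruction* and unsigned in place of ElementCount (both assumptions): adjacent pairs for the same instruction are collated into a single remark listing every VF with an invalid cost.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  using Pair = std::pair<std::string, unsigned>;
  // Already sorted by instruction first, then by VF, as the pass sorts them.
  std::vector<Pair> InvalidCosts = {{"load", 2}, {"load", 4}, {"store", 2}};

  size_t Start = 0;
  while (Start < InvalidCosts.size()) {
    // Find the run of entries that refer to the same instruction.
    size_t End = Start;
    while (End < InvalidCosts.size() &&
           InvalidCosts[End].first == InvalidCosts[Start].first)
      ++End;

    // Emit one remark per instruction, listing every offending VF.
    std::string VFs;
    for (size_t I = Start; I < End; ++I)
      VFs += (I == Start ? "" : ", ") + std::to_string(InvalidCosts[I].second);
    std::printf("invalid costs for '%s' at VF=(%s)\n",
                InvalidCosts[Start].first.c_str(), VFs.c_str());
    Start = End;
  }
  return 0;
}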
+VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
const ElementCountSet &VFCandidates) {
- InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
+ InstructionCost ExpectedCost =
+ CM.expectedCost(ElementCount::getFixed(1)).first;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(VFCandidates.count(ElementCount::getFixed(1)) &&
@@ -5340,7 +5467,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
- bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && VFCandidates.size() > 1) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
@@ -5354,12 +5481,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
if (i.isScalar())
continue;
- VectorizationCostTy C = expectedCost(i, &InvalidCosts);
+ LoopVectorizationCostModel::VectorizationCostTy C =
+ CM.expectedCost(i, &InvalidCosts);
VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
- if (std::optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
@@ -5388,70 +5516,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
ChosenFactor = Candidate;
}
- // Emit a report of VFs with invalid costs in the loop.
- if (!InvalidCosts.empty()) {
- // Group the remarks per instruction, keeping the instruction order from
- // InvalidCosts.
- std::map<Instruction *, unsigned> Numbering;
- unsigned I = 0;
- for (auto &Pair : InvalidCosts)
- if (!Numbering.count(Pair.first))
- Numbering[Pair.first] = I++;
-
- // Sort the list, first on instruction(number) then on VF.
- llvm::sort(InvalidCosts,
- [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
- if (Numbering[A.first] != Numbering[B.first])
- return Numbering[A.first] < Numbering[B.first];
- ElementCountComparator ECC;
- return ECC(A.second, B.second);
- });
-
- // For a list of ordered instruction-vf pairs:
- // [(load, vf1), (load, vf2), (store, vf1)]
- // Group the instructions together to emit separate remarks for:
- // load (vf1, vf2)
- // store (vf1)
- auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
- auto Subset = ArrayRef<InstructionVFPair>();
- do {
- if (Subset.empty())
- Subset = Tail.take_front(1);
-
- Instruction *I = Subset.front().first;
-
- // If the next instruction is different, or if there are no other pairs,
- // emit a remark for the collated subset. e.g.
- // [(load, vf1), (load, vf2))]
- // to emit:
- // remark: invalid costs for 'load' at VF=(vf, vf2)
- if (Subset == Tail || Tail[Subset.size()].first != I) {
- std::string OutString;
- raw_string_ostream OS(OutString);
- assert(!Subset.empty() && "Unexpected empty range");
- OS << "Instruction with invalid costs prevented vectorization at VF=(";
- for (const auto &Pair : Subset)
- OS << (Pair.second == Subset.front().second ? "" : ", ")
- << Pair.second;
- OS << "):";
- if (auto *CI = dyn_cast<CallInst>(I))
- OS << " call to " << CI->getCalledFunction()->getName();
- else
- OS << " " << I->getOpcodeName();
- OS.flush();
- reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
- Tail = Tail.drop_front(Subset.size());
- Subset = {};
- } else
- // Grow the subset by one element
- Subset = Tail.take_front(Subset.size() + 1);
- } while (!Tail.empty());
- }
+ emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
- if (!EnableCondStoresVectorization && NumPredStores) {
- reportVectorizationFailure("There are conditional stores.",
+ if (!EnableCondStoresVectorization && CM.hasPredStores()) {
+ reportVectorizationFailure(
+ "There are conditional stores.",
"store that is conditionally executed prevents vectorization",
- "ConditionalStore", ORE, TheLoop);
+ "ConditionalStore", ORE, OrigLoop);
ChosenFactor = ScalarCost;
}
@@ -5463,11 +5534,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
return ChosenFactor;
}
-bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
- const Loop &L, ElementCount VF) const {
+bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
+ ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
- if (any_of(L.getHeader()->phis(),
+ if (any_of(OrigLoop->getHeader()->phis(),
[&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
return false;
@@ -5475,20 +5546,21 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
// currently unsupported.
for (const auto &Entry : Legal->getInductionVars()) {
// Look for uses of the value of the induction at the last iteration.
- Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+ Value *PostInc =
+ Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
for (User *U : PostInc->users())
- if (!L.contains(cast<Instruction>(U)))
+ if (!OrigLoop->contains(cast<Instruction>(U)))
return false;
// Look for uses of penultimate value of the induction.
for (User *U : Entry.first->users())
- if (!L.contains(cast<Instruction>(U)))
+ if (!OrigLoop->contains(cast<Instruction>(U)))
return false;
}
 // Epilogue vectorization code has not been audited to ensure it handles
 // non-latch exits properly. It may be fine, but it needs to be audited and
 // tested.
- if (L.getExitingBlock() != L.getLoopLatch())
+ if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
return true;
@@ -5507,62 +5579,59 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (eg. MVE).
- if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+ if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- // FIXME: We should consider changing the threshold for scalable
- // vectors to take VScaleForTuning into account.
- if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
+
+ unsigned Multiplier = 1;
+ if (VF.isScalable())
+ Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
+ if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
}
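A sketch of the scalable-VF adjustment above, assuming the minimum-VF threshold is the option's default of 16 and that getVScaleForTuning reports 2 (both assumptions): the known minimum lane count of a scalable VF is scaled by vscale before being compared against the threshold.

#include <cstdio>

static bool epilogueVectorizationProfitable(unsigned KnownMinVF, bool Scalable,
                                            unsigned VScaleForTuning,
                                            unsigned MinVFThreshold = 16) {
  // For scalable VFs, estimate the real lane count with the tuning vscale.
  unsigned Multiplier = Scalable ? VScaleForTuning : 1;
  return Multiplier * KnownMinVF >= MinVFThreshold;
}

int main() {
  // <vscale x 8 x ...> with vscale ~ 2 behaves like 16 lanes: profitable.
  std::printf("%d\n", epilogueVectorizationProfitable(8, true, 2));  // 1
  // A fixed VF of 8 stays below the threshold: not profitable.
  std::printf("%d\n", epilogueVectorizationProfitable(8, false, 2)); // 0
  return 0;
}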
-VectorizationFactor
-LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
- const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
+ const ElementCount MainLoopVF, unsigned IC) {
VectorizationFactor Result = VectorizationFactor::Disabled();
if (!EnableEpilogueVectorization) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
return Result;
}
- if (!isScalarEpilogueAllowed()) {
- LLVM_DEBUG(
- dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
- "allowed.\n";);
+ if (!CM.isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
+ "epilogue is allowed.\n");
return Result;
}
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
- if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
- LLVM_DEBUG(
- dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
- "not a supported candidate.\n";);
+ if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
+ LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
+ "is not a supported candidate.\n");
return Result;
}
if (EpilogueVectorizationForceVF > 1) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
- if (LVP.hasPlanWithVF(ForcedEC))
+ if (hasPlanWithVF(ForcedEC))
return {ForcedEC, 0, 0};
else {
- LLVM_DEBUG(
- dbgs()
- << "LEV: Epilogue vectorization forced factor is not viable.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
+ "viable.\n");
return Result;
}
}
- if (TheLoop->getHeader()->getParent()->hasOptSize() ||
- TheLoop->getHeader()->getParent()->hasMinSize()) {
+ if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
+ OrigLoop->getHeader()->getParent()->hasMinSize()) {
LLVM_DEBUG(
- dbgs()
- << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
+ dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
return Result;
}
- if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
+ if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
@@ -5574,21 +5643,48 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (std::optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedRuntimeVF *= *VScale;
}
- for (auto &NextVF : ProfitableVFs)
- if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
- ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
- ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
- (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
- LVP.hasPlanWithVF(NextVF.Width))
+ ScalarEvolution &SE = *PSE.getSE();
+ Type *TCType = Legal->getWidestInductionType();
+ const SCEV *RemainingIterations = nullptr;
+ for (auto &NextVF : ProfitableVFs) {
+ // Skip candidate VFs without a corresponding VPlan.
+ if (!hasPlanWithVF(NextVF.Width))
+ continue;
+
+ // Skip candidate VFs with widths >= the estimated runtime VF (scalable
+ // vectors) or the VF of the main loop (fixed vectors).
+ if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+ ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
+ ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
+ continue;
+
+ // If NextVF is greater than the number of remaining iterations, the
+ // epilogue loop would be dead. Skip such factors.
+ if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
+ // TODO: extend to support scalable VFs.
+ if (!RemainingIterations) {
+ const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
+ RemainingIterations = SE.getURemExpr(
+ TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
+ }
+ if (SE.isKnownPredicate(
+ CmpInst::ICMP_UGT,
+ SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+ RemainingIterations))
+ continue;
+ }
+
+ if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
Result = NextVF;
+ }
if (Result != VectorizationFactor::Disabled())
LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
- << Result.Width << "\n";);
+ << Result.Width << "\n");
return Result;
}
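A sketch of the new dead-epilogue check with plain unsigned values instead of SCEVs (an assumption): once the main loop has consumed TC - TC % (MainVF * IC) iterations, an epilogue VF larger than the remainder could never execute, so such factors are skipped before the cost comparison.

#include <algorithm>
#include <cstdio>

int main() {
  const unsigned TC = 37, MainVF = 8, IC = 2;
  const unsigned Remaining = TC % (MainVF * IC); // 37 % 16 == 5 iterations left

  unsigned Chosen = 1; // scalar epilogue as the fallback
  for (unsigned NextVF : {8u, 4u, 2u}) {
    if (NextVF >= MainVF)   // the epilogue VF must be narrower than the main VF
      continue;
    if (NextVF > Remaining) // the epilogue vector loop would be dead; skip it
      continue;
    Chosen = std::max(Chosen, NextVF); // stand-in for the real cost comparison
  }
  std::printf("epilogue VF = %u (remaining iterations = %u)\n", Chosen,
              Remaining);
  return 0;
}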
@@ -5688,7 +5784,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
return 1;
// We used the distance for the interleave count.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
+ if (!Legal->isSafeForAnyVectorWidth())
return 1;
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
@@ -5750,20 +5846,19 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
- unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
+ MaxLocalUsers);
// Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur) {
- TmpIC =
- PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
- std::max(1U, (MaxLocalUsers - 1)));
+ TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
}
IC = std::min(IC, TmpIC);
}
// Clamp the interleave ranges to reasonable counts.
- unsigned MaxInterleaveCount =
- TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
// Check if the user has overridden the max.
if (VF.isScalar()) {
@@ -5834,8 +5929,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC = std::min(
- IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
+ unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
+ SmallLoopCost / *LoopCost.getValue()));
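A worked sketch of the clamp above, assuming SmallLoopCost is the option's default of 20 and using plain unsigned costs (both assumptions): interleaving is limited so the loop overhead stays at roughly 5% of the loop-body cost.

#include <algorithm>
#include <bit>
#include <cstdio>

int main() {
  const unsigned IC = 8;             // interleave count allowed by registers
  const unsigned SmallLoopCost = 20; // "about 5%" threshold used by the pass
  const unsigned LoopCost = 6;       // per-iteration cost from the cost model

  // Round 20 / 6 == 3 down to a power of two, then clamp by IC.
  unsigned SmallIC = std::min(IC, std::bit_floor(SmallLoopCost / LoopCost));
  std::printf("interleave count clamped to %u\n", SmallIC); // 2
  return 0;
}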
// Interleave until store/load ports (estimated by max interleave count) are
// saturated.
@@ -5953,7 +6048,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// Saves the list of values that are used in the loop but are defined outside
// the loop (not including non-instruction values such as arguments and
// constants).
- SmallPtrSet<Value *, 8> LoopInvariants;
+ SmallSetVector<Instruction *, 8> LoopInvariants;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -6079,11 +6174,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
for (auto *Inst : LoopInvariants) {
// FIXME: The target might use more than one register for the type
// even in the scalar case.
- unsigned Usage =
- VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ bool IsScalar = all_of(Inst->users(), [&](User *U) {
+ auto *I = cast<Instruction>(U);
+ return TheLoop != LI->getLoopFor(I->getParent()) ||
+ isScalarAfterVectorization(I, VFs[i]);
+ });
+
+ ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
unsigned ClassID =
- TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- Invariant[ClassID] += Usage;
+ TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
+ Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
}
LLVM_DEBUG({
@@ -6134,8 +6234,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
- if (VF.isScalar() || VF.isZero() ||
- InstsToScalarize.find(VF) != InstsToScalarize.end())
+ if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
return;
// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
@@ -6224,7 +6323,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Instruction *I = Worklist.pop_back_val();
// If we've already analyzed the instruction, there's nothing to do.
- if (ScalarCosts.find(I) != ScalarCosts.end())
+ if (ScalarCosts.contains(I))
continue;
// Compute the cost of the vector instruction. Note that this cost already
@@ -6362,11 +6461,6 @@ static const SCEV *getAddressAccessSCEV(
return PSE.getSCEV(Ptr);
}
-static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
- return Legal->hasStride(I->getOperand(0)) ||
- Legal->hasStride(I->getOperand(1));
-}
-
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
@@ -6460,7 +6554,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
- assert(Legal->isUniformMemOp(*I));
+ assert(Legal->isUniformMemOp(*I, VF));
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
@@ -6475,7 +6569,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
}
StoreInst *SI = cast<StoreInst>(I);
- bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
+ bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
CostKind) +
@@ -6502,11 +6596,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
- // TODO: Once we have support for interleaving with scalable vectors
- // we can calculate the cost properly here.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
-
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6836,7 +6925,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
- if (Legal->isUniformMemOp(I)) {
+ if (Legal->isUniformMemOp(I, VF)) {
auto isLegalToScalarize = [&]() {
if (!VF.isScalable())
// Scalarization of fixed length vectors "just works".
@@ -7134,8 +7223,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- // Since we will replace the stride by 1 the multiplication should go away.
- if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ // If we're speculating on the stride being 1, the multiplication may
+ // fold away. We can generalize this for all operations using the notion
+ // of neutral elements. (TODO)
+ if (I->getOpcode() == Instruction::Mul &&
+ (PSE.getSCEV(I->getOperand(0))->isOne() ||
+ PSE.getSCEV(I->getOperand(1))->isOne()))
return 0;
// Detect reduction patterns
@@ -7146,7 +7239,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+ Legal->isInvariant(Op2))
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
@@ -7304,7 +7398,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorTy =
largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
} else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ // Leave SrcVecTy unchanged - we only shrink the destination element
+ // type.
VectorTy =
smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}
@@ -7316,9 +7411,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;
- bool NeedToScalarize;
+ Function *Variant;
CallInst *CI = cast<CallInst>(I);
- InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);
@@ -7339,37 +7434,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} // end of switch.
}
-char LoopVectorize::ID = 0;
-
-static const char lv_name[] = "Loop Vectorization";
-
-INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
-
-namespace llvm {
-
-Pass *createLoopVectorizePass() { return new LoopVectorize(); }
-
-Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
- bool VectorizeOnlyWhenForced) {
- return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
-}
-
-} // end namespace llvm
-
void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
@@ -7462,7 +7526,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
// reasonable one.
if (UserVF.isZero()) {
VF = ElementCount::getFixed(determineVPlanVF(
- TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedValue(),
CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7497,13 +7561,16 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ CM.collectValuesToIgnore();
+ CM.collectElementTypesForWidening();
+
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return std::nullopt;
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
- !useMaskedInterleavedAccesses(*TTI)) {
+ !useMaskedInterleavedAccesses(TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
@@ -7527,6 +7594,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
+ if (!hasPlanWithVF(UserVF)) {
+ LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
+ << ".\n");
+ return std::nullopt;
+ }
+
LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0, 0}};
} else
@@ -7562,8 +7635,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+ VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
+ if (!hasPlanWithVF(VF.Width)) {
+ LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
+ << ".\n");
+ return std::nullopt;
+ }
return VF;
}
@@ -7614,43 +7692,51 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
-void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
- VPlan &BestVPlan,
- InnerLoopVectorizer &ILV,
- DominatorTree *DT,
- bool IsEpilogueVectorization) {
+SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+ ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
+ InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
+ DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
assert(BestVPlan.hasVF(BestVF) &&
"Trying to execute plan with unsupported VF");
assert(BestVPlan.hasUF(BestUF) &&
"Trying to execute plan with unsupported UF");
+ assert(
+ (IsEpilogueVectorization || !ExpandedSCEVs) &&
+ "expanded SCEVs to reuse can only be used during epilogue vectorization");
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
-
if (!IsEpilogueVectorization)
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
+ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+
+ // 0. Generate SCEV-dependent code into the preheader, including TripCount,
+ // before making any changes to the CFG.
+ if (!BestVPlan.getPreheader()->empty()) {
+ State.CFG.PrevBB = OrigLoop->getLoopPreheader();
+ State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
+ BestVPlan.getPreheader()->execute(&State);
+ }
+ if (!ILV.getTripCount())
+ ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
+ else
+ assert(IsEpilogueVectorization && "should only re-use the existing trip "
+ "count during epilogue vectorization");
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
Value *CanonicalIVStartValue;
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
- ILV.createVectorizedLoopSkeleton();
+ ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
+ : State.ExpandedSCEVs);
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
const LoopAccessInfo *LAI = ILV.Legal->getLAI();
+ std::unique_ptr<LoopVersioning> LVer = nullptr;
if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
!LAI->getRuntimePointerChecking()->getDiffChecks()) {
@@ -7658,9 +7744,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// still use it to add the noalias metadata.
// TODO: Find a better way to re-use LoopVersioning functionality to add
// metadata.
- State.LVer = std::make_unique<LoopVersioning>(
+ LVer = std::make_unique<LoopVersioning>(
*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
PSE.getSE());
+ State.LVer = &*LVer;
State.LVer->prepareNoAliasMetadata();
}
@@ -7677,10 +7764,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
- ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State,
- IsEpilogueVectorization);
+ BestVPlan.prepareToExecute(
+ ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State, IsEpilogueVectorization);
BestVPlan.execute(&State);
@@ -7706,13 +7792,18 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
- AddRuntimeUnrollDisableMetaData(L);
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+ AddRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State, BestVPlan);
ILV.printDebugTracesAtEnd();
+
+ return State.ExpandedSCEVs;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -7725,8 +7816,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
}
#endif
-Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
@@ -7734,7 +7823,8 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
-EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
@@ -7795,7 +7885,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
assert(Bypass && "Expected valid bypass basic block.");
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -7803,8 +7893,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Generate code to check if the loop's trip count is less than VF * UF of the
// main vector loop.
- auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
- ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
+ : VF.isVector())
+ ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
@@ -7824,7 +7916,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Update dominator for Bypass & LoopExit.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
// For loops with multiple exits, there's no edge from the middle block
// to exit blocks (as the epilogue must run) and thus no need to update
// the immediate dominator of the exit blocks.
@@ -7852,7 +7944,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
@@ -7891,7 +7984,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -7950,7 +8043,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues({VecEpilogueIterationCountCheck,
+ createInductionResumeValues(ExpandedSCEVs,
+ {VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
return {completeLoopSkeleton(), EPResumeVal};
@@ -7972,8 +8066,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
// Generate code to check if the loop's trip count is less than VF * UF of the
// vector epilogue loop.
- auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
- ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
+ ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters =
Builder.CreateICmp(P, Count,
@@ -8008,8 +8103,7 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
assert(!Range.isEmpty() && "Trying to test an empty VF range.");
bool PredicateAtRangeStart = Predicate(Range.Start);
- for (ElementCount TmpVF = Range.Start * 2;
- ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
+ for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
if (Predicate(TmpVF) != PredicateAtRangeStart) {
Range.End = TmpVF;
break;
@@ -8025,16 +8119,16 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
ElementCount MaxVF) {
- auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
- for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
- VFRange SubRange = {VF, MaxVFPlusOne};
+ auto MaxVFTimes2 = MaxVF * 2;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
}
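A sketch of the VF-range walk above with plain unsigned lane counts instead of ElementCount (an assumption). Each sub-range starts as [VF, MaxVF*2); buildVPlan may clamp its end when some per-VF decision changes, and the next iteration resumes from there.

#include <cstdio>

struct VFRange { unsigned Start, End; }; // half-open: [Start, End)

static VFRange buildPlanFor(VFRange R) {
  // Stand-in for buildVPlan(): pretend some recipe decision flips at VF == 4,
  // so getDecisionAndClampRange() clamps the range there.
  if (R.Start < 4 && R.End > 4)
    R.End = 4;
  std::printf("one VPlan covers VF in [%u, %u)\n", R.Start, R.End);
  return R;
}

int main() {
  const unsigned MinVF = 1, MaxVF = 8;
  const unsigned MaxVFTimes2 = MaxVF * 2; // exclusive upper bound
  for (unsigned VF = MinVF; VF < MaxVFTimes2;) {
    VFRange SubRange{VF, MaxVFTimes2};
    SubRange = buildPlanFor(SubRange);
    VF = SubRange.End; // continue with the first VF the last plan did not cover
  }
  return 0;
}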
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlanPtr &Plan) {
+ VPlan &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -8058,7 +8152,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
+ VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
@@ -8069,7 +8163,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
// 'select i1 SrcMask, i1 EdgeMask, i1 false'.
// The select version does not introduce new UB if SrcMask is false and
// EdgeMask is poison. Using 'and' here introduces undefined behavior.
- VPValue *False = Plan->getOrAddVPValue(
+ VPValue *False = Plan.getVPValueOrAddLiveIn(
ConstantInt::getFalse(BI->getCondition()->getType()));
EdgeMask =
Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
@@ -8078,7 +8172,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = EdgeMask;
}
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
// Look for cached value.
@@ -8098,29 +8192,28 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// If we're using the active lane mask for control flow, then we get the
// mask from the active lane mask PHI that is cached in the VPlan.
- PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
- if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
- return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+ TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+ if (useActiveLaneMaskForControlFlow(TFStyle))
+ return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
- VPBasicBlock *HeaderVPBB =
- Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (EmitGetActiveLaneMask != PredicationStyle::None) {
- VPValue *TC = Plan->getOrCreateTripCount();
+ if (useActiveLaneMask(TFStyle)) {
+ VPValue *TC = Plan.getTripCount();
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
nullptr, "active.lane.mask");
} else {
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
return BlockMaskCache[BB] = BlockMask;
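A sketch of the header-mask comparison with narrow 8-bit counters (an assumption) to show why the backedge-taken count is used: for a loop that runs to the end of the index space the trip count wraps to 0, while BTC = TC - 1 stays representable, so a lane is active when its widened IV value is <= BTC.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  const uint8_t BTC = 9; // backedge-taken count; trip count is BTC + 1 == 10

  // Last vector iteration of a tail-folded loop: lanes 8..11, BTC == 9.
  const uint8_t IVBase = 8;
  std::printf("lanes starting at %u: mask =", IVBase);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    std::printf(" %d", (uint8_t)(IVBase + Lane) <= BTC); // 1 1 0 0
  std::printf("\n");
  return 0;
}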
@@ -8168,7 +8261,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), Plan);
+ Mask = createBlockInMask(I->getParent(), *Plan);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8189,22 +8282,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
-static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
- PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
- const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
- // Returns true if an instruction \p I should be scalarized instead of
- // vectorized for the chosen vectorization factor.
- auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
- return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF);
- };
-
- bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) {
- return ShouldScalarizeInstruction(PhiOrTrunc, VF);
- },
- Range);
+static VPWidenIntOrFpInductionRecipe *
+createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
+ VPValue *Start, const InductionDescriptor &IndDesc,
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
+ VFRange &Range) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8213,12 +8295,10 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
- !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
- !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
@@ -8227,14 +8307,13 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
+ return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
*PSE.getSE(), *OrigLoop, Range);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
*PSE.getSE());
- assert(isa<SCEVConstant>(II->getStep()));
return new VPWidenPointerInductionRecipe(
Phi, Operands[0], Step, *II,
LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8267,9 +8346,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
- VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
+ return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
+ *OrigLoop, Range);
}
return nullptr;
}
@@ -8309,7 +8388,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
assert((EdgeMask || NumIncoming == 1) &&
"Multiple predecessors with one having a full mask");
OperandsWithMask.push_back(Operands[In]);
@@ -8321,8 +8400,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
- VFRange &Range) const {
-
+ VFRange &Range,
+ VPlanPtr &Plan) {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8339,17 +8418,17 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
+ SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {
- bool NeedToScalarize = false;
+ Function *Variant;
// Is it beneficial to perform intrinsic call compared to lib
// call?
InstructionCost CallCost =
- CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ CM.getVectorCallCost(CI, VF, &Variant);
InstructionCost IntrinsicCost =
CM.getVectorIntrinsicCost(CI, VF);
return IntrinsicCost <= CallCost;
@@ -8358,6 +8437,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+ Function *Variant = nullptr;
+ ElementCount VariantVF;
+ bool NeedsMask = false;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8365,14 +8447,57 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual Call for vectorized
// version of the instruction.
- bool NeedToScalarize = false;
- CM.getVectorCallCost(CI, VF, NeedToScalarize);
- return !NeedToScalarize;
+
+ // If we've found a variant at a previous VF, then stop looking. A
+ // vectorized variant of a function expects input in a certain shape
+ // -- basically the number of input registers, the number of lanes
+ // per register, and whether there's a mask required.
+ // We store a pointer to the variant in the VPWidenCallRecipe, so
+ // once we have an appropriate variant it's only valid for that VF.
+ // This will force a different vplan to be generated for each VF that
+ // finds a valid variant.
+ if (Variant)
+ return false;
+ CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
+ // If we found a valid vector variant at this VF, then store the VF
+ // in case we need to generate a mask.
+ if (Variant)
+ VariantVF = VF;
+ return Variant != nullptr;
},
Range);
- if (ShouldUseVectorCall)
+ if (ShouldUseVectorCall) {
+ if (NeedsMask) {
+ // We have 2 cases that would require a mask:
+ // 1) The block needs to be predicated, either due to a conditional
+ // in the scalar loop or use of an active lane mask with
+ // tail-folding, and we use the appropriate mask for the block.
+ // 2) No mask is required for the block, but the only available
+ // vector variant at this VF requires a mask, so we synthesize an
+ // all-true mask.
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(CI))
+ Mask = createBlockInMask(CI->getParent(), *Plan);
+ else
+ Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+ IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
+
+ VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
+ unsigned MaskPos = 0;
+
+ for (const VFInfo &Info : VFDatabase::getMappings(*CI))
+ if (Info.Shape == Shape) {
+ assert(Info.isMasked() && "Vector function info shape mismatch");
+ MaskPos = Info.getParamIndexForOptionalMask().value();
+ break;
+ }
+
+ Ops.insert(Ops.begin() + MaskPos, Mask);
+ }
+
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
- Intrinsic::not_intrinsic);
+ Intrinsic::not_intrinsic, Variant);
+ }
return nullptr;
}
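Illustrative sketch (not part of the patch): assuming a CallInst *CI, the VariantVF recorded above and an Ops vector already holding the widened call arguments, the mask handling amounts to looking up the masked VFABI mapping for that VF and splicing the mask in at the parameter index it reports; the helper name below is made up.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h" // VFShape, VFInfo, VFDatabase

// Hypothetical helper mirroring the mask handling above; illustrative only.
static void spliceCallMask(CallInst &CI, ElementCount VariantVF, VPValue *Mask,
                           SmallVectorImpl<VPValue *> &Ops) {
  // Ask the VFABI database for the masked mapping matching this VF.
  VFShape Shape = VFShape::get(CI, VariantVF, /*HasGlobalPred=*/true);
  unsigned MaskPos = 0;
  for (const VFInfo &Info : VFDatabase::getMappings(CI))
    if (Info.Shape == Shape) {
      assert(Info.isMasked() && "variant for this shape must take a mask");
      MaskPos = Info.getParamIndexForOptionalMask().value();
      break;
    }
  // The mask becomes an ordinary call operand at the ABI-mandated position.
  Ops.insert(Ops.begin() + MaskPos, Mask);
}

When the block itself needs no predication, an all-true i1 constant takes the place of the mask, matching case 2 in the comment above.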
@@ -8405,9 +8530,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
- VPValue *Mask = createBlockInMask(I->getParent(), Plan);
- VPValue *One =
- Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+ VPValue *One = Plan->getVPValueOrAddLiveIn(
+ ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
I->getDebugLoc());
@@ -8415,38 +8540,26 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case Instruction::Add:
case Instruction::And:
case Instruction::AShr:
- case Instruction::BitCast:
case Instruction::FAdd:
case Instruction::FCmp:
case Instruction::FDiv:
case Instruction::FMul:
case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
case Instruction::FRem:
case Instruction::FSub:
case Instruction::ICmp:
- case Instruction::IntToPtr:
case Instruction::LShr:
case Instruction::Mul:
case Instruction::Or:
- case Instruction::PtrToInt:
case Instruction::Select:
- case Instruction::SExt:
case Instruction::Shl:
- case Instruction::SIToFP:
case Instruction::Sub:
- case Instruction::Trunc:
- case Instruction::UIToFP:
case Instruction::Xor:
- case Instruction::ZExt:
case Instruction::Freeze:
return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
};
@@ -8462,9 +8575,9 @@ void VPRecipeBuilder::fixHeaderPhis() {
}
}
-VPBasicBlock *VPRecipeBuilder::handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan) {
+VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
+ VFRange &Range,
+ VPlan &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -8501,83 +8614,22 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
break;
}
}
-
- auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
- IsUniform, IsPredicated);
-
- // Find if I uses a predicated instruction. If so, it will use its scalar
- // value. Avoid hoisting the insert-element which packs the scalar value into
- // a vector value, as that happens iff all users use the vector value.
- for (VPValue *Op : Recipe->operands()) {
- auto *PredR =
- dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
- if (!PredR)
- continue;
- auto *RepR = cast<VPReplicateRecipe>(
- PredR->getOperand(0)->getDefiningRecipe());
- assert(RepR->isPredicated() &&
- "expected Replicate recipe to be predicated");
- RepR->setAlsoPack(false);
- }
-
- // Finalize the recipe for Instr, first if it is not predicated.
+ VPValue *BlockInMask = nullptr;
if (!IsPredicated) {
+ // Finalize the recipe for Instr, first if it is not predicated.
LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
- setRecipe(I, Recipe);
- Plan->addVPValue(I, Recipe);
- VPBB->appendRecipe(Recipe);
- return VPBB;
- }
- LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
-
- VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
- assert(SingleSucc && "VPBB must have a single successor when handling "
- "predicated replication.");
- VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
- // Record predicated instructions for above packing optimizations.
- VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
- VPBlockUtils::insertBlockAfter(Region, VPBB);
- auto *RegSucc = new VPBasicBlock();
- VPBlockUtils::insertBlockAfter(RegSucc, Region);
- VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
- return RegSucc;
-}
-
-VPRegionBlock *
-VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
- VPlanPtr &Plan) {
- Instruction *Instr = PredRecipe->getUnderlyingInstr();
- // Instructions marked for predication are replicated and placed under an
- // if-then construct to prevent side-effects.
- // Generate recipes to compute the block mask for this region.
- VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
-
- // Build the triangular if-then region.
- std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
- assert(Instr->getParent() && "Predicated instruction not in any basic block");
- auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
- auto *PHIRecipe = Instr->getType()->isVoidTy()
- ? nullptr
- : new VPPredInstPHIRecipe(PredRecipe);
- if (PHIRecipe) {
- setRecipe(Instr, PHIRecipe);
- Plan->addVPValue(Instr, PHIRecipe);
} else {
- setRecipe(Instr, PredRecipe);
- Plan->addVPValue(Instr, PredRecipe);
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ // Instructions marked for predication are replicated and a mask operand is
+ // added initially. Masked replicate recipes will later be placed under an
+ // if-then construct to prevent side-effects. Generate recipes to compute
+ // the block mask for this region.
+ BlockInMask = createBlockInMask(I->getParent(), Plan);
}
- auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
-
- // Note: first set Entry as region entry and then connect successors starting
- // from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
- VPBlockUtils::connectBlocks(Pred, Exiting);
-
- return Region;
+ auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
+ IsUniform, BlockInMask);
+ return toVPRecipeResult(Recipe);
}
VPRecipeOrVPValueTy
@@ -8643,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return nullptr;
if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
@@ -8653,13 +8705,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
return toVPRecipeResult(new VPWidenGEPRecipe(
- GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
+ GEP, make_range(Operands.begin(), Operands.end())));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- bool InvariantCond =
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
return toVPRecipeResult(new VPWidenSelectRecipe(
- *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
+ *SI, make_range(Operands.begin(), Operands.end())));
+ }
+
+ if (auto *CI = dyn_cast<CastInst>(Instr)) {
+ return toVPRecipeResult(
+ new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
}
return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
@@ -8677,34 +8732,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
auto &ConditionalAssumes = Legal->getConditionalAssumes();
DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
- MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- // Dead instructions do not need sinking. Remove them from SinkAfter.
- for (Instruction *I : DeadInstructions)
- SinkAfter.erase(I);
-
- // Cannot sink instructions after dead instructions (there won't be any
- // recipes for them). Instead, find the first non-dead previous instruction.
- for (auto &P : Legal->getSinkAfter()) {
- Instruction *SinkTarget = P.second;
- Instruction *FirstInst = &*SinkTarget->getParent()->begin();
- (void)FirstInst;
- while (DeadInstructions.contains(SinkTarget)) {
- assert(
- SinkTarget != FirstInst &&
- "Must find a live instruction (at least the one feeding the "
- "fixed-order recurrence PHI) before reaching beginning of the block");
- SinkTarget = SinkTarget->getPrevNode();
- assert(SinkTarget != P.first &&
- "sink source equals target, no sinking required");
- }
- P.second = SinkTarget;
- }
-
- auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
- for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
- VFRange SubRange = {VF, MaxVFPlusOne};
- VPlans.push_back(
- buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
+ auto MaxVFTimes2 = MaxVF * 2;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
+ if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
+ VPlans.push_back(std::move(*Plan));
VF = SubRange.End;
}
}
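A worked illustration (not from the patch): with MinVF = 2 and MaxVF = 8, the exclusive bound MaxVF * 2 = 16 keeps 8 itself inside the last sub-range, and each attempt clamps SubRange.End down to the first VF that would need different widening decisions.

// Sketch only; the concrete VFs are invented for illustration.
ElementCount MinVF = ElementCount::getFixed(2);
ElementCount MaxVF = ElementCount::getFixed(8);
auto MaxVFTimes2 = MaxVF * 2; // exclusive upper bound, i.e. 16
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
  VFRange SubRange = {VF, MaxVFTimes2};
  // tryToBuildVPlanWithVPRecipes() may clamp SubRange.End, e.g. to 8, so one
  // plan ends up covering [2, 8) and a separate one covers [8, 16).
  VF = SubRange.End;
}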
@@ -8712,10 +8744,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- bool HasNUW,
- bool UseLaneMaskForLoopControlFlow) {
+ TailFoldingStyle Style) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- auto *StartV = Plan.getOrAddVPValue(StartIdx);
+ auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
@@ -8725,6 +8756,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
+ bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
: VPInstruction::CanonicalIVIncrement,
@@ -8732,11 +8764,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- EB->appendRecipe(CanonicalIVIncrement);
-
- if (UseLaneMaskForLoopControlFlow) {
+ if (useActiveLaneMaskForControlFlow(Style)) {
// Create the active lane mask instruction in the vplan preheader.
- VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+ VPBasicBlock *VecPreheader =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
@@ -8745,14 +8776,34 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
: VPInstruction::CanonicalIVIncrementForPart,
{StartV}, DL, "index.part.next");
- Preheader->appendRecipe(CanonicalIVIncrementParts);
+ VecPreheader->appendRecipe(CanonicalIVIncrementParts);
// Create the ActiveLaneMask instruction using the correct start values.
- VPValue *TC = Plan.getOrCreateTripCount();
+ VPValue *TC = Plan.getTripCount();
+
+ VPValue *TripCount, *IncrementValue;
+ if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+ // When avoiding a runtime check, the active.lane.mask inside the loop
+ // uses a modified trip count and the induction variable increment is
+ // done after the active.lane.mask intrinsic is called.
+ auto *TCMinusVF =
+ new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
+ VecPreheader->appendRecipe(TCMinusVF);
+ IncrementValue = CanonicalIVPHI;
+ TripCount = TCMinusVF;
+ } else {
+ // When the loop is guarded by a runtime overflow check for the loop
+ // induction variable increment by VF, we can increment the value before
+      // the get.active.lane.mask intrinsic and use the unmodified trip count.
+ EB->appendRecipe(CanonicalIVIncrement);
+ IncrementValue = CanonicalIVIncrement;
+ TripCount = TC;
+ }
+
auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
{CanonicalIVIncrementParts, TC}, DL,
"active.lane.mask.entry");
- Preheader->appendRecipe(EntryALM);
+ VecPreheader->appendRecipe(EntryALM);
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
@@ -8763,15 +8814,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
CanonicalIVIncrementParts =
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
: VPInstruction::CanonicalIVIncrementForPart,
- {CanonicalIVIncrement}, DL);
+ {IncrementValue}, DL);
EB->appendRecipe(CanonicalIVIncrementParts);
auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TC}, DL,
+ {CanonicalIVIncrementParts, TripCount}, DL,
"active.lane.mask.next");
EB->appendRecipe(ALM);
LaneMaskPhi->addOperand(ALM);
+ if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+ // Do the increment of the canonical IV after the active.lane.mask, because
+ // that value is still based off %CanonicalIVPHI
+ EB->appendRecipe(CanonicalIVIncrement);
+ }
+
// We have to invert the mask here because a true condition means jumping
// to the exit block.
auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
@@ -8781,6 +8838,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
EB->appendRecipe(BranchBack);
} else {
+ EB->appendRecipe(CanonicalIVIncrement);
+
// Add the BranchOnCount VPInstruction to the latch.
VPInstruction *BranchBack = new VPInstruction(
VPInstruction::BranchOnCount,
@@ -8804,14 +8863,13 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
for (PHINode &ExitPhi : ExitBB->phis()) {
Value *IncomingValue =
ExitPhi.getIncomingValueForBlock(ExitingBB);
- VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
+ VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
Plan.addLiveOut(&ExitPhi, V);
}
}
-VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const MapVector<Instruction *, Instruction *> &SinkAfter) {
+std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8822,12 +8880,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// process after constructing the initial VPlan.
// ---------------------------------------------------------------------------
- // Mark instructions we'll need to sink later and their targets as
- // ingredients whose recipe we'll need to record.
- for (const auto &Entry : SinkAfter) {
- RecipeBuilder.recordRecipeOf(Entry.first);
- RecipeBuilder.recordRecipeOf(Entry.second);
- }
for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind =
@@ -8852,9 +8904,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
auto applyIG = [IG, this](ElementCount VF) -> bool {
- return (VF.isVector() && // Query is illegal for VF == 1
- CM.getWideningDecision(IG->getInsertPos(), VF) ==
- LoopVectorizationCostModel::CM_Interleave);
+ bool Result = (VF.isVector() && // Query is illegal for VF == 1
+ CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ // For scalable vectors, the only interleave factor currently supported
+ // is 2 since we require the (de)interleave2 intrinsics instead of
+ // shufflevectors.
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+ "Unsupported interleave factor for scalable vectors");
+ return Result;
};
if (!getDecisionAndClampRange(applyIG, Range))
continue;
@@ -8869,26 +8927,34 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, starting with a block for the pre-header,
- // followed by a region for the vector loop, followed by the middle block. The
- // skeleton vector loop region contains a header and latch block.
- VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
- auto Plan = std::make_unique<VPlan>(Preheader);
-
+ // Create initial VPlan skeleton, having a basic block for the pre-header
+ // which contains SCEV expansions that need to happen before the CFG is
+ // modified; a basic block for the vector pre-header, followed by a region for
+ // the vector loop, followed by the middle basic block. The skeleton vector
+ // loop region contains a header and latch basic blocks.
+ VPlanPtr Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE());
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
+ VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+  // Don't use getDecisionAndClampRange here, because we don't know the UF,
+  // so it is better to be conservative here rather than to split the range
+  // up into different VPlans.
+ bool IVUpdateMayOverflow = false;
+ for (ElementCount VF : Range)
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+
Instruction *DLInst =
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- !CM.foldTailByMasking(),
- CM.useActiveLaneMaskForControlFlow());
+ CM.getTailFoldingStyle(IVUpdateMayOverflow));
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -8896,18 +8962,16 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
DFS.perform(LI);
VPBasicBlock *VPBB = HeaderVPBB;
- SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
- unsigned VPBBsForBB = 0;
if (VPBB != HeaderVPBB)
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
+ for (Instruction &I : BB->instructionsWithoutDebug(false)) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
@@ -8918,7 +8982,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
- Operands.push_back(Plan->getOrAddVPValue(
+ Operands.push_back(Plan->getVPValueOrAddLiveIn(
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
} else {
auto OpRange = Plan->mapToVPValues(Instr->operands());
@@ -8932,50 +8996,36 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, VPBB, Plan)) {
- // If Instr can be simplified to an existing VPValue, use it.
- if (RecipeOrValue.is<VPValue *>()) {
- auto *VPV = RecipeOrValue.get<VPValue *>();
- Plan->addVPValue(Instr, VPV);
- // If the re-used value is a recipe, register the recipe for the
- // instruction, in case the recipe for Instr needs to be recorded.
- if (VPRecipeBase *R = VPV->getDefiningRecipe())
- RecipeBuilder.setRecipe(Instr, R);
- continue;
- }
- // Otherwise, add the new recipe.
- VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
- for (auto *Def : Recipe->definedValues()) {
- auto *UV = Def->getUnderlyingValue();
- Plan->addVPValue(UV, Def);
- }
-
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
- HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
- // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
- // of the header block. That can happen for truncates of induction
- // variables. Those recipes are moved to the phi section of the header
- // block after applying SinkAfter, which relies on the original
- // position of the trunc.
- assert(isa<TruncInst>(Instr));
- InductionsToMove.push_back(
- cast<VPWidenIntOrFpInductionRecipe>(Recipe));
- }
- RecipeBuilder.setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
+ auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
+ Instr, Operands, Range, VPBB, Plan);
+ if (!RecipeOrValue)
+ RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
+ // If Instr can be simplified to an existing VPValue, use it.
+ if (isa<VPValue *>(RecipeOrValue)) {
+ auto *VPV = cast<VPValue *>(RecipeOrValue);
+ Plan->addVPValue(Instr, VPV);
+ // If the re-used value is a recipe, register the recipe for the
+ // instruction, in case the recipe for Instr needs to be recorded.
+ if (VPRecipeBase *R = VPV->getDefiningRecipe())
+ RecipeBuilder.setRecipe(Instr, R);
continue;
}
-
- // Otherwise, if all widening options failed, Instruction is to be
- // replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB =
- RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
- if (NextVPBB != VPBB) {
- VPBB = NextVPBB;
- VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
- : "");
+ // Otherwise, add the new recipe.
+ VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
+ for (auto *Def : Recipe->definedValues()) {
+ auto *UV = Def->getUnderlyingValue();
+ Plan->addVPValue(UV, Def);
}
+
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
+ HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
+ // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
+ // phi section of HeaderVPBB.
+ assert(isa<TruncInst>(Instr));
+ Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+ } else
+ VPBB->appendRecipe(Recipe);
}
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
@@ -8985,7 +9035,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// After here, VPBB should not be used.
VPBB = nullptr;
- addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+ if (CM.requiresScalarEpilogue(Range)) {
+ // No edge from the middle block to the unique exit block has been inserted
+ // and there is nothing to fix from vector loop; phis should have incoming
+ // from scalar loop only.
+ } else
+ addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8998,116 +9053,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
- // Apply Sink-After legal constraints.
- auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
- if (Region && Region->isReplicator()) {
- assert(Region->getNumSuccessors() == 1 &&
- Region->getNumPredecessors() == 1 && "Expected SESE region!");
- assert(R->getParent()->size() == 1 &&
- "A recipe in an original replicator region must be the only "
- "recipe in its block");
- return Region;
- }
- return nullptr;
- };
- for (const auto &Entry : SinkAfter) {
- VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
- VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
-
- auto *TargetRegion = GetReplicateRegion(Target);
- auto *SinkRegion = GetReplicateRegion(Sink);
- if (!SinkRegion) {
- // If the sink source is not a replicate region, sink the recipe directly.
- if (TargetRegion) {
- // The target is in a replication region, make sure to move Sink to
- // the block after it, not into the replication region itself.
- VPBasicBlock *NextBlock =
- cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
- Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
- } else
- Sink->moveAfter(Target);
- continue;
- }
-
- // The sink source is in a replicate region. Unhook the region from the CFG.
- auto *SinkPred = SinkRegion->getSinglePredecessor();
- auto *SinkSucc = SinkRegion->getSingleSuccessor();
- VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
- VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
- VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
-
- if (TargetRegion) {
- // The target recipe is also in a replicate region, move the sink region
- // after the target region.
- auto *TargetSucc = TargetRegion->getSingleSuccessor();
- VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
- VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
- VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
- } else {
- // The sink source is in a replicate region, we need to move the whole
- // replicate region, which should only contain a single recipe in the
- // main block.
- auto *SplitBlock =
- Target->getParent()->splitAt(std::next(Target->getIterator()));
-
- auto *SplitPred = SplitBlock->getSinglePredecessor();
-
- VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
- VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
- VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
- }
- }
-
- VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
- VPlanTransforms::removeRedundantInductionCasts(*Plan);
-
- // Now that sink-after is done, move induction recipes for optimized truncates
- // to the phi section of the header block.
- for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
- Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
-
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
RecipeBuilder, Range.Start);
- // Introduce a recipe to combine the incoming and previous values of a
- // fixed-order recurrence.
- for (VPRecipeBase &R :
- Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
- if (!RecurPhi)
- continue;
-
- VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
- // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
- // to terminate.
- while (auto *PrevPhi =
- dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
- PrevRecipe = &PrevPhi->getBackedgeRecipe();
- VPBasicBlock *InsertBlock = PrevRecipe->getParent();
- auto *Region = GetReplicateRegion(PrevRecipe);
- if (Region)
- InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
- if (!InsertBlock) {
- InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
- VPBlockUtils::insertBlockAfter(InsertBlock, Region);
- }
- if (Region || PrevRecipe->isPhi())
- Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
- else
- Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
-
- auto *RecurSplice = cast<VPInstruction>(
- Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
- {RecurPhi, RecurPhi->getBackedgeValue()}));
-
- RecurPhi->replaceAllUsesWith(RecurSplice);
- // Set the first operand of RecurSplice to RecurPhi again, after replacing
- // all users.
- RecurSplice->setOperand(0, RecurPhi);
- }
-
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
@@ -9122,48 +9071,66 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
StoredValues.push_back(StoreR->getStoredValue());
}
+ bool NeedsMaskForGaps =
+ IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
- Recipe->getMask());
+ Recipe->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(Recipe);
unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
+ VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
if (!Member->getType()->isVoidTy()) {
- VPValue *OriginalV = Plan->getVPValue(Member);
- Plan->removeVPValueFor(Member);
- Plan->addVPValue(Member, VPIG->getVPValue(J));
+ VPValue *OriginalV = MemberR->getVPSingleValue();
OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
J++;
}
- RecipeBuilder.getRecipe(Member)->eraseFromParent();
+ MemberR->eraseFromParent();
}
}
- for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
- VF *= 2)
+ for (ElementCount VF : Range)
Plan->addVF(VF);
Plan->setName("Initial VPlan");
+ // Replace VPValues for known constant strides guaranteed by predicate scalar
+ // evolution.
+ for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
+ auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
+ auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
+ // Only handle constant strides for now.
+ if (!ScevStride)
+ continue;
+ Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
+
+ auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
+ // The versioned value may not be used in the loop directly, so just add a
+ // new live-in in those cases.
+ Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+ }
+
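  // Illustrative aside (not from the patch): this versioning pays off for
  // loops like
  //
  //   void scale(float *A, const float *B, long N, long Stride) {
  //     for (long I = 0; I < N; ++I)
  //       A[I] += B[I * Stride];
  //   }
  //
  // where LoopAccessAnalysis records Stride as a symbolic stride and runtime
  // versioning installs a Stride == 1 predicate. Rewriting the stride VPValue
  // to the constant 1 lets B[I * Stride] be treated as a consecutive access
  // and emitted as a wide load rather than a gather.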
// From this point onwards, VPlan-to-VPlan transformations may change the plan
// in ways that accessing values using original IR values is incorrect.
Plan->disableValue2VPValue();
+ // Sink users of fixed-order recurrence past the recipe defining the previous
+ // value and introduce FirstOrderRecurrenceSplice VPInstructions.
+ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
+ return std::nullopt;
+
+ VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
+ VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
VPlanTransforms::removeDeadRecipes(*Plan);
- bool ShouldSimplify = true;
- while (ShouldSimplify) {
- ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
- ShouldSimplify |=
- VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
- ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
- }
+ VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);
VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
- return Plan;
+ return std::make_optional(std::move(Plan));
}
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
@@ -9175,21 +9142,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
// Create new empty VPlan
- auto Plan = std::make_unique<VPlan>();
+ auto Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE());
// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();
- for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
- VF *= 2)
+ for (ElementCount VF : Range)
Plan->addVF(VF);
- SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
- OrigLoop, Plan,
+ Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
- DeadInstructions, *PSE.getSE(), *TLI);
+ *PSE.getSE(), *TLI);
// Remove the existing terminator of the exiting block of the top-most region.
// A BranchOnCount will be added instead when adding the canonical IV recipes.
@@ -9198,7 +9165,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Term->eraseFromParent();
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- true, CM.useActiveLaneMaskForControlFlow());
+ CM.getTailFoldingStyle());
return Plan;
}
@@ -9255,7 +9222,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(WidenRecipe->getParent(),
WidenRecipe->getIterator());
- CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
+ CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan);
}
if (IsFMulAdd) {
@@ -9270,7 +9237,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VecOp = FMulRecipe;
}
VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+ new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
@@ -9304,13 +9271,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (!PhiR || PhiR->isInLoop())
continue;
VPValue *Cond =
- RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Red = PhiR->getBackedgeValue();
assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
}
+
+ VPlanTransforms::clearReductionWrapFlags(*Plan);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -9475,7 +9444,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
- Value *Step = State.get(getOperand(1), VPIteration(0, Part));
+ Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
Value *SclrGep = emitTransformedIndex(
State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
SclrGep->setName("next.gep");
@@ -9485,8 +9454,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
return;
}
- assert(isa<SCEVConstant>(IndDesc.getStep()) &&
- "Induction step not a SCEV constant!");
Type *PhiType = IndDesc.getStep()->getType();
// Build a pointer phi
@@ -9506,7 +9473,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
Value *NumUnrolledElems =
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
Value *InductionGEP = GetElementPtrInst::Create(
- IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
InductionLoc);
// Add induction update using an incorrect block temporarily. The phi node
@@ -9529,10 +9496,10 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));
- assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
+ assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
"scalar step must be the same across all parts");
Value *GEP = State.Builder.CreateGEP(
- IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(
StartOffset,
State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
@@ -9584,7 +9551,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
- getStoredValues(), getMask());
+ getStoredValues(), getMask(),
+ NeedsMaskForGaps);
}
void VPReductionRecipe::execute(VPTransformState &State) {
@@ -9640,10 +9608,9 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(UI, this, *State.Instance,
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
// Insert scalar instance packing it into a vector.
- if (AlsoPack && State.VF.isVector()) {
+ if (State.VF.isVector() && shouldPack()) {
// If we're constructing lane 0, initialize to start from poison.
if (State.Instance->Lane.isFirstLane()) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
@@ -9663,8 +9630,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
all_of(operands(), [](VPValue *Op) {
return Op->isDefinedOutsideVectorRegions();
})) {
- State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
if (user_begin() != user_end()) {
for (unsigned Part = 1; Part < State.UF; ++Part)
State.set(this, State.get(this, VPIteration(0, 0)),
@@ -9676,16 +9642,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
// Uniform within VL means we need to generate lane 0 only for each
// unrolled copy.
for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
return;
}
- // A store of a loop varying value to a loop invariant address only
- // needs only the last copy of the store.
- if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
+ // A store of a loop varying value to a uniform address only needs the last
+ // copy of the store.
+ if (isa<StoreInst>(UI) &&
+ vputils::isUniformAfterVectorization(getOperand(1))) {
auto Lane = VPLane::getLastLaneForVF(State.VF);
- State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
State);
return;
}
@@ -9695,8 +9661,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9714,7 +9679,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGatherScatter = !Consecutive;
+ bool CreateGatherScatter = !isConsecutive();
auto &Builder = State.Builder;
InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
@@ -9725,36 +9690,39 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
+ Value *PartPtr = nullptr;
+
+ // Use i32 for the gep index type when the value is constant,
+ // or query DataLayout for a more suitable index type otherwise.
+ const DataLayout &DL =
+ Builder.GetInsertBlock()->getModule()->getDataLayout();
+ Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
+ ? DL.getIndexType(ScalarDataTy->getPointerTo())
+ : Builder.getInt32Ty();
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();
- if (Reverse) {
+ if (isReverse()) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
// RunTimeVF = VScale * VF.getKnownMinValue()
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+ Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
// NumElt = -Part * RunTimeVF
- Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ Value *NumElt =
+ Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
// LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+ Value *LastLane =
+ Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
PartPtr =
- cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
- PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
- PartPtr->setIsInBounds(InBounds);
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] =
Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
} else {
- Value *Increment =
- createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
}
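    // Worked numbers, purely illustrative: with a fixed VF of 4 (so RunTimeVF
    // is 4), the reverse path above computes, per part,
    //   Part 0: NumElt =  0, LastLane = -3 -> wide access covers Ptr[-3 .. 0]
    //   Part 1: NumElt = -4, LastLane = -3 -> wide access covers Ptr[-7 .. -4]
    // and the later CreateVectorReverse calls restore the scalar iteration
    // order within each loaded or stored vector.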
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
@@ -9774,7 +9742,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
} else {
- if (Reverse) {
+ if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
@@ -9833,7 +9801,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
@@ -9869,7 +9836,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
+ TailFoldingInfo TFI(TLI, &LVL, IAI);
+ if (TTI->preferPredicateOverEpilogue(&TFI))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
@@ -9880,9 +9848,29 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
if (hasVectorValue(Def, Part))
return Data.PerPartOutput[Def][Part];
+ auto GetBroadcastInstrs = [this, Def](Value *V) {
+ bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
+ if (VF.isScalar())
+ return V;
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist) {
+ BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
+ Plan->getVectorLoopRegion()->getSinglePredecessor())];
+ if (LoopVectorPreHeader)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ }
+
+    // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+ };
+
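  // Illustrative aside (not part of the patch): for a live-in scalar %x at
  // VF = 4, CreateVectorSplat emits roughly
  //   %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i64 0
  //   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert,
  //                                    <4 x float> poison, <4 x i32> zeroinitializer
  // and placing it in the vector preheader materializes the splat once
  // instead of once per unrolled part inside the loop body.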
if (!hasScalarValue(Def, {Part, 0})) {
Value *IRV = Def->getLiveInIRValue();
- Value *B = ILV->getBroadcastInstrs(IRV);
+ Value *B = GetBroadcastInstrs(IRV);
set(Def, B, Part);
return B;
}
@@ -9900,9 +9888,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
+ // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
+ // VPExpandSCEVRecipes can also be uniform.
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -9927,7 +9917,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
// State, we will only generate the insertelements once.
Value *VectorValue = nullptr;
if (IsUniform) {
- VectorValue = ILV->getBroadcastInstrs(ScalarValue);
+ VectorValue = GetBroadcastInstrs(ScalarValue);
set(Def, VectorValue, Part);
} else {
// Initialize packing with insertelements to start from undef.
@@ -9962,15 +9952,15 @@ static bool processLoopInVPlanNativePath(
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10231,8 +10221,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -10309,11 +10299,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
- CM.collectValuesToIgnore();
- CM.collectElementTypesForWidening();
-
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+ ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -10342,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
- !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+ !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
*PSE.getSE())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
@@ -10464,7 +10452,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
- CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+ LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
if (EpilogueVF.Width.isVector()) {
// The first pass vectorizes the main loop and creates a scalar epilogue
@@ -10475,8 +10463,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks);
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
- LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
- DT, true);
+ auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+ BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10492,6 +10480,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
+      // Re-use the trip count and steps expanded for the main loop, as
+      // skeleton creation needs them as values that dominate both the scalar
+      // and vector epilogue loops.
+ // TODO: This is a workaround needed for epilogue vectorization and it
+ // should be removed once induction resume value creation is done
+ // directly in VPlan.
+ EpilogILV.setTripCount(MainILV.getTripCount());
+ for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
+ auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+ auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+ ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+ ExpandR->replaceAllUsesWith(ExpandedVal);
+ ExpandR->eraseFromParent();
+ }
+
// Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
// VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
// before vectorizing the epilogue loop.
@@ -10520,15 +10523,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
+ IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ {EPI.MainLoopIterationCountCheck});
}
assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+ VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT, true);
+ DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10581,14 +10585,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
- DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
DT = &DT_;
- BFI = &BFI_;
+ BFI = BFI_;
TLI = TLI_;
AC = &AC_;
LAIs = &LAIs_;
@@ -10604,7 +10608,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
// vector registers, loop vectorization may still enable scalar
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
- TTI->getMaxInterleaveFactor(1) < 2)
+ TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
return LoopVectorizeResult(false, false);
bool Changed = false, CFGChanged = false;
@@ -10656,7 +10660,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
@@ -10666,12 +10669,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ BlockFrequencyInfo *BFI = nullptr;
+ if (PSI && PSI->hasProfileSummary())
+ BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
LoopVectorizeResult Result =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ if (isAssignmentTrackingEnabled(*F.getParent())) {
+ for (auto &BB : F)
+ RemoveRedundantDbgInstrs(&BB);
+ }
+
// We currently do not preserve loopinfo/dominator analyses with outer loop
// vectorization. Until this is addressed, mark these analyses as preserved
// only for non-VPlan-native path.
@@ -10679,6 +10690,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
if (!EnableVPlanNativePath) {
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+
+#ifdef EXPENSIVE_CHECKS
+ SE.verify();
+#endif
}
if (Result.MadeCFGChange) {
@@ -10699,8 +10715,8 @@ void LoopVectorizePass::printPipeline(
static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
OS, MapClassName2PassName);
- OS << "<";
+ OS << '<';
OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
- OS << ">";
+ OS << '>';
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3eb6b1804e7..821a3fa22a85 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -87,7 +87,6 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -126,6 +125,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
+// even if we match a reduction but do not vectorize in the end.
+static cl::opt<bool> AllowHorRdxIdenityOptimization(
+ "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
+ cl::desc("Allow optimization of original scalar identity operations on "
+ "matched horizontal reductions."));
+
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -287,7 +293,7 @@ static bool isCommutative(Instruction *I) {
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
- unsigned Offset = 0) {
+ unsigned Offset = 0) {
int Index = Offset;
if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
@@ -342,16 +348,16 @@ enum class UseMask {
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
UseMask MaskArg) {
SmallBitVector UseMask(VF, true);
- for (auto P : enumerate(Mask)) {
- if (P.value() == UndefMaskElem) {
+ for (auto [Idx, Value] : enumerate(Mask)) {
+ if (Value == PoisonMaskElem) {
if (MaskArg == UseMask::UndefsAsMask)
- UseMask.reset(P.index());
+ UseMask.reset(Idx);
continue;
}
- if (MaskArg == UseMask::FirstArg && P.value() < VF)
- UseMask.reset(P.value());
- else if (MaskArg == UseMask::SecondArg && P.value() >= VF)
- UseMask.reset(P.value() - VF);
+ if (MaskArg == UseMask::FirstArg && Value < VF)
+ UseMask.reset(Value);
+ else if (MaskArg == UseMask::SecondArg && Value >= VF)
+ UseMask.reset(Value - VF);
}
return UseMask;
}
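
As a rough illustration of what buildUseMask computes, the following standalone sketch (plain STL types and hypothetical names, not the LLVM ones) mirrors the same bookkeeping: start with all bits set and clear the bit for every lane of the chosen operand that the shuffle mask actually selects.

// Minimal standalone model of buildUseMask; kPoisonMaskElem, UseMaskArg and
// buildUseMaskModel are illustrative stand-ins, not LLVM API.
#include <cstdio>
#include <vector>

constexpr int kPoisonMaskElem = -1;

enum class UseMaskArg { FirstArg, SecondArg, UndefsAsMask };

std::vector<bool> buildUseMaskModel(int VF, const std::vector<int> &Mask,
                                    UseMaskArg Arg) {
  std::vector<bool> Use(VF, true);
  for (int Idx = 0, E = (int)Mask.size(); Idx < E; ++Idx) {
    int Value = Mask[Idx];
    if (Value == kPoisonMaskElem) {
      if (Arg == UseMaskArg::UndefsAsMask)
        Use[Idx] = false;
      continue;
    }
    if (Arg == UseMaskArg::FirstArg && Value < VF)
      Use[Value] = false;
    else if (Arg == UseMaskArg::SecondArg && Value >= VF)
      Use[Value - VF] = false;
  }
  return Use;
}

int main() {
  // Mask <0, 5, poison, 2> over two 4-wide operands: lanes 0 and 2 of the
  // first operand are selected, so only bits 1 and 3 stay set for FirstArg.
  for (bool B : buildUseMaskModel(4, {0, 5, kPoisonMaskElem, 2},
                                  UseMaskArg::FirstArg))
    std::printf("%d", (int)B);
  std::printf("\n"); // prints 0101
}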
@@ -374,9 +380,9 @@ static SmallBitVector isUndefVector(const Value *V,
if (!UseMask.empty()) {
const Value *Base = V;
while (auto *II = dyn_cast<InsertElementInst>(Base)) {
+ Base = II->getOperand(0);
if (isa<T>(II->getOperand(1)))
continue;
- Base = II->getOperand(0);
std::optional<unsigned> Idx = getInsertIndex(II);
if (!Idx)
continue;
@@ -461,7 +467,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
// Undef can be represented as an undef element in a vector.
if (isa<UndefValue>(VL[I]))
@@ -533,6 +539,117 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
return *EI->idx_begin();
}
+/// Tries to find extractelement instructions with constant indices from a fixed
+/// vector type and gathers such instructions into a bunch, which very likely
+/// can be detected as a shuffle of 1 or 2 input vectors. If this attempt is
+/// successful, the matched scalars are replaced by poison values in \p VL for
+/// future analysis.
+static std::optional<TTI::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask) {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+ SmallVector<int> UndefVectorExtracts;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI) {
+ if (isa<UndefValue>(VL[I]))
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+ if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+ continue;
+ std::optional<unsigned> Idx = getExtractIndex(EI);
+ // Undefined index.
+ if (!Idx) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+ ExtractMask.reset(*Idx);
+ if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+ }
+ // Sort the vector operands by the maximum number of uses in extractelements.
+ MapVector<unsigned, SmallVector<Value *>> VFToVector;
+ for (const auto &Data : VectorOpToIdx)
+ VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+ .push_back(Data.first);
+ for (auto &Data : VFToVector) {
+ stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+ return VectorOpToIdx.find(V1)->second.size() >
+ VectorOpToIdx.find(V2)->second.size();
+ });
+ }
+ // Find the best pair of vectors with the same number of elements, or the
+ // best single vector.
+ const int UndefSz = UndefVectorExtracts.size();
+ unsigned SingleMax = 0;
+ Value *SingleVec = nullptr;
+ unsigned PairMax = 0;
+ std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+ for (auto &Data : VFToVector) {
+ Value *V1 = Data.second.front();
+ if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+ SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+ SingleVec = V1;
+ }
+ Value *V2 = nullptr;
+ if (Data.second.size() > 1)
+ V2 = *std::next(Data.second.begin());
+ if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+ UndefSz) {
+ PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+ PairVec = std::make_pair(V1, V2);
+ }
+ }
+ if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+ return std::nullopt;
+ // Check whether it is better to perform a shuffle of two vectors or just of
+ // a single vector.
+ SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+ SmallVector<Value *> GatheredExtracts(
+ VL.size(), PoisonValue::get(VL.front()->getType()));
+ if (SingleMax >= PairMax && SingleMax) {
+ for (int Idx : VectorOpToIdx[SingleVec])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ } else {
+ for (Value *V : {PairVec.first, PairVec.second})
+ for (int Idx : VectorOpToIdx[V])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ }
+ // Add extracts from undefs too.
+ for (int Idx : UndefVectorExtracts)
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ // Check that the gather of extractelements can be represented as just a
+ // shuffle of one or two of the vectors the scalars are extracted from.
+ std::optional<TTI::ShuffleKind> Res =
+ isFixedVectorShuffle(GatheredExtracts, Mask);
+ if (!Res) {
+ // TODO: try to check other subsets if possible.
+ // Restore the original VL if attempt was not successful.
+ VL.swap(SavedVL);
+ return std::nullopt;
+ }
+ // Restore unused scalars from mask, if some of the extractelements were not
+ // selected for shuffle.
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+ !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+ is_contained(UndefVectorExtracts, I))
+ continue;
+ if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
+ std::swap(VL[I], GatheredExtracts[I]);
+ }
+ return Res;
+}
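
The single-vs-pair selection above can be pictured with a much simpler standalone model (plain STL, hypothetical names; the real code additionally requires the two vectors of a pair to have the same element count, which is elided here): group the gathered scalars by the vector they extract from, count undefs toward either choice, and keep whichever covering reaches more elements.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // One entry per gathered scalar: the source vector it extracts from, or ""
  // for an undef element. Purely illustrative data.
  std::vector<std::string> Sources = {"a", "b", "a", "", "b", "a"};

  std::map<std::string, int> Count;
  int UndefCount = 0;
  for (const std::string &S : Sources) {
    if (S.empty())
      ++UndefCount;
    else
      ++Count[S];
  }

  // Best single source vector.
  std::string SingleVec;
  int SingleMax = 0;
  for (const auto &KV : Count)
    if (KV.second + UndefCount > SingleMax) {
      SingleMax = KV.second + UndefCount;
      SingleVec = KV.first;
    }

  // Best pair of source vectors.
  int PairMax = 0;
  std::string Pair1, Pair2;
  for (const auto &A : Count)
    for (const auto &B : Count)
      if (A.first < B.first && A.second + B.second + UndefCount > PairMax) {
        PairMax = A.second + B.second + UndefCount;
        Pair1 = A.first;
        Pair2 = B.first;
      }

  if (SingleMax >= PairMax)
    std::printf("shuffle of one vector '%s' covers %d of %zu elements\n",
                SingleVec.c_str(), SingleMax, Sources.size());
  else
    std::printf("shuffle of two vectors '%s'+'%s' covers %d of %zu elements\n",
                Pair1.c_str(), Pair2.c_str(), PairMax, Sources.size());
}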
+
namespace {
/// Main data required for vectorization of instructions.
@@ -829,18 +946,29 @@ static bool isSimple(Instruction *I) {
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
+/// one but two input vectors.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
+ bool ExtendingManyInputs = false) {
if (SubMask.empty())
return;
+ assert(
+ (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
+ // Check if input scalars were extended to match the size of other node.
+ (SubMask.size() == Mask.size() &&
+ std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
+ [](int Idx) { return Idx == PoisonMaskElem; }))) &&
+ "SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
}
- SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue)
+ if (SubMask[I] == PoisonMaskElem ||
+ (!ExtendingManyInputs &&
+ (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
continue;
NewMask[I] = Mask[SubMask[I]];
}
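
For the common case (no poison elements, ExtendingManyInputs == false) the composition performed here boils down to NewMask[I] = Mask[SubMask[I]]; a tiny standalone example with illustrative data:

// Standalone sketch of the mask composition; everything here is illustrative.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {2, 0, 3, 1};    // previously accumulated permutation
  std::vector<int> SubMask = {1, 1, 0, 2}; // shuffle applied on top of it

  std::vector<int> NewMask(SubMask.size());
  for (size_t I = 0; I < SubMask.size(); ++I)
    NewMask[I] = Mask[SubMask[I]];

  for (int V : NewMask)
    std::printf("%d ", V); // prints: 0 0 2 3
  std::printf("\n");
}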
@@ -887,7 +1015,7 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
- Mask.resize(E, UndefMaskElem);
+ Mask.resize(E, PoisonMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
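
A short standalone illustration of the inversion: the output mask satisfies Mask[Indices[I]] = I, i.e. it is the inverse of the permutation described by Indices (values and names below are illustrative).

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> Indices = {2, 0, 3, 1};
  std::vector<int> Mask(Indices.size(), -1); // -1 models PoisonMaskElem
  for (unsigned I = 0; I < Indices.size(); ++I)
    Mask[Indices[I]] = static_cast<int>(I);
  for (int V : Mask)
    std::printf("%d ", V); // prints: 1 3 0 2
  std::printf("\n");
}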
@@ -900,7 +1028,7 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
UndefValue::get(Scalars.front()->getType()));
Prev.swap(Scalars);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Scalars[Mask[I]] = Prev[I];
}
@@ -962,6 +1090,7 @@ namespace slpvectorizer {
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
+ class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
public:
@@ -1006,8 +1135,12 @@ public:
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced by the
/// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot = nullptr);
+ /// \param ReplacedExternals contains the list of replaced external values
+ /// {scalar, replace} after emitting extractelement for external uses.
+ Value *
+ vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot = nullptr);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -1025,24 +1158,18 @@ public:
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Checks if the very first tree node is going to be vectorized.
- bool isVectorizedFirstNode() const {
- return !VectorizableTree.empty() &&
- VectorizableTree.front()->State == TreeEntry::Vectorize;
- }
-
- /// Returns the main instruction for the very first node.
- Instruction *getFirstNodeMainOp() const {
- assert(!VectorizableTree.empty() && "No tree to get the first node from");
- return VectorizableTree.front()->getMainOp();
- }
-
/// Returns whether the root node has in-tree uses.
bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&
!VectorizableTree.front()->UserTreeIndices.empty();
}
+ /// Return the scalars of the root node.
+ ArrayRef<Value *> getRootNodeScalars() const {
+ assert(!VectorizableTree.empty() && "No graph to get the first node from");
+ return VectorizableTree.front()->Scalars;
+ }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -1064,6 +1191,8 @@ public:
MinBWs.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
+ PostponedGathers.clear();
+ ValueToGatherNodes.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -1083,9 +1212,12 @@ public:
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
+ /// \return std::nullopt if ordering is not important, an empty order if the
+ /// identity order is important, or otherwise the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+ std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
+ bool TopToBottom);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1328,8 +1460,14 @@ public:
ConstantInt *Ex1Idx;
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
+ // The compiler can easily combine poison and extractelement <non-poison> or
+ // undef and extractelement <poison>. But combining undef +
+ // extractelement <non-poison-but-may-produce-poison> requires some
+ // extra operations.
if (isa<UndefValue>(V2))
- return LookAheadHeuristics::ScoreConsecutiveExtracts;
+ return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
+ ? LookAheadHeuristics::ScoreConsecutiveExtracts
+ : LookAheadHeuristics::ScoreSameOpcode;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
@@ -1683,9 +1821,10 @@ public:
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
- std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes,
- ArrayRef<Value *> MainAltOps) {
+ std::optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes,
+ ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -2299,7 +2438,8 @@ private:
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals);
+ ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -2323,15 +2463,13 @@ private:
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(const TreeEntry *E);
+ template <typename BVTy, typename ResTy, typename... Args>
+ ResTy processBuildVector(const TreeEntry *E, Args &...Params);
- /// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars. If \p
- /// NeedToShuffle is true, need to add a cost of reshuffling some of the
- /// vector elements.
- InstructionCost getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const;
+ /// Create a new vector from a list of scalar values. Produces a sequence
+ /// which exploits values reused across lanes, and arranges the inserts
+ /// for ease of later optimization.
+ Value *createBuildVector(const TreeEntry *E);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2354,14 +2492,16 @@ private:
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(const TreeEntry *E);
- /// \returns a vector from a collection of scalars in \p VL.
- Value *gather(ArrayRef<Value *> VL);
+ /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
+ /// specified, the starting vector value is poison.
+ Value *gather(ArrayRef<Value *> VL, Value *Root);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -2400,6 +2540,14 @@ private:
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
+ /// \returns Common mask for reorder indices and reused scalars.
+ SmallVector<int> getCommonMask() const {
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ ::addMask(Mask, ReuseShuffleIndices);
+ return Mask;
+ }
+
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -2409,8 +2557,8 @@ private:
std::equal(VL.begin(), VL.end(), Mask.begin(),
[Scalars](Value *V, int Idx) {
return (isa<UndefValue>(V) &&
- Idx == UndefMaskElem) ||
- (Idx != UndefMaskElem && V == Scalars[Idx]);
+ Idx == PoisonMaskElem) ||
+ (Idx != PoisonMaskElem && V == Scalars[Idx]);
});
};
if (!ReorderIndices.empty()) {
@@ -2471,7 +2619,7 @@ private:
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue = nullptr;
+ WeakTrackingVH VectorizedValue = nullptr;
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
@@ -2684,20 +2832,22 @@ private:
#ifndef NDEBUG
void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
- InstructionCost VecCost,
- InstructionCost ScalarCost) const {
- dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+ InstructionCost VecCost, InstructionCost ScalarCost,
+ StringRef Banner) const {
+ dbgs() << "SLP: " << Banner << ":\n";
+ E->dump();
dbgs() << "SLP: Costs:\n";
dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
dbgs() << "SLP: VectorCost = " << VecCost << "\n";
dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
- dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
- ReuseShuffleCost + VecCost - ScalarCost << "\n";
+ dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
+ << ReuseShuffleCost + VecCost - ScalarCost << "\n";
}
#endif
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+ std::optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = std::nullopt,
@@ -2791,8 +2941,14 @@ private:
return ScalarToTreeEntry.lookup(V);
}
+ /// Checks if the specified list of instructions/values can be vectorized
+ /// and fills required data before actual scheduling of the instructions.
+ TreeEntry::EntryState getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+
/// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -2808,6 +2964,15 @@ private:
/// pre-gather them before.
DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
+ /// The list of gather nodes that depend on other gather/vector nodes and
+ /// should be emitted after the vector instruction emission process to
+ /// correctly handle the order of the vector instructions and shuffles.
+ SetVector<const TreeEntry *> PostponedGathers;
+
+ using ValueToGatherNodesMap =
+ DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
+ ValueToGatherNodesMap ValueToGatherNodes;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
@@ -3235,7 +3400,6 @@ private:
<< "SLP: gets ready (ctl): " << *DepBundle << "\n");
}
}
-
}
}
@@ -3579,7 +3743,7 @@ static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
SmallVector<int> Prev(Reuses.begin(), Reuses.end());
Prev.swap(Reuses);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Reuses[Mask[I]] = Prev[I];
}
@@ -3603,7 +3767,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
}
Order.assign(Mask.size(), Mask.size());
for (unsigned I = 0, E = Mask.size(); I < E; ++I)
- if (MaskOrder[I] != UndefMaskElem)
+ if (MaskOrder[I] != PoisonMaskElem)
Order[MaskOrder[I]] = I;
fixupOrderingIndices(Order);
}
@@ -3653,10 +3817,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return false;
return true;
};
- if (IsIdentityOrder(CurrentOrder)) {
- CurrentOrder.clear();
- return CurrentOrder;
- }
+ if (IsIdentityOrder(CurrentOrder))
+ return OrdersType();
auto *It = CurrentOrder.begin();
for (unsigned I = 0; I < NumScalars;) {
if (UsedPositions.test(I)) {
@@ -3669,7 +3831,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
++It;
}
- return CurrentOrder;
+ return std::move(CurrentOrder);
}
return std::nullopt;
}
@@ -3779,9 +3941,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
return LoadsState::Gather;
}
-bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices) {
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
assert(llvm::all_of(
VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
"Expected list of pointer operands.");
@@ -3825,7 +3987,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
return std::get<1>(X) < std::get<1>(Y);
});
int InitialOffset = std::get<1>(Vec[0]);
- AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+ AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
});
}
@@ -3862,7 +4024,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
BoUpSLP::OrdersType Order;
if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
- return Order;
+ return std::move(Order);
return std::nullopt;
}
@@ -3888,31 +4050,35 @@ static bool areTwoInsertFromSameBuildVector(
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
+ SmallSet<int, 8> ReusedIdx;
+ bool IsReusedIdx = false;
do {
- if (IE2 == VU)
+ if (IE2 == VU && !IE1)
return VU->hasOneUse();
- if (IE1 == V)
+ if (IE1 == V && !IE2)
return V->hasOneUse();
- if (IE1) {
- if ((IE1 != VU && !IE1->hasOneUse()) ||
- getInsertIndex(IE1).value_or(*Idx2) == *Idx2)
+ if (IE1 && IE1 != V) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
+ if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
IE1 = nullptr;
else
IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
}
- if (IE2) {
- if ((IE2 != V && !IE2->hasOneUse()) ||
- getInsertIndex(IE2).value_or(*Idx1) == *Idx1)
+ if (IE2 && IE2 != VU) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
+ if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
IE2 = nullptr;
else
IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
}
- } while (IE1 || IE2);
+ } while (!IsReusedIdx && (IE1 || IE2));
return false;
}
-std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
- bool TopToBottom) {
+std::optional<BoUpSLP::OrdersType>
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -3936,14 +4102,14 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
return Idx && *Idx < Sz;
})) {
- SmallVector<int> ReorderMask(Sz, UndefMaskElem);
+ SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
if (TE.ReorderIndices.empty())
std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
else
inversePermutation(TE.ReorderIndices, ReorderMask);
for (unsigned I = 0; I < VF; ++I) {
int &Idx = ReusedMask[I];
- if (Idx == UndefMaskElem)
+ if (Idx == PoisonMaskElem)
continue;
Value *V = TE.Scalars[ReorderMask[Idx]];
std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
@@ -3958,7 +4124,7 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned K = 0; K < VF; K += Sz) {
OrdersType CurrentOrder(TE.ReorderIndices);
SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
- if (SubMask.front() == UndefMaskElem)
+ if (SubMask.front() == PoisonMaskElem)
std::iota(SubMask.begin(), SubMask.end(), 0);
reorderOrder(CurrentOrder, SubMask);
transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
@@ -3966,8 +4132,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
}
if (all_of(enumerate(ResOrder),
[](const auto &Data) { return Data.index() == Data.value(); }))
- return {}; // Use identity order.
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
if (TE.State == TreeEntry::Vectorize &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
@@ -3976,6 +4142,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
return TE.ReorderIndices;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ if (V1 == V2)
+ return false;
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
@@ -4023,8 +4191,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
ResOrder[Id] = PhiToId[Phis[Id]];
if (IsIdentityOrder(ResOrder))
- return {};
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
if (TE.State == TreeEntry::NeedToGather) {
// TODO: add analysis of other gather nodes with extractelement
@@ -4050,7 +4218,42 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
if (Reuse || !CurrentOrder.empty()) {
if (!CurrentOrder.empty())
fixupOrderingIndices(CurrentOrder);
- return CurrentOrder;
+ return std::move(CurrentOrder);
+ }
+ }
+ // If the gather node is <undef, v, .., poison> and
+ // insertelement poison, v, 0 [+ permute]
+ // is cheaper than
+ // insertelement poison, v, n - try to reorder.
+ // If rotating the whole graph, exclude the permute cost, since the whole
+ // graph might be transformed.
+ int Sz = TE.Scalars.size();
+ if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
+ count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
+ const auto *It =
+ find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
+ if (It == TE.Scalars.begin())
+ return OrdersType();
+ auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
+ if (It != TE.Scalars.end()) {
+ OrdersType Order(Sz, Sz);
+ unsigned Idx = std::distance(TE.Scalars.begin(), It);
+ Order[Idx] = 0;
+ fixupOrderingIndices(Order);
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ InstructionCost PermuteCost =
+ TopToBottom
+ ? 0
+ : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
+ InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
+ PoisonValue::get(Ty), *It);
+ InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
+ PoisonValue::get(Ty), *It);
+ if (InsertFirstCost + PermuteCost < InsertIdxCost)
+ return std::move(Order);
}
}
if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
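
A toy numeric illustration of the trade-off above, with purely hypothetical costs (no real TTI is consulted): a splat-like gather <undef, v, undef, undef> can be built with one insert at lane 1, or with an insert at lane 0 plus a single-source permute; reordering wins whenever the latter sum is cheaper.

#include <cstdio>

int main() {
  // Hypothetical costs, standing in for TTI::getVectorInstrCost /
  // TTI::getShuffleCost results.
  const int InsertIdxCost = 3;    // insertelement poison, v, 1
  const int InsertFirstCost = 1;  // insertelement poison, v, 0
  const int PermuteCost = 1;      // SK_PermuteSingleSrc moving lane 0 -> 1
  const bool TopToBottom = false; // permute cost is ignored when rotating all

  int CostWithReorder = InsertFirstCost + (TopToBottom ? 0 : PermuteCost);
  std::puts(CostWithReorder < InsertIdxCost ? "reorder the gather node"
                                            : "keep the original order");
}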
@@ -4260,7 +4463,7 @@ void BoUpSLP::reorderTopToBottom() {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
@@ -4285,10 +4488,10 @@ void BoUpSLP::reorderTopToBottom() {
continue;
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
// Do an actual reordering, if profitable.
for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -4384,7 +4587,7 @@ bool BoUpSLP::canReorderOperands(
}
return false;
}) > 1 &&
- !all_of(UserTE->getOperand(I), isConstant))
+ !allConstant(UserTE->getOperand(I)))
return false;
if (Gather)
GatherOps.push_back(Gather);
@@ -4499,7 +4702,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
@@ -4578,10 +4781,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
VisitedOps.clear();
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
TreeEntry *TE = Op.second;
@@ -4779,7 +4982,7 @@ bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
// Check if the stores are consecutive by checking if their difference is 1.
for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
- if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
+ if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
return false;
// Calculate the shuffle indices according to their offset against the sorted
@@ -4976,6 +5179,309 @@ static bool isAlternateInstruction(const Instruction *I,
const Instruction *AltOp,
const TargetLibraryInfo &TLI);
+BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+ assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+
+ unsigned ShuffleOrOp =
+ S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ // Check for terminator values (e.g. invoke).
+ for (Value *V : VL)
+ for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ Instruction *Term = dyn_cast<Instruction>(Incoming);
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse || !CurrentOrder.empty())
+ return TreeEntry::Vectorize;
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ return TreeEntry::NeedToGather;
+ }
+ case Instruction::InsertElement: {
+ // Check that we have a buildvector and not a shuffle of 2 or more
+ // different vectors.
+ ValueSet SourceVectors;
+ for (Value *V : VL) {
+ SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+ assert(getInsertIndex(V) != std::nullopt &&
+ "Non-constant or undef index?");
+ }
+
+ if (count_if(VL, [&SourceVectors](Value *V) {
+ return !SourceVectors.contains(V);
+ }) >= 2) {
+ // Found 2nd source vector - cancel.
+ LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+ "different source vectors.\n");
+ return TreeEntry::NeedToGather;
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
+ PointerOps)) {
+ case LoadsState::Vectorize:
+ return TreeEntry::Vectorize;
+ case LoadsState::ScatterVectorize:
+ return TreeEntry::ScatterVectorize;
+ case LoadsState::Gather:
+#ifndef NDEBUG
+ Type *ScalarTy = VL0->getType();
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ else if (any_of(VL,
+ [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ else
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+ return TreeEntry::NeedToGather;
+ }
+ llvm_unreachable("Unexpected state of loads");
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Gathering casts with different src types.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return TreeEntry::Vectorize;
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ if (I->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
+ for (Value *V : VL) {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ if (!GEP)
+ continue;
+ Type *CurTy = GEP->getSourceElementType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ auto *Op = I->getOperand(1);
+ if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ (Op->getType() != Ty1 &&
+ ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace())))) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Avoid types that are padded when being allocated as scalars, while
+ // being packed together in a vector (such as i1).
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+ return TreeEntry::NeedToGather;
+ }
+ PointerOps.push_back(SI->getPointerOperand());
+ }
+
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ std::optional<int> Dist =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
+ // Check that the sorted pointer operands are consecutive.
+ if (static_cast<unsigned>(*Dist) == VL.size() - 1)
+ return TreeEntry::Vectorize;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return TreeEntry::NeedToGather;
+ }
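
The consecutiveness test above can be summarised with a small standalone sketch (element offsets stand in for the SCEV-based pointer differences; names and data are illustrative): after establishing an order over distinct pointer operands, the run is contiguous exactly when the first-to-last element distance equals the number of stores minus one.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Element offsets of four i32 stores, possibly out of program order.
  std::vector<long> Offsets = {12, 14, 13, 15};
  std::sort(Offsets.begin(), Offsets.end());
  long Dist = Offsets.back() - Offsets.front();
  bool Consecutive =
      static_cast<size_t>(Dist) == Offsets.size() - 1; // 3 == 4 - 1
  std::printf("consecutive: %s\n", Consecutive ? "yes" : "no"); // yes
}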
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ VFShape Shape = VFShape::get(
+ *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return TreeEntry::NeedToGather;
+ }
+ Function *F = CI->getCalledFunction();
+ unsigned NumArgs = CI->arg_size();
+ SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned J = 0; J != NumArgs; ++J)
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+ ScalarArgs[J] = CI->getArgOperand(J);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != F ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Some intrinsics have scalar arguments and these must be the same in order
+ // for them to be vectorized.
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+ Value *A1J = CI2->getArgOperand(J);
+ if (ScalarArgs[J] != A1J) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
+ << "!=" << *V << '\n');
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return TreeEntry::NeedToGather;
+ }
+ return TreeEntry::Vectorize;
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return TreeEntry::NeedToGather;
+ }
+}
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -4990,7 +5496,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndicies.emplace_back(
- isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+ isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
UniqueValues.emplace_back(V);
continue;
}
@@ -5010,7 +5516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) ||
!isConstant(V);
})) ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
@@ -5257,6 +5763,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!TryToFindDuplicates(S))
return;
+ // Perform specific checks for each particular instruction kind.
+ OrdersType CurrentOrder;
+ SmallVector<Value *> PointerOps;
+ TreeEntry::EntryState State = getScalarsVectorizationState(
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ if (State == TreeEntry::NeedToGather) {
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
@@ -5285,20 +5802,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
- // Check for terminator values (e.g. invoke).
- for (Value *V : VL)
- for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
- Instruction *Term = dyn_cast<Instruction>(Incoming);
- if (Term && Term->isTerminator()) {
- LLVM_DEBUG(dbgs()
- << "SLP: Need to swizzle PHINodes (terminator use).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
@@ -5326,9 +5829,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse) {
+ if (CurrentOrder.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -5339,55 +5840,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
VectorizableTree.back()->setOperand(0, Op0);
return;
}
- if (!CurrentOrder.empty()) {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
return;
}
case Instruction::InsertElement: {
assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
- // Check that we have a buildvector and not a shuffle of 2 or more
- // different vectors.
- ValueSet SourceVectors;
- for (Value *V : VL) {
- SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
- assert(getInsertIndex(V) != std::nullopt &&
- "Non-constant or undef index?");
- }
-
- if (count_if(VL, [&SourceVectors](Value *V) {
- return !SourceVectors.contains(V);
- }) >= 2) {
- // Found 2nd source vector - cancel.
- LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
- "different source vectors.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
- BS.cancelScheduling(VL, VL0);
- return;
- }
-
auto OrdCompare = [](const std::pair<int, int> &P1,
const std::pair<int, int> &P2) {
return P1.first > P2.first;
@@ -5430,12 +5904,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
- CurrentOrder, PointerOps)) {
- case LoadsState::Vectorize:
+ switch (State) {
+ case TreeEntry::Vectorize:
if (CurrentOrder.empty()) {
          // Original loads are consecutive and do not require reordering.
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -5450,7 +5921,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case LoadsState::ScatterVectorize:
+ case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
@@ -5458,23 +5929,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
- case LoadsState::Gather:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
-#ifndef NDEBUG
- Type *ScalarTy = VL0->getType();
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy))
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- else if (any_of(VL, [](Value *V) {
- return !cast<LoadInst>(V)->isSimple();
- }))
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- else
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
-#endif // NDEBUG
- break;
+ case TreeEntry::NeedToGather:
+ llvm_unreachable("Unexpected loads state.");
}
return;
}
@@ -5490,18 +5946,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering casts with different src types.\n");
- return;
- }
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -5521,21 +5965,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
- Type *ComparedTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
- if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
- Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering cmp with different predicate.\n");
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
@@ -5544,7 +5973,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
- assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ assert(P0 == CmpInst::getSwappedPredicate(P0) &&
+ "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
@@ -5612,60 +6042,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::GetElementPtr: {
- // We don't combine GEPs with complicated (nested) indexing.
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- if (I->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We can't combine several GEPs into one vector if they operate on
- // different types.
- Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
- for (Value *V : VL) {
- auto *GEP = dyn_cast<GEPOperator>(V);
- if (!GEP)
- continue;
- Type *CurTy = GEP->getSourceElementType();
- if (Ty0 != CurTy) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We don't combine GEPs with non-constant indexes.
- Type *Ty1 = VL0->getOperand(1)->getType();
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- auto *Op = I->getOperand(1);
- if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- (Op->getType() != Ty1 &&
- ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace())))) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
@@ -5722,78 +6098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
- llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
- // Avoid types that are padded when being allocated as scalars, while
- // being packed together in a vector (such as i1).
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
- return;
- }
- // Make sure all stores in the bundle are simple - we can't vectorize
- // atomic or volatile stores.
- SmallVector<Value *, 4> PointerOps(VL.size());
ValueList Operands(VL.size());
- auto POIter = PointerOps.begin();
- auto OIter = Operands.begin();
+ auto *OIter = Operands.begin();
for (Value *V : VL) {
auto *SI = cast<StoreInst>(V);
- if (!SI->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
- return;
- }
- *POIter = SI->getPointerOperand();
*OIter = SI->getValueOperand();
- ++POIter;
++OIter;
}
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- std::optional<int> Dist =
- getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
- // Check that the sorted pointer operands are consecutive.
- if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
- if (CurrentOrder.empty()) {
- // Original stores are consecutive and does not require reordering.
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
- } else {
- fixupOrderingIndices(CurrentOrder);
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
- }
- return;
- }
+ // Check that the sorted pointer operands are consecutive.
+ if (CurrentOrder.empty()) {
+ // Original stores are consecutive and do not require reordering.
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ fixupOrderingIndices(CurrentOrder);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
}
-
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
case Instruction::Call: {
@@ -5802,68 +6129,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- VFShape Shape = VFShape::get(
- *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!VecFunc && !isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
- return;
- }
- Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->arg_size();
- SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
- for (unsigned j = 0; j != NumArgs; ++j)
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
- ScalarArgs[j] = CI->getArgOperand(j);
- for (Value *V : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != F ||
- getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
- (VecFunc &&
- VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
- !CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
- << "\n");
- return;
- }
- // Some intrinsics have scalar arguments and should be same in order for
- // them to be vectorized.
- for (unsigned j = 0; j != NumArgs; ++j) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
- Value *A1J = CI2->getArgOperand(j);
- if (ScalarArgs[j] != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << ScalarArgs[j] << "!=" << A1J
- << "\n");
- return;
- }
- }
- }
- // Verify that the bundle operands are identical between the two calls.
- if (CI->hasOperandBundles() &&
- !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
- CI->op_begin() + CI->getBundleOperandsEndIndex(),
- CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *V << '\n');
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
@@ -5883,15 +6148,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::ShuffleVector: {
- // If this is not an alternate sequence of opcode like add-sub
- // then do not vectorize this instruction.
- if (!S.isAltShuffle()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
- return;
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
@@ -5949,19 +6205,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
default:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
- return;
+ break;
}
+ llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<StructType, ArrayType, VectorType>(EltTy)) {
+ while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -5982,7 +6235,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
- if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+ VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
}
@@ -6111,68 +6365,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-/// Compute the cost of creating a vector of type \p VecTy containing the
-/// extracted values from \p VL.
-static InstructionCost
-computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
- TargetTransformInfo::ShuffleKind ShuffleKind,
- ArrayRef<int> Mask, TargetTransformInfo &TTI) {
- unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
- if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
- VecTy->getNumElements() < NumOfParts)
- return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
- bool AllConsecutive = true;
- unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
- unsigned Idx = -1;
- InstructionCost Cost = 0;
-
- // Process extracts in blocks of EltsPerVector to check if the source vector
- // operand can be re-used directly. If not, add the cost of creating a shuffle
- // to extract the values into a vector register.
- SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
- for (auto *V : VL) {
- ++Idx;
-
- // Reached the start of a new vector registers.
- if (Idx % EltsPerVector == 0) {
- RegMask.assign(EltsPerVector, UndefMaskElem);
- AllConsecutive = true;
- continue;
- }
-
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
- continue;
-
- // Check all extracts for a vector register on the target directly
- // extract values in order.
- unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
- RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
- }
-
- if (AllConsecutive)
- continue;
-
- // Skip all indices, except for the last index per vector block.
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
- continue;
-
- // If we have a series of extracts which are not consecutive and hence
- // cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the vector with EltsPerVector elements.
- Cost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
- }
- return Cost;
-}
-
/// Build shuffle mask for shuffle graph entries and lists of main and alternate
/// operations operands.
static void
@@ -6183,7 +6375,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) {
unsigned Sz = VL.size();
- Mask.assign(Sz, UndefMaskElem);
+ Mask.assign(Sz, PoisonMaskElem);
SmallVector<int> OrderMask;
if (!ReorderIndices.empty())
inversePermutation(ReorderIndices, OrderMask);
@@ -6203,9 +6395,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
if (!ReusesIndices.empty()) {
- SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem);
transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
- return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
+ return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
});
Mask.swap(NewMask);
}
@@ -6325,13 +6517,13 @@ protected:
static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
ArrayRef<int> ExtMask) {
unsigned VF = Mask.size();
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
int MaskedIdx = Mask[ExtMask[I] % VF];
NewMask[I] =
- MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+ MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
}
Mask.swap(NewMask);
}
@@ -6418,11 +6610,12 @@ protected:
if (auto *SVOpTy =
dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
LocalVF = SVOpTy->getNumElements();
- SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(Mask)) {
- if (I == UndefMaskElem)
- continue;
- ExtMask[Idx] = SV->getMaskValue(I);
+ if (I == PoisonMaskElem ||
+ static_cast<unsigned>(I) >= SV->getShuffleMask().size())
+            continue;
+ ExtMask[Idx] = SV->getMaskValue(I);
}
bool IsOp1Undef =
isUndefVector(SV->getOperand(0),
@@ -6435,11 +6628,11 @@ protected:
if (!IsOp1Undef && !IsOp2Undef) {
// Update mask and mark undef elems.
for (int &I : Mask) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
- UndefMaskElem)
- I = UndefMaskElem;
+ PoisonMaskElem)
+ I = PoisonMaskElem;
}
break;
}
@@ -6453,15 +6646,16 @@ protected:
Op = SV->getOperand(1);
}
if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
- !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+ !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
+ ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
if (IdentityOp) {
V = IdentityOp;
assert(Mask.size() == IdentityMask.size() &&
"Expected masks of same sizes.");
// Clear known poison elements.
for (auto [I, Idx] : enumerate(Mask))
- if (Idx == UndefMaskElem)
- IdentityMask[I] = UndefMaskElem;
+ if (Idx == PoisonMaskElem)
+ IdentityMask[I] = PoisonMaskElem;
Mask.swap(IdentityMask);
auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
return SinglePermute &&
@@ -6481,10 +6675,12 @@ protected:
/// Smart shuffle instruction emission, walks through shuffles trees and
/// tries to find the best matching vector for the actual shuffle
/// instruction.
- template <typename ShuffleBuilderTy>
- static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
- ShuffleBuilderTy &Builder) {
+ template <typename T, typename ShuffleBuilderTy>
+ static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+ ShuffleBuilderTy &Builder) {
assert(V1 && "Expected at least one vector value.");
+ if (V2)
+ Builder.resizeToMatch(V1, V2);
int VF = Mask.size();
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
@@ -6495,8 +6691,8 @@ protected:
Value *Op2 = V2;
int VF =
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
CombinedMask1[I] = Mask[I];
@@ -6514,9 +6710,9 @@ protected:
// again.
if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
- SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask1)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask1[Idx] = SV1->getMaskValue(I);
}
@@ -6524,9 +6720,9 @@ protected:
cast<FixedVectorType>(SV1->getOperand(1)->getType())
->getNumElements(),
ExtMask1, UseMask::SecondArg);
- SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem);
+ SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask2)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask2[Idx] = SV2->getMaskValue(I);
}
@@ -6566,64 +6762,360 @@ protected:
->getElementCount()
.getKnownMinValue());
for (int I = 0, E = Mask.size(); I < E; ++I) {
- if (CombinedMask2[I] != UndefMaskElem) {
- assert(CombinedMask1[I] == UndefMaskElem &&
+ if (CombinedMask2[I] != PoisonMaskElem) {
+ assert(CombinedMask1[I] == PoisonMaskElem &&
"Expected undefined mask element");
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
+ const int Limit = CombinedMask1.size() * 2;
+ if (Op1 == Op2 && Limit == 2 * VF &&
+ all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
+ (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
+ (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+ isa<ShuffleVectorInst>(Op1) &&
+ cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
+ ArrayRef(CombinedMask1))))
+ return Builder.createIdentity(Op1);
return Builder.createShuffleVector(
Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
CombinedMask1);
}
if (isa<PoisonValue>(V1))
- return PoisonValue::get(FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ return Builder.createPoison(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size());
SmallVector<int> NewMask(Mask.begin(), Mask.end());
bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
assert(V1 && "Expected non-null value after looking through shuffles.");
if (!IsIdentity)
return Builder.createShuffleVector(V1, NewMask);
- return V1;
+ return Builder.createIdentity(V1);
}
};
} // namespace
-InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals) {
- ArrayRef<Value *> VL = E->Scalars;
+/// Merges shuffle masks and emits the final shuffle instruction, if required.
+/// It supports shuffling of 2 input vectors. It implements lazy shuffle
+/// emission: the actual shuffle instruction is generated only if it is really
+/// required. Otherwise, shuffle emission is delayed until the end of the
+/// process, to reduce the number of emitted instructions and further
+/// analysis/transformations.
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
+ bool IsFinalized = false;
+ SmallVector<int> CommonMask;
+ SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+ const TargetTransformInfo &TTI;
+ InstructionCost Cost = 0;
+ ArrayRef<Value *> VectorizedVals;
+ BoUpSLP &R;
+ SmallPtrSetImpl<Value *> &CheckedExtracts;
+ constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+ if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
+ return TTI::TCC_Free;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ InstructionCost GatherCost = 0;
+ SmallVector<Value *> Gathers(VL.begin(), VL.end());
+ // Improve gather cost for gather of loads, if we can group some of the
+ // loads into vector loads.
+ InstructionsState S = getSameOpcode(VL, *R.TLI);
+ if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle() &&
+ !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+ !isSplat(Gathers)) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
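+      // Iteratively halve the candidate VF, starting from half of the gather
+      // width, and look for contiguous slices of loads that can be emitted as
+      // vector (or scatter-vectorized) loads instead of being gathered.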
+ for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+ *R.LI, *R.TLI, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+ // If we vectorized initial block, no need to try to vectorize
+ // it again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+ }
+ // Exclude potentially vectorized loads from list of gathered
+ // scalars.
+ auto *LI = cast<LoadInst>(S.MainOp);
+ Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+ LI->getAlign(), LI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), LI);
+ }
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+ // Add the cost for the subvectors insert.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ std::nullopt, CostKind, I, LoadTy);
+ }
+ GatherCost -= ScalarsCost;
+ }
+ } else if (!Root && isSplat(VL)) {
+ // Found the broadcasting of the single scalar, calculate the cost as
+ // the broadcast.
+ const auto *It =
+ find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+ assert(It != VL.end() && "Expected at least one non-undef value.");
+ // Add broadcast for non-identity shuffle only.
+ bool NeedShuffle =
+ count(VL, *It) > 1 &&
+ (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind,
+ NeedShuffle ? 0 : std::distance(VL.begin(), It),
+ PoisonValue::get(VecTy), *It);
+ return InsertCost +
+ (NeedShuffle ? TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/*It)
+ : TTI::TCC_Free);
+ }
+ return GatherCost +
+ (all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ };
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ /// Compute the cost of creating a vector of type \p VecTy containing the
+ /// extracted values from \p VL.
+ InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
- // If we have computed a smaller type for the expression, update VecTy so
- // that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
- unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+ if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
+ !NumOfParts || VecTy->getNumElements() < NumOfParts)
+ return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- // FIXME: it tries to fix a problem with MSVC buildbots.
- TargetTransformInfo *TTI = this->TTI;
- auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+ bool AllConsecutive = true;
+ unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
+ unsigned Idx = -1;
+ InstructionCost Cost = 0;
+
+ // Process extracts in blocks of EltsPerVector to check if the source vector
+ // operand can be re-used directly. If not, add the cost of creating a
+ // shuffle to extract the values into a vector register.
+ SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
+ for (auto *V : VL) {
+ ++Idx;
+
+      // Reached the start of a new vector register.
+ if (Idx % EltsPerVector == 0) {
+ RegMask.assign(EltsPerVector, PoisonMaskElem);
+ AllConsecutive = true;
+ continue;
+ }
+
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+ continue;
+
+ // Check all extracts for a vector register on the target directly
+ // extract values in order.
+ unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
+ if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
+ }
+
+ if (AllConsecutive)
+ continue;
+
+ // Skip all indices, except for the last index per vector block.
+ if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
+ continue;
+
+ // If we have a series of extracts which are not consecutive and hence
+ // cannot re-use the source vector register directly, compute the shuffle
+ // cost to extract the vector with EltsPerVector elements.
+ Cost += TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc,
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
+ RegMask);
+ }
+ return Cost;
+ }
+
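+  /// Callbacks used by BaseShuffleAnalysis::createShuffle to accumulate
+  /// shuffle costs instead of emitting IR: identity shuffles, poison vectors
+  /// and operand resizing are considered free, all other shuffles are priced
+  /// via TTI::getShuffleCost.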
+ class ShuffleCostBuilder {
+ const TargetTransformInfo &TTI;
+
+ static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
+ int Limit = 2 * VF;
+ return Mask.empty() ||
+ (VF == Mask.size() &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ }
+
+ public:
+ ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
+ ~ShuffleCostBuilder() = default;
+ InstructionCost createShuffleVector(Value *V1, Value *,
+ ArrayRef<int> Mask) const {
+ // Empty mask or identity mask are free.
+ unsigned VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ if (isEmptyOrIdentity(Mask, VF))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteTwoSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
+ // Empty mask or identity mask are free.
+ if (isEmptyOrIdentity(Mask, Mask.size()))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
+ InstructionCost createPoison(Type *Ty, unsigned VF) const {
+ return TTI::TCC_Free;
+ }
+ void resizeToMatch(Value *&, Value *&) const {}
+ };
+
+  /// Smart shuffle instruction emission, walks through shuffle trees and
+ /// tries to find the best matching vector for the actual shuffle
+ /// instruction.
+ InstructionCost
+ createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
+ const PointerUnion<Value *, const TreeEntry *> &P2,
+ ArrayRef<int> Mask) {
+ ShuffleCostBuilder Builder(TTI);
+ Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
+ unsigned CommonVF = 0;
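+    // Tree entries are not materialized as IR values yet, so substitute
+    // null-value placeholders of the expected vector type; only the type
+    // matters for the cost computation below.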
+ if (!V1) {
+ const TreeEntry *E = P1.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ if (V2) {
+ unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ if (V2VF != VF && V2VF == E->Scalars.size())
+ VF = E->Scalars.size();
+ } else if (!P2.isNull()) {
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ if (E->Scalars.size() == E2->Scalars.size())
+ CommonVF = VF = E->Scalars.size();
+ } else {
+ // P2 is empty, check that we have same node + reshuffle (if any).
+ if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+ VF = E->Scalars.size();
+ SmallVector<int> CommonMask(Mask.begin(), Mask.end());
+ ::addMask(CommonMask, E->getCommonMask());
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, nullptr, CommonMask, Builder);
+ }
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ if (!V2 && !P2.isNull()) {
+ const TreeEntry *E = P2.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (!CommonVF && V1VF == E->Scalars.size())
+ CommonVF = E->Scalars.size();
+ if (CommonVF)
+ VF = CommonVF;
+ V2 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
+ Builder);
+ }
+
+public:
+ ShuffleCostEstimator(TargetTransformInfo &TTI,
+ ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
+ SmallPtrSetImpl<Value *> &CheckedExtracts)
+ : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
+ CheckedExtracts(CheckedExtracts) {}
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ if (Mask.empty())
+ return nullptr;
+ Value *VecBase = nullptr;
+ ArrayRef<Value *> VL = E->Scalars;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
// If the resulting type is scalarized, do not adjust the cost.
- unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
+ unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
if (VecNumParts == VecTy->getNumElements())
- return;
+ return nullptr;
DenseMap<Value *, int> ExtractVectorsTys;
- SmallPtrSet<Value *, 4> CheckedExtracts;
- for (auto *V : VL) {
- if (isa<UndefValue>(V))
+ for (auto [I, V] : enumerate(VL)) {
+ // Ignore non-extractelement scalars.
+ if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
continue;
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
@@ -6631,17 +7123,18 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// vectorized tree.
// Also, avoid adjusting the cost for extractelements with multiple uses
// in different graph entries.
- const TreeEntry *VE = getTreeEntry(V);
+ const TreeEntry *VE = R.getTreeEntry(V);
if (!CheckedExtracts.insert(V).second ||
- !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
(VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
+ VecBase = EE->getVectorOperand();
std::optional<unsigned> EEIdx = getExtractIndex(EE);
if (!EEIdx)
continue;
unsigned Idx = *EEIdx;
- if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) {
+ if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
auto It =
ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
It->getSecond() = std::min<int>(It->second, Idx);
@@ -6654,18 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
})) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
- Cost -=
- TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
+ Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
// Add back the cost of s|zext which is subtracted separately.
- Cost += TTI->getCastInstrCost(
+ Cost += TTI.getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EE->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
- Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
- Idx);
+ Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+ Idx);
}
// Add a cost for subvector extracts/inserts if required.
for (const auto &Data : ExtractVectorsTys) {
@@ -6673,34 +7165,148 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumElts = VecTy->getNumElements();
if (Data.second % NumElts == 0)
continue;
- if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
+ if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
+ if (Idx % NumElts == 0)
+ continue;
if (Idx + NumElts <= EENumElts) {
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, VecTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, VecTy);
} else {
// Need to round up the subvector type vectorization factor to avoid a
// crash in cost model functions. Make SubVT so that Idx + VF of SubVT
// <= EENumElts.
auto *SubVT =
FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, SubVT);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, SubVT);
}
} else {
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, std::nullopt, CostKind, 0, EEVTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
- };
+ // Check that gather of extractelements can be represented as just a
+ // shuffle of a single/two vectors the scalars are extracted from.
+ // Found the bunch of extractelement instructions that must be gathered
+ // into a vector and can be represented as a permutation elements in a
+ // single input vector or of 2 input vectors.
+ Cost += computeExtractCost(VL, Mask, ShuffleKind);
+ return VecBase;
+ }
+ void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({E1, E2});
+ }
+ void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, E1);
+ }
+  /// Adds another input vector and the mask for the shuffling.
+ void add(Value *V1, ArrayRef<int> Mask) {
+ assert(CommonMask.empty() && InVectors.empty() &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
+ }
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Cost += getBuildVectorCost(VL, Root);
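+    // The returned value is only a constant placeholder of the right vector
+    // type for further mask bookkeeping; no buildvector IR is emitted here.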
+ if (!Root) {
+ assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
+ // FIXME: Need to find a way to avoid use of getNullValue here.
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ if (isa<UndefValue>(V)) {
+ Vals.push_back(cast<Constant>(V));
+ continue;
+ }
+ Vals.push_back(Constant::getNullValue(V->getType()));
+ }
+ return ConstantVector::get(Vals);
+ }
+ return ConstantVector::getSplat(
+ ElementCount::getFixed(VL.size()),
+ Constant::getNullValue(VL.front()->getType()));
+ }
+ /// Finalize emission of the shuffles.
+ InstructionCost
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
+ IsFinalized = true;
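+    // When a delayed action is supplied, fold the accumulated inputs with the
+    // common mask first, reset the mask to identity and let the action adjust
+    // the (placeholder) vector before the final shuffle is priced.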
+ if (Action) {
+ const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Cost += createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ Value *V = Vec.dyn_cast<Value *>();
+ if (!Vec.isNull() && !V)
+ V = Constant::getNullValue(FixedVectorType::get(
+ Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
+ CommonMask.size()));
+ Action(V, CommonMask);
+ }
+ ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
+ if (CommonMask.empty())
+ return Cost;
+ int Limit = CommonMask.size() * 2;
+ if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(CommonMask))
+ return Cost;
+ return Cost +
+ createShuffle(InVectors.front(),
+ InVectors.size() == 2 ? InVectors.back() : nullptr,
+ CommonMask);
+ }
+
+ ~ShuffleCostEstimator() {
+ assert((IsFinalized || CommonMask.empty()) &&
+ "Shuffle construction must be finalized.");
+ }
+};
+
+InstructionCost
+BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts) {
+ ArrayRef<Value *> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = FixedVectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned EntryVF = E->getVectorFactor();
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
+ ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
+ CheckedExtracts);
+ unsigned VF = E->getVectorFactor();
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
@@ -6709,195 +7315,104 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+ bool Resized = false;
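+    // If the gathered scalars are extracted from a vector as wide as this
+    // entry's vector factor, pad the gather list with poison up to that width
+    // before looking for matching tree entries.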
+ if (Value *VecBase = Estimator.adjustExtracts(
+ E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+
// Do not try to look for reshuffled loads for gathered loads (they will be
// handled later), for vectorized scalars, and cases, which are definitely
// not profitable (splats and small gather nodes.)
- if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
if (GatherShuffle) {
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != UndefMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- InstructionCost GatherCost = 0;
- int Limit = Mask.size() * 2;
- if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask)) {
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n");
- if (NeedToShuffleReuses)
- GatherCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- } else {
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n");
- // Detected that instead of gather we can emit a shuffle of single/two
- // previously vectorized nodes. Add the cost of the permutation rather
- // than gather.
- ::addMask(Mask, E->ReuseShuffleIndices);
- GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
- }
- if (!all_of(GatheredScalars, UndefValue::classof))
- GatherCost += getGatherCost(GatheredScalars);
- return GatherCost;
- }
- if ((E->getOpcode() == Instruction::ExtractElement ||
- all_of(E->Scalars,
- [](Value *V) {
- return isa<ExtractElementInst, UndefValue>(V);
- })) &&
- allSameType(VL)) {
- // Check that gather of extractelements can be represented as just a
- // shuffle of a single/two vectors the scalars are extracted from.
- SmallVector<int> Mask;
- std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isFixedVectorShuffle(VL, Mask);
- if (ShuffleKind) {
- // Found the bunch of extractelement instructions that must be gathered
- // into a vector and can be represented as a permutation elements in a
- // single input vector or of 2 input vectors.
- InstructionCost Cost =
- computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost);
- if (NeedToShuffleReuses)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- return Cost;
- }
- }
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- assert(VecTy == FinalVecTy &&
- "No reused scalars expected for broadcast.");
- const auto *It =
- find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
- // If all values are undefs - consider cost free.
- if (It == VL.end())
- return TTI::TCC_Free;
- // Add broadcast for non-identity shuffle only.
- bool NeedShuffle =
- VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
- InstructionCost InsertCost =
- TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
- /*Index=*/0, PoisonValue::get(VecTy), *It);
- return InsertCost + (NeedShuffle
- ? TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VecTy,
- /*Mask=*/std::nullopt, CostKind,
- /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/VL[0])
- : TTI::TCC_Free);
- }
- InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses)
- ReuseShuffleCost = TTI->getShuffleCost(
- TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
- // Improve gather cost for gather of loads, if we can group some of the
- // loads into vector loads.
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
- !E->isAltShuffle()) {
- BoUpSLP::ValueSet VectorizedLoads;
- unsigned StartIdx = 0;
- unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
- for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
- for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
- Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- if (!VectorizedLoads.count(Slice.front()) &&
- !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
- *TLI, CurrentOrder, PointerOps);
- switch (LS) {
- case LoadsState::Vectorize:
- case LoadsState::ScatterVectorize:
- // Mark the vectorized loads so that we don't vectorize them
- // again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
- else
- ++ScatterVectorizeCnt;
- VectorizedLoads.insert(Slice.begin(), Slice.end());
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += VF;
- break;
- case LoadsState::Gather:
- break;
- }
+ // Restore the mask for previous partially matched values.
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
}
+ if (Mask[I] == PoisonMaskElem)
+ Mask[I] = Entries.front()->findLaneForValue(V);
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= VL.size())
- break;
- // Found vectorizable parts - exit.
- if (!VectorizedLoads.empty())
- break;
+ Estimator.add(Entries.front(), Mask);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
- if (!VectorizedLoads.empty()) {
- InstructionCost GatherCost = 0;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- bool NeedInsertSubvectorAnalysis =
- !NumParts || (VL.size() / VF) > NumParts;
- // Get the cost for gathered loads.
- for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
- if (VectorizedLoads.contains(VL[I]))
- continue;
- GatherCost += getGatherCost(VL.slice(I, VF));
- }
- // The cost for vectorized loads.
- InstructionCost ScalarsCost = 0;
- for (Value *V : VectorizedLoads) {
- auto *LI = cast<LoadInst>(V);
- ScalarsCost +=
- TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
- LI->getAlign(), LI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), LI);
- }
- auto *LI = cast<LoadInst>(E->getMainOp());
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI->getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
- if (NeedInsertSubvectorAnalysis) {
- // Add the cost for the subvectors insert.
- for (int I = VF, E = VL.size(); I < E; I += VF)
- GatherCost +=
- TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
- std::nullopt, CostKind, I, LoadTy);
- }
- return ReuseShuffleCost + GatherCost - ScalarsCost;
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
}
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with "
+ << *VL.front() << ".\n";);
+ if (Entries.size() == 1)
+ Estimator.add(Entries.front(), Mask);
+ else
+ Estimator.add(Entries.front(), Entries.back(), Mask);
+    if (all_of(GatheredScalars, PoisonValue::classof))
+ return Estimator.finalize(E->ReuseShuffleIndices);
+ return Estimator.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ Vec = Estimator.gather(GatheredScalars,
+ Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ });
}
- return ReuseShuffleCost + getGatherCost(VL);
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
+ bool SameGathers = VL.equals(Gathers);
+ Value *BV = Estimator.gather(
+ Gathers, SameGathers ? nullptr
+ : Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
+ std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
+ Estimator.add(BV, ReuseMask);
+ }
+ if (ExtractShuffle)
+ Estimator.add(E, std::nullopt);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -6945,48 +7460,89 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
InstructionCost VecCost = VectorCost(CommonCost);
- LLVM_DEBUG(
- dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
- // Disable warnings for `this` and `E` are unused. Required for
- // `dumpTreeCosts`.
- (void)this;
- (void)E;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
+ ScalarCost, "Calculated costs for Tree"));
return VecCost - ScalarCost;
};
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- InstructionCost CostSavings = 0;
- for (Value *V : Ptrs) {
- if (V == BasePtr)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(V);
- // GEPs may contain just addresses without instructions, considered free.
- // GEPs with all constant indices also considered to have zero cost.
- if (!Ptr || Ptr->hasAllConstantIndices())
- continue;
-
- // Here we differentiate two cases: when GEPs represent a regular
- // vectorization tree node (and hence vectorized) and when the set is
- // arguments of a set of loads or stores being vectorized. In the former
- // case all the scalar GEPs will be removed as a result of vectorization.
+ InstructionCost ScalarCost = 0;
+ InstructionCost VecCost = 0;
+ // Here we differentiate two cases: (1) when Ptrs represent a regular
+ // vectorization tree node (as they are pointer arguments of scattered
+ // loads) or (2) when Ptrs are the arguments of loads or stores being
+    // vectorized as plain wide unit-stride load/store since all the
+ // loads/stores are known to be from/to adjacent locations.
+ assert(E->State == TreeEntry::Vectorize &&
+ "Entry state expected to be Vectorize here.");
+ if (isa<LoadInst, StoreInst>(VL0)) {
+ // Case 2: estimate costs for pointer related costs when vectorizing to
+ // a wide load/store.
+ // Scalar cost is estimated as a set of pointers with known relationship
+ // between them.
+ // For vector code we will use BasePtr as argument for the wide load/store
+ // but we also need to account all the instructions which are going to
+ // stay in vectorized code due to uses outside of these scalar
+ // loads/stores.
+ ScalarCost = TTI->getPointersChainCost(
+ Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+ CostKind);
+
+ SmallVector<const Value *> PtrsRetainedInVecCode;
+ for (Value *V : Ptrs) {
+ if (V == BasePtr) {
+ PtrsRetainedInVecCode.push_back(V);
+ continue;
+ }
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ // For simplicity assume Ptr to stay in vectorized code if it's not a
+      // GEP instruction. We don't care since its cost is considered free.
+ // TODO: We should check for any uses outside of vectorizable tree
+ // rather than just single use.
+ if (!Ptr || !Ptr->hasOneUse())
+ PtrsRetainedInVecCode.push_back(V);
+ }
+
+ if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
+ // If all pointers stay in vectorized code then we don't have
+ // any savings on that.
+ LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+ return InstructionCost{TTI::TCC_Free};
+ }
+ VecCost = TTI->getPointersChainCost(
+ PtrsRetainedInVecCode, BasePtr,
+ TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
+ } else {
+ // Case 1: Ptrs are the arguments of loads that we are going to transform
+ // into masked gather load intrinsic.
+ // All the scalar GEPs will be removed as a result of vectorization.
// For any external uses of some lanes extract element instructions will
- // be generated (which cost is estimated separately). For the latter case
- // since the set of GEPs itself is not vectorized those used more than
- // once will remain staying in vectorized code as well. So we should not
- // count them as savings.
- if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
- continue;
-
- // TODO: it is target dependent, so need to implement and then use a TTI
- // interface.
- CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
- Ptr->getType(), CostKind);
- }
- LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
- E->dump());
- LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
- return InstructionCost() - CostSavings;
+ // be generated (which cost is estimated separately).
+ TTI::PointersChainInfo PtrsInfo =
+ all_of(Ptrs,
+ [](const Value *V) {
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ return Ptr && !Ptr->hasAllConstantIndices();
+ })
+ ? TTI::PointersChainInfo::getUnknownStride()
+ : TTI::PointersChainInfo::getKnownStride();
+
+ ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+ CostKind);
+ if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+ SmallVector<const Value *> Indices(BaseGEP->indices());
+ VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
+ BaseGEP->getPointerOperand(), Indices, VecTy,
+ CostKind);
+ }
+ }
+
+ LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+
+ return VecCost - ScalarCost;
};
switch (ShuffleOrOp) {
@@ -7062,7 +7618,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
unsigned OffsetBeg = *getInsertIndex(VL.front());
unsigned OffsetEnd = OffsetBeg;
InsertMask[OffsetBeg] = 0;
@@ -7099,13 +7655,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+ Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
} else {
- Mask.assign(VecSz, UndefMaskElem);
+ Mask.assign(VecSz, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
}
bool IsIdentity = true;
- SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
+ SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
@@ -7148,14 +7704,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InsertVecTy);
} else {
for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
- Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+ Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
I <= End; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Mask[I] = I + VecSz;
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
Mask[I] =
- ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I;
+ ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
}
}
@@ -7422,11 +7978,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost +=
TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
- VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CI0->getPredicate(), CostKind, VL0);
VecCost += TTI->getCmpSelInstrCost(
- E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ E->getOpcode(), VecTy, MaskTy,
cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
E->getAltOp());
} else {
@@ -7615,7 +8171,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
- SmallPtrSet<Instruction*, 4> LiveValues;
+ SmallPtrSet<Instruction *, 4> LiveValues;
Instruction *PrevInst = nullptr;
// The entries in VectorizableTree are not necessarily ordered by their
@@ -7626,6 +8182,8 @@ InstructionCost BoUpSLP::getSpillCost() const {
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<Instruction *, 16> OrderedScalars;
for (const auto &TEPtr : VectorizableTree) {
+ if (TEPtr->State != TreeEntry::Vectorize)
+ continue;
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
@@ -7639,7 +8197,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA != NodeB)
- return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
+ return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
return B->comesBefore(A);
});
@@ -7698,7 +8256,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
};
// Debug information does not impact spill cost.
- if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
+ if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
&*PrevInstIt != PrevInst)
NumCalls++;
@@ -7706,7 +8264,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
}
if (NumCalls) {
- SmallVector<Type*, 4> V;
+ SmallVector<Type *, 4> V;
for (auto *II : LiveValues) {
auto *ScalarTy = II->getType();
if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
@@ -7797,8 +8355,8 @@ static T *performExtractsShuffleAction(
ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
- if (Mask[Idx] == UndefMaskElem)
- Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx;
+ if (Mask[Idx] == PoisonMaskElem)
+ Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
else
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
}
@@ -7827,8 +8385,8 @@ static T *performExtractsShuffleAction(
// can shuffle them directly.
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = SecMask[I] + Vec1VF;
}
}
@@ -7841,12 +8399,12 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (Mask[I] != UndefMaskElem) {
- assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Mask[I] != PoisonMaskElem) {
+ assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
if (Res1.second)
Mask[I] = I;
- } else if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ } else if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
}
}
@@ -7863,11 +8421,11 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ if (SecMask[I] != PoisonMaskElem) {
+ assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
"Multiple uses of scalars.");
Mask[I] = (Res.second ? I : SecMask[I]) + VF;
- } else if (Mask[I] != UndefMaskElem) {
+ } else if (Mask[I] != PoisonMaskElem) {
Mask[I] = I;
}
}
@@ -7877,12 +8435,23 @@ static T *performExtractsShuffleAction(
}
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
+ // Build a map for gathered scalars to the nodes where they are used.
+ ValueToGatherNodes.clear();
+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
+ if (EntryPtr->State != TreeEntry::NeedToGather)
+ continue;
+ for (Value *V : EntryPtr->Scalars)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
+ EntryPtr.get());
+ }
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+ SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
if (TE.State == TreeEntry::NeedToGather) {
@@ -7898,7 +8467,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
- InstructionCost C = getEntryCost(&TE, VectorizedVals);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
@@ -7951,7 +8520,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(void)ShuffleMasks.emplace_back();
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -7965,7 +8534,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
do {
IEBase = cast<InsertElementInst>(Base);
int Idx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[Idx] = Idx;
Base = IEBase->getOperand(0);
@@ -7985,7 +8554,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
int InIdx = *InsertIdx;
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[InIdx] = EU.Lane;
DemandedElts[VecId].setBit(InIdx);
continue;
@@ -8024,7 +8593,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(all_of(Mask,
[VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
!ShuffleVectorInst::isIdentityMask(Mask)))) {
- SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
C = TTI->getShuffleCost(
@@ -8110,17 +8679,23 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return std::nullopt;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
Instruction &UserInst =
getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
- auto *PHI = dyn_cast<PHINode>(&UserInst);
- auto *NodeUI = DT->getNode(
- PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
- : UserInst.getParent());
+ BasicBlock *ParentBB = nullptr;
+ // The main node of PHI entries keeps the correct order of operands/incoming
+ // blocks.
+ if (auto *PHI =
+ dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
+ ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
+ } else {
+ ParentBB = UserInst.getParent();
+ }
+ auto *NodeUI = DT->getNode(ParentBB);
assert(NodeUI && "Should only process reachable instructions");
SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
auto CheckOrdering = [&](Instruction *LastEI) {
@@ -8147,45 +8722,6 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return false;
return true;
};
- // Build a lists of values to tree entries.
- DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
- for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
- if (EntryPtr.get() == TE)
- continue;
- if (EntryPtr->State != TreeEntry::NeedToGather)
- continue;
- if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
- return GatheredScalars.contains(V);
- }))
- continue;
- assert(EntryPtr->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- Instruction &EntryUserInst =
- getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
- if (&UserInst == &EntryUserInst) {
- // If 2 gathers are operands of the same entry, compare operands indices,
- // use the earlier one as the base.
- if (TE->UserTreeIndices.front().UserTE ==
- EntryPtr->UserTreeIndices.front().UserTE &&
- TE->UserTreeIndices.front().EdgeIdx <
- EntryPtr->UserTreeIndices.front().EdgeIdx)
- continue;
- }
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
- auto *EntryI =
- EntryPHI
- ? EntryPHI
- ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
- ->getTerminator()
- : &EntryUserInst;
- if (!CheckOrdering(EntryI))
- continue;
- for (Value *V : EntryPtr->Scalars)
- if (!isConstant(V))
- ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
- }
// Find all tree entries used by the gathered values. If no common entries
// are found, this is not a shuffle.
// Here we build a set of tree nodes for each gathered value and try to
@@ -8195,16 +8731,58 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// have a permutation of 2 input vectors.
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
- for (Value *V : TE->Scalars) {
+ for (Value *V : VL) {
if (isConstant(V))
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
- auto It = ValueToTEs.find(V);
- if (It != ValueToTEs.end())
- VToTEs = It->second;
- if (const TreeEntry *VTE = getTreeEntry(V))
+ for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
+ if (TEPtr == TE)
+ continue;
+ assert(any_of(TEPtr->Scalars,
+ [&](Value *V) { return GatheredScalars.contains(V); }) &&
+ "Must contain at least single gathered value.");
+ assert(TEPtr->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ PHINode *EntryPHI =
+ dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
+ Instruction *EntryUserInst =
+ EntryPHI ? nullptr
+ : &getLastInstructionInBundle(
+ TEPtr->UserTreeIndices.front().UserTE);
+ if (&UserInst == EntryUserInst) {
+ assert(!EntryPHI && "Unexpected phi node entry.");
+ // If 2 gathers are operands of the same entry, compare operands
+ // indices, use the earlier one as the base.
+ if (TE->UserTreeIndices.front().UserTE ==
+ TEPtr->UserTreeIndices.front().UserTE &&
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx)
+ continue;
+ }
+ // Check if the user node of the TE comes after user node of EntryPtr,
+ // otherwise EntryPtr depends on TE.
+ auto *EntryI =
+ EntryPHI
+ ? EntryPHI
+ ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
+ ->getTerminator()
+ : EntryUserInst;
+ if ((ParentBB != EntryI->getParent() ||
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx ||
+ TE->UserTreeIndices.front().UserTE !=
+ TEPtr->UserTreeIndices.front().UserTE) &&
+ !CheckOrdering(EntryI))
+ continue;
+ VToTEs.insert(TEPtr);
+ }
+ if (const TreeEntry *VTE = getTreeEntry(V)) {
+ Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
+ if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
+ continue;
VToTEs.insert(VTE);
+ }
if (VToTEs.empty())
continue;
if (UsedTEs.empty()) {
@@ -8260,13 +8838,13 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != FirstEntries.end()) {
+ if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
Entries.push_back(*It);
std::iota(Mask.begin(), Mask.end(), 0);
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
- if (isa<PoisonValue>(TE->Scalars[I]))
- Mask[I] = UndefMaskElem;
+ if (isa<PoisonValue>(VL[I]))
+ Mask[I] = PoisonMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node from the
@@ -8302,10 +8880,18 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
break;
}
}
- // No 2 source vectors with the same vector factor - give up and do regular
- // gather.
- if (Entries.empty())
- return std::nullopt;
+ // No 2 source vectors with the same vector factor - just choose 2 with max
+ // index.
+ if (Entries.empty()) {
+ Entries.push_back(
+ *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
+ [](const TreeEntry *TE1, const TreeEntry *TE2) {
+ return TE1->Idx < TE2->Idx;
+ }));
+ Entries.push_back(SecondEntries.front());
+ VF = std::max(Entries.front()->getVectorFactor(),
+ Entries.back()->getVectorFactor());
+ }
}
bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
@@ -8427,19 +9013,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return std::nullopt;
}
-InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
- /*Extract*/ false, CostKind);
- if (NeedToShuffle)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
- return Cost;
-}
-
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
+ bool ForPoisonSrc) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -8451,20 +9026,36 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// shuffle candidates.
APInt ShuffledElements = APInt::getZero(VL.size());
DenseSet<Value *> UniqueElements;
- // Iterate in reverse order to consider insert elements with the high cost.
- for (unsigned I = VL.size(); I > 0; --I) {
- unsigned Idx = I - 1;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost Cost;
+ auto EstimateInsertCost = [&](unsigned I, Value *V) {
+ if (!ForPoisonSrc)
+ Cost +=
+ TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
+ I, Constant::getNullValue(VecTy), V);
+ };
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Value *V = VL[I];
// No need to shuffle duplicates for constants.
- if (isConstant(VL[Idx])) {
- ShuffledElements.setBit(Idx);
+ if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
+ ShuffledElements.setBit(I);
continue;
}
- if (!UniqueElements.insert(VL[Idx]).second) {
+ if (!UniqueElements.insert(V).second) {
DuplicateNonConst = true;
- ShuffledElements.setBit(Idx);
+ ShuffledElements.setBit(I);
+ continue;
}
+ EstimateInsertCost(I, V);
}
- return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
+ if (ForPoisonSrc)
+ Cost =
+ TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ if (DuplicateNonConst)
+ Cost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ return Cost;
}
// Perform operand reordering on the instructions in VL and return the reordered
@@ -8483,6 +9074,9 @@ void BoUpSLP::reorderInputsAccordingToOpcode(
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
+ auto &Res = EntryToLastInstruction.FindAndConstruct(E);
+ if (Res.second)
+ return *Res.second;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
// constant indices).
@@ -8497,7 +9091,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
isVectorLikeInstWithConstOps(I);
}));
- auto &&FindLastInst = [E, Front, this, &BB]() {
+ auto FindLastInst = [&]() {
Instruction *LastInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8508,9 +9102,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
LastInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(LastInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(LastInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
LastInst = I;
continue;
@@ -8531,7 +9127,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return LastInst;
};
- auto &&FindFirstInst = [E, Front, this]() {
+ auto FindFirstInst = [&]() {
Instruction *FirstInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8542,9 +9138,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
FirstInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -8566,22 +9164,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set the insert point to the beginning of the basic block if the entry
// should not be scheduled.
- if (E->State != TreeEntry::NeedToGather &&
- (doesNotNeedToSchedule(E->Scalars) ||
+ if (doesNotNeedToSchedule(E->Scalars) ||
+ (E->State != TreeEntry::NeedToGather &&
all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
- Instruction *InsertInst;
- if (all_of(E->Scalars, [](Value *V) {
+ if ((E->getOpcode() == Instruction::GetElementPtr &&
+ any_of(E->Scalars,
+ [](Value *V) {
+ return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
+ })) ||
+ all_of(E->Scalars, [](Value *V) {
return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
}))
- InsertInst = FindLastInst();
+ Res.second = FindLastInst();
else
- InsertInst = FindFirstInst();
- return *InsertInst;
+ Res.second = FindFirstInst();
+ return *Res.second;
}
- // The last instruction in the bundle in program order.
- Instruction *LastInst = nullptr;
-
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
@@ -8594,7 +9193,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
- LastInst = Bundle->Inst;
+ Res.second = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
@@ -8615,15 +9214,15 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
- if (!LastInst)
- LastInst = FindLastInst();
- assert(LastInst && "Failed to find last instruction in bundle");
- return *LastInst;
+ if (!Res.second)
+ Res.second = FindLastInst();
+ assert(Res.second && "Failed to find last instruction in bundle");
+ return *Res.second;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
auto *Front = E->getMainOp();
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
// If the instruction is PHI, set the insert point after all the PHIs.
bool IsPHI = isa<PHINode>(LastInst);
@@ -8641,7 +9240,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
@@ -8658,7 +9257,8 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
for (int I = 0, E = VL.size(); I < E; ++I) {
if (auto *Inst = dyn_cast<Instruction>(VL[I]))
if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
- getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+ getTreeEntry(Inst) ||
+ (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
PostponedIndices.insert(I).second)
PostponedInsts.emplace_back(Inst, I);
}
@@ -8681,7 +9281,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
- Value *Vec = PoisonValue::get(VecTy);
+ Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
for (int I = 0, E = VL.size(); I < E; ++I) {
@@ -8691,6 +9291,18 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
NonConsts.push_back(I);
continue;
}
+ if (Root) {
+ if (!isa<UndefValue>(VL[I])) {
+ NonConsts.push_back(I);
+ continue;
+ }
+ if (isa<PoisonValue>(VL[I]))
+ continue;
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
+ if (SV->getMaskValue(I) == PoisonMaskElem)
+ continue;
+ }
+ }
Vec = CreateInsertElement(Vec, VL[I], I);
}
// Insert non-constant values.
@@ -8789,6 +9401,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
return Vec;
}
+ Value *createIdentity(Value *V) { return V; }
+ Value *createPoison(Type *Ty, unsigned VF) {
+ return PoisonValue::get(FixedVectorType::get(Ty, VF));
+ }
/// Resizes 2 input vectors to match the sizes, if they are not equal
/// yet. The smallest vector is resized to the size of the larger vector.
void resizeToMatch(Value *&V1, Value *&V2) {
@@ -8798,7 +9414,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
int VF = std::max(V1VF, V2VF);
int MinVF = std::min(V1VF, V2VF);
- SmallVector<int> IdentityMask(VF, UndefMaskElem);
+ SmallVector<int> IdentityMask(VF, PoisonMaskElem);
std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
0);
Value *&Op = MinVF == V1VF ? V1 : V2;
@@ -8821,7 +9437,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert(V1 && "Expected at least one vector value.");
ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
R.CSEBlocks);
- return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
+ return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
+ ShuffleBuilder);
}
/// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -8829,7 +9446,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
ArrayRef<int> Mask) {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
}
@@ -8837,6 +9454,39 @@ public:
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
: Builder(Builder), R(R) {}
+ /// Adjusts extractelements after reusing them.
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+ Value *VecBase = nullptr;
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ int Idx = Mask[I];
+ if (Idx == PoisonMaskElem)
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ VecBase = EI->getVectorOperand();
+ // If the only use is vectorized, we can delete the extractelement
+ // itself.
+ if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
+ return !R.ScalarToTreeEntry.count(U);
+ }))
+ continue;
+ R.eraseInstruction(EI);
+ }
+ return VecBase;
+ }
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
+ // No need to delay emission if all deps are ready.
+ if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
+ return nullptr;
+ // Postpone gather emission; it will be emitted after the end of the
+ // process to keep the correct order.
+ auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
+ E->getVectorFactor());
+ return Builder.CreateAlignedLoad(
+ VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+ MaybeAlign());
+ }
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -8849,15 +9499,15 @@ public:
Value *Vec = InVectors.front();
if (InVectors.size() == 2) {
Vec = createShuffle(Vec, InVectors.back(), CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
} else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Mask.size()) {
Vec = createShuffle(Vec, nullptr, CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
}
V1 = createShuffle(V1, V2, Mask);
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx + Sz;
InVectors.front() = Vec;
if (InVectors.size() == 2)
@@ -8870,7 +9520,7 @@ public:
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
- CommonMask.assign(Mask.size(), UndefMaskElem);
+ CommonMask.assign(Mask.size(), PoisonMaskElem);
transformMaskAfterShuffle(CommonMask, Mask);
}
InVectors.push_back(V1);
@@ -8892,7 +9542,7 @@ public:
transformMaskAfterShuffle(CommonMask, CommonMask);
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem)
+ if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] =
V->getType() != V1->getType()
? Idx + Sz
@@ -8910,7 +9560,7 @@ public:
// Check if second vector is required if the used elements are already
// used from the first one.
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) {
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
InVectors.push_back(V1);
break;
}
@@ -8919,7 +9569,7 @@ public:
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
}
/// Adds another one input vector and the mask for the shuffling.
@@ -8928,17 +9578,46 @@ public:
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ return R.gather(VL, Root);
+ }
+ Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
/// Finalize emission of the shuffles.
+ /// \param Action the action (if any) to be performed before the final
+ /// application of the \p ExtMask mask.
Value *
- finalize(ArrayRef<int> ExtMask = std::nullopt) {
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
+ if (Action) {
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VecVF < VF) {
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
+ std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+ Vec = createShuffle(Vec, nullptr, ResizeMask);
+ }
+ Action(Vec, CommonMask);
+ InVectors.front() = Vec;
+ }
if (!ExtMask.empty()) {
if (CommonMask.empty()) {
CommonMask.assign(ExtMask.begin(), ExtMask.end());
} else {
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
NewMask[I] = CommonMask[ExtMask[I]];
}
@@ -9009,18 +9688,18 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
// ... (use %2)
// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
+ SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
SmallSet<int, 4> UsedIdxs;
int Pos = 0;
for (int Idx : VE->ReuseShuffleIndices) {
- if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem &&
+ if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
UsedIdxs.insert(Idx).second)
UniqueIdxs[Idx] = Pos;
++Pos;
}
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
"less than original vector size.");
- UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+ UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
V = FinalShuffle(V, UniqueIdxs);
} else {
assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
@@ -9031,6 +9710,21 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
V = FinalShuffle(V, UniformMask);
}
}
+ // Need to update the operand gather node if the operand is not actually a
+ // vectorized node but a buildvector/gather node that matches one of the
+ // vectorized nodes.
+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) == VE->UserTreeIndices.end()) {
+ auto *It = find_if(
+ VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ TE->UserTreeIndices.front().UserTE == E &&
+ TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
+ });
+ assert(It != VectorizableTree.end() && "Expected gather node operand.");
+ (*It)->VectorizedValue = V;
+ }
return V;
}
}
@@ -9049,108 +9743,370 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->getOpcode() != Instruction::InsertElement &&
E->getOpcode() != Instruction::PHI) {
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
Builder.SetInsertPoint(LastInst);
}
return vectorizeTree(I->get());
}
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+template <typename BVTy, typename ResTy, typename... Args>
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- SmallVector<Value *> Gathered(
- VF, PoisonValue::get(E->Scalars.front()->getType()));
bool NeedFreeze = false;
- SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the redorder indices and reorder scalars per this mask.
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
+ SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
SmallVector<int> ReorderMask;
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
- reorderScalars(VL, ReorderMask);
- SmallVector<int> ReuseMask(VF, UndefMaskElem);
- if (!allConstant(VL)) {
+ reorderScalars(GatheredScalars, ReorderMask);
+ auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+ if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }))
+ return false;
+ TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
+ unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+ if (UserTE->getNumOperands() != 2)
+ return false;
+ auto *It =
+ find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+ return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
+ }) != TE->UserTreeIndices.end();
+ });
+ if (It == VectorizableTree.end())
+ return false;
+ unsigned I =
+ *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
+ int Sz = Mask.size();
+ if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask))
+ std::iota(Mask.begin(), Mask.end(), 0);
+ else
+ std::fill(Mask.begin(), Mask.end(), I);
+ return true;
+ };
+ BVTy ShuffleBuilder(Params...);
+ ResTy Res = ResTy();
+ SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+ SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ if (!all_of(GatheredScalars, UndefValue::classof)) {
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+ bool Resized = false;
+ if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Gather extracts after we check for full matched gathers only.
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
+ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+ isSplat(E->Scalars) ||
+ (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ }
+ if (GatherShuffle) {
+ if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ // Delay emission of gathers which are not ready yet.
+ PostponedGathers.insert(E);
+ // Postpone gather emission; it will be emitted after the end of the
+ // process to keep the correct order.
+ return Delayed;
+ }
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: perfect diamond match for gather bundle that starts with "
+ << *E->Scalars.front() << ".\n");
+ // Restore the mask for previous partially matched values.
+ if (Entries.front()->ReorderIndices.empty() &&
+ ((Entries.front()->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == Entries.front()->Scalars.size()) ||
+ (E->Scalars.size() ==
+ Entries.front()->ReuseShuffleIndices.size()))) {
+ std::iota(Mask.begin(), Mask.end(), 0);
+ } else {
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
+ }
+ Mask[I] = Entries.front()->findLaneForValue(V);
+ }
+ }
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ Res = ShuffleBuilder.finalize(E->getCommonMask());
+ return Res;
+ }
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ }
+ }
+ auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
+ SmallVectorImpl<int> &ReuseMask,
+ bool IsRootPoison) {
// For splats we can emit broadcasts instead of gathers, so try to find
// such sequences.
- bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back());
+ bool IsSplat = IsRootPoison && isSplat(Scalars) &&
+ (Scalars.size() > 2 || Scalars.front() == Scalars.back());
+ Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
// Gather unique non-const values and all constant values.
// For repeated values, just shuffle them.
- for (auto [I, V] : enumerate(VL)) {
+ int NumNonConsts = 0;
+ int SinglePos = 0;
+ for (auto [I, V] : enumerate(Scalars)) {
if (isa<UndefValue>(V)) {
if (!isa<PoisonValue>(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
UndefPos.push_back(I);
}
continue;
}
if (isConstant(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
continue;
}
+ ++NumNonConsts;
+ SinglePos = I;
+ Value *OrigV = V;
+ Scalars[I] = PoisonValue::get(ScalarTy);
if (IsSplat) {
- Gathered.front() = V;
+ Scalars.front() = OrigV;
ReuseMask[I] = 0;
} else {
- const auto Res = UniquePositions.try_emplace(V, I);
- Gathered[Res.first->second] = V;
+ const auto Res = UniquePositions.try_emplace(OrigV, I);
+ Scalars[Res.first->second] = OrigV;
ReuseMask[I] = Res.first->second;
}
}
- if (!UndefPos.empty() && IsSplat) {
+ if (NumNonConsts == 1) {
+ // Restore single insert element.
+ if (IsSplat) {
+ ReuseMask.assign(VF, PoisonMaskElem);
+ std::swap(Scalars.front(), Scalars[SinglePos]);
+ if (!UndefPos.empty() && UndefPos.front() == 0)
+ Scalars.front() = UndefValue::get(ScalarTy);
+ }
+ ReuseMask[SinglePos] = SinglePos;
+ } else if (!UndefPos.empty() && IsSplat) {
// For undef values, try to replace them with the simple broadcast.
// We can do it if the broadcasted value is guaranteed to be
// non-poisonous, or by freezing the incoming scalar value first.
- auto *It = find_if(Gathered, [this, E](Value *V) {
+ auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
(getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
- any_of(V->uses(), [E](const Use &U) {
- // Check if the value already used in the same operation in
- // one of the nodes already.
- return E->UserTreeIndices.size() == 1 &&
- is_contained(
- E->UserTreeIndices.front().UserTE->Scalars,
- U.getUser()) &&
- E->UserTreeIndices.front().EdgeIdx != U.getOperandNo();
- }));
+ (E->UserTreeIndices.size() == 1 &&
+ any_of(V->uses(), [E](const Use &U) {
+ // Check if the value already used in the same operation in
+ // one of the nodes already.
+ return E->UserTreeIndices.front().EdgeIdx !=
+ U.getOperandNo() &&
+ is_contained(
+ E->UserTreeIndices.front().UserTE->Scalars,
+ U.getUser());
+ })));
});
- if (It != Gathered.end()) {
+ if (It != Scalars.end()) {
// Replace undefs by the non-poisoned scalars and emit broadcast.
- int Pos = std::distance(Gathered.begin(), It);
+ int Pos = std::distance(Scalars.begin(), It);
for_each(UndefPos, [&](int I) {
// Set the undef position to the non-poisoned scalar.
ReuseMask[I] = Pos;
- // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already.
+ // Replace the undef with poison; in the mask it is already replaced by
+ // the non-poisoned scalar.
if (I != Pos)
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
} else {
// Replace undefs by the poisons, emit broadcast and then emit
// freeze.
for_each(UndefPos, [&](int I) {
- ReuseMask[I] = UndefMaskElem;
- if (isa<UndefValue>(Gathered[I]))
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ ReuseMask[I] = PoisonMaskElem;
+ if (isa<UndefValue>(Scalars[I]))
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
NeedFreeze = true;
}
}
+ };
+ if (ExtractShuffle || GatherShuffle) {
+ bool IsNonPoisoned = true;
+ bool IsUsedInExpr = false;
+ Value *Vec1 = nullptr;
+ if (ExtractShuffle) {
+ // Gather of extractelements can be represented as just a shuffle of
+ // one or two vectors from which the scalars are extracted.
+ // Find input vectors.
+ Value *Vec2 = nullptr;
+ for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+ if (ExtractMask[I] == PoisonMaskElem ||
+ (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+ ExtractMask[I] = PoisonMaskElem;
+ continue;
+ }
+ if (isa<UndefValue>(E->Scalars[I]))
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ if (!Vec1) {
+ Vec1 = EI->getVectorOperand();
+ } else if (Vec1 != EI->getVectorOperand()) {
+ assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+ "Expected only 1 or 2 vectors shuffle.");
+ Vec2 = EI->getVectorOperand();
+ }
+ }
+ if (Vec2) {
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+ ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
+ } else if (Vec1) {
+ IsUsedInExpr = FindReusedSplat(ExtractMask);
+ ShuffleBuilder.add(Vec1, ExtractMask);
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
+ } else {
+ ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
+ ScalarTy, GatheredScalars.size())),
+ ExtractMask);
+ }
+ }
+ if (GatherShuffle) {
+ if (Entries.size() == 1) {
+ IsUsedInExpr = FindReusedSplat(Mask);
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+ } else {
+ ShuffleBuilder.add(Entries.front()->VectorizedValue,
+ Entries.back()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+ }
+ }
+ // Try to figure out the best way to combine values: build a shuffle and insert
+ // elements or just build several shuffles.
+ // Insert non-constant scalars.
+ SmallVector<Value *> NonConstants(GatheredScalars);
+ int EMSz = ExtractMask.size();
+ int MSz = Mask.size();
+ // Try to build a constant vector and shuffle with it only if currently we
+ // have a single permutation and more than one scalar constant.
+ bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsIdentityShuffle =
+ (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+ (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(Mask, [&](int I) { return I >= MSz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ bool EnoughConstsForShuffle =
+ IsSingleShuffle &&
+ (none_of(GatheredScalars,
+ [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }) ||
+ any_of(GatheredScalars,
+ [](Value *V) {
+ return isa<Constant>(V) && !isa<UndefValue>(V);
+ })) &&
+ (!IsIdentityShuffle ||
+ (GatheredScalars.size() == 2 &&
+ any_of(GatheredScalars,
+ [](Value *V) { return !isa<UndefValue>(V); })) ||
+ count_if(GatheredScalars, [](Value *V) {
+ return isa<Constant>(V) && !isa<PoisonValue>(V);
+ }) > 1);
+ // The NonConstants array contains just non-constant values, GatheredScalars
+ // contains only constants used to build the final vector and then shuffle.
+ for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+ if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
+ NonConstants[I] = PoisonValue::get(ScalarTy);
+ else
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ // Generate constants for final shuffle and build a mask for them.
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, BVMask);
+ }
+ if (all_of(NonConstants, [=](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (IsSingleShuffle && ((IsIdentityShuffle &&
+ IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+ }))
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ else
+ Res = ShuffleBuilder.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
+ Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ });
+ } else if (!allConstant(GatheredScalars)) {
+ // Gather unique scalars and all constants.
+ SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, ReuseMask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
- ReuseMask.clear();
- copy(VL, Gathered.begin());
+ // Gather all constants.
+ SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (!isa<PoisonValue>(V))
+ Mask[I] = I;
+ }
+ Value *BV = ShuffleBuilder.gather(E->Scalars);
+ ShuffleBuilder.add(BV, Mask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
}
- // Gather unique scalars and all constants.
- Value *Vec = gather(Gathered);
- ShuffleBuilder.add(Vec, ReuseMask);
- Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+
if (NeedFreeze)
- Vec = Builder.CreateFreeze(Vec);
- return Vec;
+ Res = ShuffleBuilder.createFreeze(Res);
+ return Res;
+}
+
+Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+ return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
+ *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -9161,10 +10117,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
+ if (E->State == TreeEntry::NeedToGather) {
+ if (E->getMainOp() && E->Idx == 0)
+ setInsertPointAfterBundle(E);
+ Value *Vec = createBuildVector(E);
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
+
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- if (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::Store) {
+ if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
@@ -9175,45 +10138,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
};
- if (E->State == TreeEntry::NeedToGather) {
- if (E->Idx > 0) {
- // We are in the middle of a vectorizable chain. We need to gather the
- // scalars from the users.
- Value *Vec = createBuildVector(E);
- E->VectorizedValue = Vec;
- return Vec;
- }
- if (E->getMainOp())
- setInsertPointAfterBundle(E);
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- Value *Vec;
- SmallVector<int> Mask;
- SmallVector<const TreeEntry *> Entries;
- std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
- if (Shuffle) {
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
- Entries.back()->VectorizedValue, Mask);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- } else {
- Vec = gather(E->Scalars);
- }
- Vec = FinalShuffle(Vec, E);
- E->VectorizedValue = Vec;
- return Vec;
- }
-
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
@@ -9248,7 +10172,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock *, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -9314,14 +10238,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, UndefMaskElem);
+ Mask.append(NumElts - NumScalars, PoisonMaskElem);
} else {
- Mask.assign(NumElts, UndefMaskElem);
+ Mask.assign(NumElts, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
}
// Create InsertVector shuffle if necessary
bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
Value *Scalar = E->Scalars[PrevMask[I]];
@@ -9337,9 +10261,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
}
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
for (unsigned I = 0; I < NumElts; I++) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
InsertMask[Offset + I] = I;
}
SmallBitVector UseMask =
@@ -9354,10 +10278,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
if (!IsFirstPoison.all()) {
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I))
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
InsertMask[I] = I + NumElts;
}
- }
+ }
V = Builder.CreateShuffleVector(
V,
IsFirstPoison.all() ? PoisonValue::get(V->getType())
@@ -9372,8 +10296,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallBitVector IsFirstPoison =
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem)
- InsertMask[I] = IsFirstPoison.test(I) ? UndefMaskElem : I;
+ if (InsertMask[I] == PoisonMaskElem)
+ InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
else
InsertMask[I] += NumElts;
}
@@ -9544,20 +10468,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
- unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
- Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
- NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+ NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
- // The pointer operand uses an in-tree scalar so we add the new BitCast
- // or LoadInst to ExternalUses list to make sure that an extract will
+ // The pointer operand uses an in-tree scalar so we add the new
+ // LoadInst to ExternalUses list to make sure that an extract will
// be generated in the future.
if (TreeEntry *Entry = getTreeEntry(PO)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(
- PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);
+ ExternalUses.emplace_back(PO, NewLI, FoundLane);
}
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
@@ -9653,7 +10574,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E);
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
@@ -9665,8 +10586,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
- SmallVector<Type *, 2> TysForDecl =
- {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
+ TysForDecl.push_back(
+ FixedVectorType::get(CI->getType(), E->Scalars.size()));
for (int j = 0, e = CI->arg_size(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
@@ -9808,14 +10732,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
default:
- llvm_unreachable("unknown inst");
+ llvm_unreachable("unknown inst");
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
ExtraValueToDebugLocsMap ExternallyUsedValues;
- return vectorizeTree(ExternallyUsedValues);
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
+ return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
namespace {
@@ -9829,28 +10754,51 @@ struct ShuffledInsertData {
};
} // namespace
-Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot) {
+Value *BoUpSLP::vectorizeTree(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
-
- // Pre-gather last instructions.
- for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) {
- if ((E->State == TreeEntry::NeedToGather &&
- (!E->getMainOp() || E->Idx > 0)) ||
- (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::ExtractValue) ||
- E->getOpcode() == Instruction::InsertElement)
- continue;
- Instruction *LastInst = &getLastInstructionInBundle(E.get());
- EntryToLastInstruction.try_emplace(E.get(), LastInst);
- }
+ // Clean the Entry-to-LastInstruction table. It can be affected by scheduling,
+ // so it needs to be rebuilt.
+ EntryToLastInstruction.clear();
Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
: &F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+ // Run through the list of postponed gathers and emit them, replacing the temp
+ // emitted allocas with actual vector instructions.
+ ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
+ DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
+ for (const TreeEntry *E : PostponedNodes) {
+ auto *TE = const_cast<TreeEntry *>(E);
+ if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
+ if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
+ TE->UserTreeIndices.front().EdgeIdx)))
+ // Found gather node which is absolutely the same as one of the
+ // vectorized nodes. It may happen after reordering.
+ continue;
+ auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
+ TE->VectorizedValue = nullptr;
+ auto *UserI =
+ cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
+ Builder.SetInsertPoint(PrevVec);
+ Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
+ Value *Vec = vectorizeTree(TE);
+ PrevVec->replaceAllUsesWith(Vec);
+ PostponedValues.try_emplace(Vec).first->second.push_back(TE);
+ // Replace the stub vector node if it was already used for one of the
+ // buildvector nodes.
+ auto It = PostponedValues.find(PrevVec);
+ if (It != PostponedValues.end()) {
+ for (TreeEntry *VTE : It->getSecond())
+ VTE->VectorizedValue = Vec;
+ }
+ eraseInstruction(PrevVec);
+ }
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
@@ -9968,14 +10916,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- auto &NewInstLocs = ExternallyUsedValues[NewInst];
- auto It = ExternallyUsedValues.find(Scalar);
- assert(It != ExternallyUsedValues.end() &&
- "Externally used scalar is not found in ExternallyUsedValues");
- NewInstLocs.append(It->second);
- ExternallyUsedValues.erase(Scalar);
// Required to update internally referenced instructions.
Scalar->replaceAllUsesWith(NewInst);
+ ReplacedExternals.emplace_back(Scalar, NewInst);
continue;
}
@@ -10004,7 +10947,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
ShuffledInserts.size() - 1);
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -10017,7 +10960,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
do {
IEBase = cast<InsertElementInst>(Base);
int IEIdx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[IEIdx] = IEIdx;
Base = IEBase->getOperand(0);
@@ -10035,7 +10978,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[Idx] = ExternalUse.Lane;
It->InsertElements.push_back(cast<InsertElementInst>(User));
continue;
@@ -10077,8 +11020,8 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
@@ -10103,9 +11046,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
return std::make_pair(Vec, true);
}
if (!ForSingleMask) {
- SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
for (unsigned I = 0; I < VF; ++I) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
ResizeMask[Mask[I]] = Mask[I];
}
Vec = CreateShuffle(Vec, nullptr, ResizeMask);
@@ -10308,14 +11251,14 @@ void BoUpSLP::optimizeGatherSequence() {
// registers.
unsigned LastUndefsCnt = 0;
for (int I = 0, E = NewMask.size(); I < E; ++I) {
- if (SM1[I] == UndefMaskElem)
+ if (SM1[I] == PoisonMaskElem)
++LastUndefsCnt;
else
LastUndefsCnt = 0;
- if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
NewMask[I] != SM1[I])
return false;
- if (NewMask[I] == UndefMaskElem)
+ if (NewMask[I] == PoisonMaskElem)
NewMask[I] = SM1[I];
}
// Check if the last undefs actually change the final number of used vector
@@ -10590,11 +11533,20 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
+ // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
+ // counted against the budget. Otherwise debug info could affect codegen.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
+ auto IsAssumeLikeIntr = [](const Instruction &I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ return II->isAssumeLikeIntrinsic();
+ return false;
+ };
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
&*DownIter != I) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
@@ -10604,6 +11556,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
++UpIter;
++DownIter;
+
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
}
if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
assert(I->getParent() == ScheduleStart->getParent() &&
@@ -10804,7 +11759,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
unsigned numAliased = 0;
unsigned DistToSrc = 1;
- for ( ; DepDest; DepDest = DepDest->NextLoadStore) {
+ for (; DepDest; DepDest = DepDest->NextLoadStore) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
@@ -11163,8 +12118,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(
- Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
+ MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
@@ -11223,8 +12178,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
// Round MaxBitWidth up to the next power-of-two.
- if (!isPowerOf2_64(MaxBitWidth))
- MaxBitWidth = NextPowerOf2(MaxBitWidth);
+ MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
@@ -11242,60 +12196,6 @@ void BoUpSLP::computeMinimumValueSizes() {
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
-namespace {
-
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- SLPVectorizerPass Impl;
-
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override { return false; }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
@@ -11536,7 +12436,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
@@ -11618,17 +12518,8 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
}
}
-bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B)
- return false;
- if (isa<InsertElementInst>(A) || isa<InsertElementInst>(B))
- return false;
- Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R);
-}
-
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool LimitForRegisterSize) {
+ bool MaxVFOnly) {
if (VL.size() < 2)
return false;
@@ -11663,7 +12554,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = R.getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
@@ -11690,21 +12581,17 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
- unsigned OpsWidth = 0;
+ unsigned ActualVF = std::min(MaxInst - I, VF);
- if (I + VF > MaxInst)
- OpsWidth = MaxInst - I;
- else
- OpsWidth = VF;
-
- if (!isPowerOf2_32(OpsWidth))
+ if (!isPowerOf2_32(ActualVF))
continue;
- if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
- (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+ if (MaxVFOnly && ActualVF < MaxVF)
+ break;
+ if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
break;
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Ops, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
@@ -11712,7 +12599,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}))
continue;
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
<< "\n");
R.buildTree(Ops);
@@ -11730,7 +12617,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
MinCost = std::min(MinCost, Cost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
- << " for VF=" << OpsWidth << "\n");
+ << " for VF=" << ActualVF << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
@@ -11806,14 +12693,14 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
}
if (Candidates.size() == 1)
- return tryToVectorizePair(Op0, Op1, R);
+ return tryToVectorizeList({Op0, Op1}, R);
// We have multiple options. Try to pick the single best.
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
return false;
- return tryToVectorizePair(Candidates[*BestCandidate].first,
- Candidates[*BestCandidate].second, R);
+ return tryToVectorizeList(
+ {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
namespace {
@@ -11857,6 +12744,9 @@ class HorizontalReduction {
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
+ /// Checks if the optimization of original scalar identity operations on
+ /// matched horizontal reductions is enabled and allowed.
+ bool IsSupportedHorRdxIdentityOp = false;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -11888,6 +12778,9 @@ class HorizontalReduction {
return I->getFastMathFlags().noNaNs();
}
+ if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
+ return true;
+
return I->isAssociative();
}
@@ -11905,6 +12798,7 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ bool IsConstant = isConstant(LHS) && isConstant(RHS);
switch (Kind) {
case RecurKind::Or:
if (UseSelect &&
@@ -11926,29 +12820,49 @@ class HorizontalReduction {
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::FMax:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
+ case RecurKind::FMaximum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maximum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
+ case RecurKind::FMinimum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minimum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
case RecurKind::SMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
case RecurKind::SMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
case RecurKind::UMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
case RecurKind::UMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
@@ -11984,6 +12898,7 @@ class HorizontalReduction {
return Op;
}
+public:
static RecurKind getRdxKind(Value *V) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
@@ -12010,6 +12925,10 @@ class HorizontalReduction {
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
+ return RecurKind::FMaximum;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
+ return RecurKind::FMinimum;
// This matches either cmp+select or intrinsics. SLP is expected to handle
// either form.
// TODO: If we are canonicalizing to intrinsics, we can remove several
@@ -12086,6 +13005,7 @@ class HorizontalReduction {
return isCmpSelMinMax(I) ? 1 : 0;
}
+private:
/// Total number of operands in the reduction operation.
static unsigned getNumberOfOperands(Instruction *I) {
return isCmpSelMinMax(I) ? 3 : 2;
@@ -12134,17 +13054,6 @@ class HorizontalReduction {
}
}
- static Value *getLHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I));
- }
- static Value *getRHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I) + 1);
- }
-
static bool isGoodForReduction(ArrayRef<Value *> Data) {
int Sz = Data.size();
auto *I = dyn_cast<Instruction>(Data.front());
@@ -12156,65 +13065,39 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
const TargetLibraryInfo &TLI) {
- assert((!Phi || is_contained(Phi->operands(), Inst)) &&
- "Phi needs to use the binary operator");
- assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
- isa<IntrinsicInst>(Inst)) &&
- "Expected binop, select, or intrinsic for reduction matching");
- RdxKind = getRdxKind(Inst);
-
- // We could have a initial reductions that is not an add.
- // r *= v1 + v2 + v3 + v4
- // In such a case start looking for a tree rooted in the first '+'.
- if (Phi) {
- if (getLHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- } else if (getRHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- }
- }
-
- if (!isVectorizable(RdxKind, Inst))
+ RdxKind = HorizontalReduction::getRdxKind(Root);
+ if (!isVectorizable(RdxKind, Root))
return false;
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = Inst->getType();
+ Type *Ty = Root->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
// Though the ultimate reduction may have multiple uses, its condition must
// have only single use.
- if (auto *Sel = dyn_cast<SelectInst>(Inst))
+ if (auto *Sel = dyn_cast<SelectInst>(Root))
if (!Sel->getCondition()->hasOneUse())
return false;
- ReductionRoot = Inst;
+ ReductionRoot = Root;
// Iterate through all the operands of the possible reduction tree and
// gather all the reduced values, sorting them by their value id.
- BasicBlock *BB = Inst->getParent();
- bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
- SmallVector<Instruction *> Worklist(1, Inst);
+ BasicBlock *BB = Root->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Root);
+ SmallVector<Instruction *> Worklist(1, Root);
// Checks if the operands of the \p TreeN instruction are also reduction
// operations or should be treated as reduced values or an extra argument,
// which is not part of the reduction.
- auto &&CheckOperands = [this, IsCmpSelMinMax,
- BB](Instruction *TreeN,
- SmallVectorImpl<Value *> &ExtraArgs,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps) {
+ auto CheckOperands = [&](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
for (int I = getFirstOperandIndex(TreeN),
End = getNumberOfOperands(TreeN);
I < End; ++I) {
@@ -12229,10 +13112,14 @@ public:
}
// If the edge is not an instruction, or it is different from the main
// reduction opcode or has too many uses - possible reduced value.
+ // Also, do not try to reduce constant values if the operation is not
+ // foldable.
if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ !isVectorizable(RdxKind, EdgeInst) ||
+ (R.isAnalyzedReductionRoot(EdgeInst) &&
+ all_of(EdgeInst->operands(), Constant::classof))) {
PossibleReducedVals.push_back(EdgeVal);
continue;
}
@@ -12246,10 +13133,43 @@ public:
// instructions (grouping them by the predicate).
MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
PossibleReducedVals;
- initReductionOps(Inst);
+ initReductionOps(Root);
DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
SmallSet<size_t, 2> LoadKeyUsed;
SmallPtrSet<Value *, 4> DoNotReverseVals;
+
+ auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(Ptr);
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ DoNotReverseVals.insert(RLI);
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ DoNotReverseVals.insert(LIt->second.back());
+ return SubKey;
+ }
+ }
+ }
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
+ return hash_value(LI->getPointerOperand());
+ };
+
while (!Worklist.empty()) {
Instruction *TreeN = Worklist.pop_back_val();
SmallVector<Value *> Args;
@@ -12269,41 +13189,8 @@ public:
// results.
for (Value *V : PossibleRedVals) {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- V, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(
- RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey =
- hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(V, 0))
.first->second;
@@ -12312,40 +13199,8 @@ public:
PossibleReductionOps.rend());
} else {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- TreeN, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(RLI->getType(),
- RLI->getPointerOperand(), LI->getType(),
- LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(TreeN, 0))
.first->second;
@@ -12407,14 +13262,18 @@ public:
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- size_t NumReducedVals =
+ unsigned NumReducedVals =
std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
- [](size_t Num, ArrayRef<Value *> Vals) {
+ [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
if (!isGoodForReduction(Vals))
return Num;
return Num + Vals.size();
});
- if (NumReducedVals < ReductionLimit) {
+ if (NumReducedVals < ReductionLimit &&
+ (!AllowHorRdxIdenityOptimization ||
+ all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
+ return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+ }))) {
for (ReductionOpsType &RdxOps : ReductionOps)
for (Value *RdxOp : RdxOps)
V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -12428,6 +13287,7 @@ public:
DenseMap<Value *, WeakTrackingVH> TrackedVals(
ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
// The same extra argument may be used several times, so log each attempt
// to use it.
@@ -12448,6 +13308,18 @@ public:
return cast<Instruction>(ScalarCond);
};
+ // Return new VectorizedTree, based on previous value.
+ auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
+ if (VectorizedTree) {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
+ ReductionOps);
+ }
+ // Initialize the final value in the reduction.
+ return Res;
+ };
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -12459,6 +13331,12 @@ public:
continue;
IgnoreList.insert(RdxOp);
}
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
// Need to track reduced vals, they may be changed during vectorization of
@@ -12519,16 +13397,82 @@ public:
}
}
}
+
+ // Emit code for constant values.
+ if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
+ allConstant(Candidates)) {
+ Value *Res = Candidates.front();
+ ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
+ for (Value *VC : ArrayRef(Candidates).drop_front()) {
+ Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
+ ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ V.analyzedReductionRoot(ResI);
+ }
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
+ continue;
+ }
+
unsigned NumReducedVals = Candidates.size();
- if (NumReducedVals < ReductionLimit)
+ if (NumReducedVals < ReductionLimit &&
+ (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
+ !isSplat(Candidates)))
continue;
+ // Check if we support processing of repeated scalar values (optimization of
+ // original scalar identity operations on matched horizontal reductions).
+ IsSupportedHorRdxIdentityOp =
+ AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
+ RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+ // Gather same values.
+ MapVector<Value *, unsigned> SameValuesCounter;
+ if (IsSupportedHorRdxIdentityOp)
+ for (Value *V : Candidates)
+ ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
+ // Used to check if the reduced values are used the same number of times. In
+ // that case the compiler may produce better code. E.g. if the reduced values
+ // are aabbccdd (8 x values), then the first node of the tree will have a
+ // node for 4 x abcd plus a shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
+ // Plus, the final reduction will be performed on <8 x aabbccdd>.
+ // Instead, the compiler may build a <4 x abcd> tree immediately and then
+ // multiply the result of the (4 x abcd) reduction by 2.
+ // Currently this only handles add/fadd/xor. and/or/min/max do not require
+ // this analysis; other operations may require an extra profitability
+ // estimation.
+ bool SameScaleFactor = false;
+ bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
+ SameValuesCounter.size() != Candidates.size();
+ if (OptReusedScalars) {
+ SameScaleFactor =
+ (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
+ RdxKind == RecurKind::Xor) &&
+ all_of(drop_begin(SameValuesCounter),
+ [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
+ return P.second == SameValuesCounter.front().second;
+ });
+ Candidates.resize(SameValuesCounter.size());
+ transform(SameValuesCounter, Candidates.begin(),
+ [](const auto &P) { return P.first; });
+ NumReducedVals = Candidates.size();
+ // Have a reduction of the same element.
+ if (NumReducedVals == 1) {
+ Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
+ unsigned Cnt = SameValuesCounter.lookup(OrigV);
+ Value *RedVal =
+ emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ VectorizedVals.try_emplace(OrigV, Cnt);
+ continue;
+ }
+ }
+
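The reused-scalars handling above rests on the identity that a reduction of values repeated with a common factor equals the reduction of the unique values scaled by that factor. A small standalone sketch of that arithmetic for an integer add reduction (hypothetical example, not part of the patch):

#include <cstdio>

int main() {
  // "aabbccdd": every unique value is repeated with the same scale factor (2).
  int Vals[] = {1, 1, 2, 2, 3, 3, 4, 4};
  int Full = 0;
  for (int V : Vals)
    Full += V;                  // straightforward 8-element reduction
  int Unique = 1 + 2 + 3 + 4;   // 4-element reduction of the unique values
  int Scaled = Unique * 2;      // then multiply by the common repeat count
  std::printf("%d == %d\n", Full, Scaled); // both are 20
  return 0;
}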
unsigned MaxVecRegSize = V.getMaxVecRegSize();
unsigned EltSize = V.getVectorElementSize(Candidates[0]);
- unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts =
+ RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned ReduxWidth = std::min<unsigned>(
- PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+ llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
unsigned Start = 0;
unsigned Pos = Start;
// Restarts vectorization attempt with lower vector factor.
@@ -12551,6 +13495,7 @@ public:
ReduxWidth /= 2;
return IsAnyRedOpGathered;
};
+ bool AnyVectorized = false;
while (Pos < NumReducedVals - ReduxWidth + 1 &&
ReduxWidth >= ReductionLimit) {
// Dependency in tree of the reduction ops - drop this attempt, try
@@ -12603,34 +13548,24 @@ public:
LocalExternallyUsedValues[TrackedVals[V]];
});
}
- // Number of uses of the candidates in the vector of values.
- SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
+ if (!IsSupportedHorRdxIdentityOp) {
+ // Number of uses of the candidates in the vector of values.
+ assert(SameValuesCounter.empty() &&
+ "Reused values counter map is not empty");
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+ continue;
+ Value *V = Candidates[Cnt];
+ Value *OrigV = TrackedToOrig.find(V)->second;
+ ++SameValuesCounter[OrigV];
+ }
}
SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
// Gather externally used values.
SmallPtrSet<Value *, 4> Visited;
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *RdxVal = Candidates[Cnt];
- if (!Visited.insert(RdxVal).second)
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
continue;
- // Check if the scalar was vectorized as part of the vectorization
- // tree but not the top node.
- if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
- LocalExternallyUsedValues[RdxVal];
- continue;
- }
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
- LocalExternallyUsedValues[RdxVal];
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
Value *RdxVal = Candidates[Cnt];
if (!Visited.insert(RdxVal).second)
continue;
@@ -12640,42 +13575,34 @@ public:
LocalExternallyUsedValues[RdxVal];
continue;
}
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ unsigned NumOps =
+ VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
+ if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
LocalExternallyUsedValues[RdxVal];
}
+ // Do not need the list of reused scalars in regular mode anymore.
+ if (!IsSupportedHorRdxIdentityOp)
+ SameValuesCounter.clear();
for (Value *RdxVal : VL)
if (RequiredExtract.contains(RdxVal))
LocalExternallyUsedValues[RdxVal];
+ // Update LocalExternallyUsedValues for the scalars replaced by
+ // extractelement instructions.
+ for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
+ auto It = ExternallyUsedValues.find(Pair.first);
+ if (It == ExternallyUsedValues.end())
+ continue;
+ LocalExternallyUsedValues[Pair.second].append(It->second);
+ }
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (Value *U : IgnoreList)
- if (auto *FPMO = dyn_cast<FPMathOperator>(U))
- RdxFMF &= FPMO->getFastMathFlags();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
- if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
- Instruction *MainOp = V.getFirstNodeMainOp();
- for (Value *V : VL) {
- auto *VI = dyn_cast<LoadInst>(V);
- // Add the costs of scalar GEP pointers, to be removed from the
- // code.
- if (!VI || VI == MainOp)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
- if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
- continue;
- TreeCost -= TTI->getArithmeticInstrCost(
- Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
- }
- }
+ getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
if (!Cost.isValid())
@@ -12716,8 +13643,8 @@ public:
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
// Vectorize a tree.
- Value *VectorizedRoot =
- V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
+ ReplacedExternals, InsertPt);
Builder.SetInsertPoint(InsertPt);
@@ -12727,29 +13654,48 @@ public:
if (isBoolLogicOp(RdxRootInst))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // Emit code to correctly handle reused reduced values, if required.
+ if (OptReusedScalars && !SameScaleFactor) {
+ VectorizedRoot =
+ emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
+ SameValuesCounter, TrackedToOrig);
+ }
+
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
- }
+ // Improved analysis for add/fadd/xor reductions with the same scale factor
+ // for all reduction operands. We can emit scalar ops for them instead.
+ if (OptReusedScalars && SameScaleFactor)
+ ReducedSubTree = emitScaleForReusedOps(
+ ReducedSubTree, Builder, SameValuesCounter.front().second);
+
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
- ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
- .first->getSecond();
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ if (IsSupportedHorRdxIdentityOp) {
+ VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
+ continue;
+ }
+ ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
if (!V.isVectorized(RdxVal))
RequiredExtract.insert(RdxVal);
}
Pos += ReduxWidth;
Start = Pos;
- ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
+ ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+ AnyVectorized = true;
+ }
+ if (OptReusedScalars && !AnyVectorized) {
+ for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
+ Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ Value *OrigV = TrackedToOrig.find(P.first)->second;
+ VectorizedVals.try_emplace(OrigV, P.second);
+ }
+ continue;
}
}
if (VectorizedTree) {
@@ -12757,7 +13703,7 @@ public:
// possible problem with poison propagation. If not possible to reorder
// (both operands are originally RHS), emit an extra freeze instruction
// for the LHS operand.
- //I.e., if we have original code like this:
+ // I.e., if we have original code like this:
// RedOp1 = select i1 ?, i1 LHS, i1 false
// RedOp2 = select i1 RHS, i1 ?, i1 false
@@ -12892,7 +13838,8 @@ private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
- unsigned ReduxWidth, FastMathFlags FMF) {
+ bool IsCmpSelMinMax, unsigned ReduxWidth,
+ FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
@@ -12900,7 +13847,36 @@ private:
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.
- bool AllConsts = all_of(ReducedVals, isConstant);
+ bool AllConsts = allConstant(ReducedVals);
+ auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+ InstructionCost Cost = 0;
+ // Scalar cost is repeated for N-1 elements.
+ int Cnt = ReducedVals.size();
+ for (Value *RdxVal : ReducedVals) {
+ if (Cnt == 1)
+ break;
+ --Cnt;
+ if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+ Cost += GenCostFn();
+ continue;
+ }
+ InstructionCost ScalarCost = 0;
+ for (User *U : RdxVal->users()) {
+ auto *RdxOp = cast<Instruction>(U);
+ if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+ continue;
+ }
+ ScalarCost = InstructionCost::getInvalid();
+ break;
+ }
+ if (ScalarCost.isValid())
+ Cost += ScalarCost;
+ else
+ Cost += GenCostFn();
+ }
+ return Cost;
+ };
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -12913,52 +13889,32 @@ private:
if (!AllConsts)
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
- ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ });
break;
}
case RecurKind::FMax:
- case RecurKind::FMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost =
- TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsUnsigned=*/false, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
- break;
- }
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- bool IsUnsigned =
- RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- IsUnsigned, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
+ Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
+ if (!AllConsts)
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+ return TTI->getIntrinsicInstrCost(ICA, CostKind);
+ });
break;
}
default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
- // Scalar cost is repeated for N-1 elements.
- ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
@@ -12977,8 +13933,148 @@ private:
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
}
-};
+ /// Emits optimized code for unique scalar value reused \p Cnt times.
+ Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ unsigned Cnt) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // res = mul vv, n
+ Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? 0 : vv
+ LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
+ << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ return Constant::getNullValue(VectorizedValue->getType());
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ return VectorizedValue;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ return nullptr;
+ }
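For reference, the per-kind rules implemented by emitScaleForReusedOps can be checked with ordinary scalar arithmetic: an add of Cnt copies is a multiply by Cnt, and an xor of Cnt copies is zero when Cnt is even. A minimal sketch under those assumptions (illustrative only, not part of the patch):

#include <cassert>

// Scalar models of the scaling rules: add of Cnt copies -> multiply by Cnt,
// xor of Cnt copies -> 0 when Cnt is even, the value itself when Cnt is odd.
static int scaleAdd(int V, unsigned Cnt) { return V * static_cast<int>(Cnt); }
static int scaleXor(int V, unsigned Cnt) { return Cnt % 2 == 0 ? 0 : V; }

int main() {
  assert(scaleAdd(5, 3) == 5 + 5 + 5);       // add of 3 copies == mul by 3
  assert(scaleXor(7, 4) == (7 ^ 7 ^ 7 ^ 7)); // even xor count == 0
  assert(scaleXor(7, 3) == (7 ^ 7 ^ 7));     // odd xor count == the value
  return 0;
}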
+
+ /// Emits actual operation for the scalar identity values, found during
+ /// horizontal reduction analysis.
+ Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ ArrayRef<Value *> VL,
+ const MapVector<Value *, unsigned> &SameValuesCounter,
+ const DenseMap<Value *, Value *> &TrackedToOrig) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // root = mul prev_root, <1, 1, n, 1>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ // No need for multiple or/and(s).
+ LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // No need for multiple min/max(s) of the same value.
+ LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::Xor: {
+ // Replace values that have an even number of repeats with 0, since
+ // x xor x = 0.
+ // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
+ // 7>, if the 4th and 6th elements have an even number of repeats.
+ SmallVector<int> Mask(
+ cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
+ PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ bool NeedShuffle = false;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Value *V = VL[I];
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ if (Cnt % 2 == 0) {
+ Mask[I] = VF;
+ NeedShuffle = true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
+ : Mask) dbgs()
+ << I << " ";
+ dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
+ if (NeedShuffle)
+ VectorizedValue = Builder.CreateShuffleVector(
+ VectorizedValue,
+ ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantFP::get(V->getType(), Cnt));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for reused scalars.");
+ }
+ return nullptr;
+ }
+};
} // end anonymous namespace
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
@@ -13075,15 +14171,15 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
return false;
}
-/// Try and get a reduction value from a phi node.
+/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
-static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
- BasicBlock *ParentBB, LoopInfo *LI) {
+static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
@@ -13092,13 +14188,13 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
- Value *Rdx = nullptr;
+ Instruction *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == ParentBB) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13115,9 +14211,9 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == BBLatch) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13133,6 +14229,10 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return true;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
return true;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
+ return true;
if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
return true;
if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
@@ -13144,21 +14244,63 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
+/// We could have an initial reduction that is not an add.
+/// r *= v1 + v2 + v3 + v4
+/// In such a case start looking for a tree rooted in the first '+'.
+/// \returns the new root if found, which may be nullptr if not an instruction.
+static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
+ Instruction *Root) {
+ assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
+ isa<IntrinsicInst>(Root)) &&
+ "Expected binop, select, or intrinsic for reduction matching");
+ Value *LHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
+ Value *RHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
+ if (LHS == Phi)
+ return dyn_cast<Instruction>(RHS);
+ if (RHS == Phi)
+ return dyn_cast<Instruction>(LHS);
+ return nullptr;
+}
+
+/// \returns the first operand of \p I that does not match \p Phi, or nullptr
+/// if that operand is not an instruction.
+static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
+ Value *Op0 = nullptr;
+ Value *Op1 = nullptr;
+ if (!matchRdxBop(I, Op0, Op1))
+ return nullptr;
+ return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
+}
+
+/// \returns true if \p I is a candidate instruction for reduction vectorization.
+static bool isReductionCandidate(Instruction *I) {
+ bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
+ Value *B0 = nullptr, *B1 = nullptr;
+ bool IsBinop = matchRdxBop(I, B0, B1);
+ return IsBinop || IsSelect;
+}
+
bool SLPVectorizerPass::vectorizeHorReduction(
- PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
if (!ShouldVectorizeHor)
return false;
+ bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
- auto *Root = dyn_cast_or_null<Instruction>(V);
- if (!Root)
+ if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
- if (!isa<BinaryOperator>(Root))
- P = nullptr;
+ // If we can find a secondary reduction root, use that instead.
+ auto SelectRoot = [&]() {
+ if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
+ HorizontalReduction::getRdxKind(Root) != RecurKind::None)
+ if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
+ return NewRoot;
+ return Root;
+ };
- if (Root->getParent() != BB || isa<PHINode>(Root))
- return false;
// Start analysis starting from Root instruction. If horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction or
// vectorization is not possible or not effective, and currently analyzed
@@ -13171,22 +14313,32 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// If a horizintal reduction was not matched or vectorized we collect
// instructions for possible later attempts for vectorization.
std::queue<std::pair<Instruction *, unsigned>> Stack;
- Stack.emplace(Root, 0);
+ Stack.emplace(SelectRoot(), 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
- auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
- bool IsBinop = matchRdxBop(Inst, B0, B1);
- bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
- if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))
- return HorRdx.tryToReduce(R, TTI, *TLI);
+ if (!isReductionCandidate(Inst))
+ return nullptr;
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+ return nullptr;
+ return HorRdx.tryToReduce(R, TTI, *TLI);
+ };
+ auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
+ if (TryOperandsAsNewSeeds && FutureSeed == Root) {
+ FutureSeed = getNonPhiOperand(Root, P);
+ if (!FutureSeed)
+ return false;
}
- return nullptr;
+ // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
+ // analysis is done separately.
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
+ PostponedInsts.push_back(FutureSeed);
+ return true;
};
+
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
@@ -13197,37 +14349,19 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// iteration while stack was populated before that happened.
if (R.isDeleted(Inst))
continue;
- Value *B0 = nullptr, *B1 = nullptr;
- if (Value *V = TryToReduce(Inst, B0, B1)) {
+ if (Value *VectorizedV = TryToReduce(Inst)) {
Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- if (auto *I = dyn_cast<Instruction>(V)) {
+ if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
// Try to find another reduction.
Stack.emplace(I, Level);
continue;
}
} else {
- bool IsBinop = B0 && B1;
- if (P && IsBinop) {
- Inst = dyn_cast<Instruction>(B0);
- if (Inst == P)
- Inst = dyn_cast<Instruction>(B1);
- if (!Inst) {
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
+ // We could not vectorize `Inst` so try to use it as a future seed.
+ if (!TryAppendToPostponedInsts(Inst)) {
+ assert(Stack.empty() && "Expected empty stack");
+ break;
}
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
- // analysis is done separately.
- if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
- PostponedInsts.push_back(Inst);
}
// Try to vectorize operands.
@@ -13246,11 +14380,11 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
SmallVector<WeakTrackingVH> PostponedInsts;
- bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts);
+ bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);
return Res;
}
@@ -13297,13 +14431,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
}
template <typename T>
-static bool
-tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
- function_ref<unsigned(T *)> Limit,
- function_ref<bool(T *, T *)> Comparator,
- function_ref<bool(T *, T *)> AreCompatible,
- function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
- bool LimitForRegisterSize) {
+static bool tryToVectorizeSequence(
+ SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
+ function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
+ bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
// Sort by type, parent, operands.
stable_sort(Incoming, Comparator);
@@ -13331,21 +14463,29 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (NumElts < Limit(*IncIt) &&
- (Candidates.empty() ||
- Candidates.front()->getType() == (*IncIt)->getType())) {
- Candidates.append(IncIt, std::next(IncIt, NumElts));
+ } else {
+ /// \returns the minimum number of elements that we will attempt to
+ /// vectorize.
+ auto GetMinNumElements = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+ if (NumElts < GetMinNumElements(*IncIt) &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
}
// Final attempt to vectorize instructions with the same types.
if (Candidates.size() > 1 &&
(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
- if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
+ if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (LimitForRegisterSize) {
+ } else if (MaxVFOnly) {
// Try to vectorize using small vectors.
for (auto *It = Candidates.begin(), *End = Candidates.end();
It != End;) {
@@ -13353,9 +14493,8 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(It, NumElts),
- /*LimitForRegisterSize=*/false))
+ if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
+ /*MaxVFOnly=*/false))
Changed = true;
It = SameTypeIt;
}
@@ -13378,11 +14517,12 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
/// of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
- function_ref<bool(Instruction *)> IsDeleted) {
+ const DominatorTree &DT) {
+ assert(isValidElementType(V->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected valid element types only.");
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
- if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
if (CI1->getOperand(0)->getType()->getTypeID() <
CI2->getOperand(0)->getType()->getTypeID())
return !IsCompatibility;
@@ -13411,31 +14551,102 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
return false;
if (auto *I1 = dyn_cast<Instruction>(Op1))
if (auto *I2 = dyn_cast<Instruction>(Op2)) {
- if (I1->getParent() != I2->getParent())
- return false;
+ if (IsCompatibility) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ } else {
+ // Try to compare nodes with same parent.
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ }
InstructionsState S = getSameOpcode({I1, I2}, TLI);
- if (S.getOpcode())
+ if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
continue;
- return false;
+ return !IsCompatibility && I1->getOpcode() < I2->getOpcode();
}
}
return IsCompatibility;
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
- BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
+template <typename ItT>
+bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
+ BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ // Try to find reductions first.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ if (auto *RootOp = dyn_cast<Instruction>(Op))
+ Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ Changed |= tryToVectorize(I, R);
+ }
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ auto CompareSorter = [&](Value *V, Value *V2) {
+ if (V == V2)
+ return false;
+ return compareCmp<false>(V, V2, *TLI, *DT);
+ };
+
+ auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ return compareCmp<true>(V1, V2, *TLI, *DT);
+ };
+
+ SmallVector<Value *> Vals;
+ for (Instruction *V : CmpInsts)
+ if (!R.isDeleted(V) && isValidElementType(V->getType()))
+ Vals.push_back(V);
+ if (Vals.size() <= 1)
+ return Changed;
+ Changed |= tryToVectorizeSequence<Value>(
+ Vals, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ auto *Select = dyn_cast<SelectInst>(U);
+ return Select &&
+ Select->getParent() != cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
+ },
+ /*MaxVFOnly=*/true, R);
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
+ BasicBlock *BB, BoUpSLP &R) {
+ assert(all_of(Instructions,
+ [](auto *I) {
+ return isa<InsertElementInst, InsertValueInst>(I);
+ }) &&
+ "This function only accepts Insert instructions");
bool OpsChanged = false;
- SmallVector<Instruction *, 4> PostponedCmps;
SmallVector<WeakTrackingVH> PostponedInsts;
// pass1 - try to vectorize reductions only
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (isa<CmpInst>(I)) {
- PostponedCmps.push_back(I);
- continue;
- }
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
}
// pass2 - try to match and vectorize a buildvector sequence.
@@ -13451,63 +14662,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
// Now try to vectorize postponed instructions.
OpsChanged |= tryToVectorize(PostponedInsts, R);
- if (AtTerminator) {
- // Try to find reductions first.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- for (Value *Op : I->operands())
- OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
- }
- // Try to vectorize operands as vector bundles.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- OpsChanged |= tryToVectorize(I, R);
- }
- // Try to vectorize list of compares.
- // Sort by type, compare predicate, etc.
- auto CompareSorter = [&](Value *V, Value *V2) {
- return compareCmp<false>(V, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
-
- auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
- if (V1 == V2)
- return true;
- return compareCmp<true>(V1, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
-
- SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
- OpsChanged |= tryToVectorizeSequence<Value>(
- Vals, Limit, CompareSorter, AreCompatibleCompares,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- // Exclude possible reductions from other blocks.
- bool ArePossiblyReducedInOtherBlock =
- any_of(Candidates, [](Value *V) {
- return any_of(V->users(), [V](User *U) {
- return isa<SelectInst>(U) &&
- cast<SelectInst>(U)->getParent() !=
- cast<Instruction>(V)->getParent();
- });
- });
- if (ArePossiblyReducedInOtherBlock)
- return false;
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
- },
- /*LimitForRegisterSize=*/true);
- Instructions.clear();
- } else {
- Instructions.clear();
- // Insert in reverse order since the PostponedCmps vector was filled in
- // reverse order.
- Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());
- }
+ Instructions.clear();
return OpsChanged;
}
@@ -13603,10 +14758,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return true;
};
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
bool HaveVectorizedPhiNodes = false;
do {
@@ -13648,19 +14799,44 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
- Incoming, Limit, PHICompare, AreCompatiblePHIs,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ Incoming, PHICompare, AreCompatiblePHIs,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
},
- /*LimitForRegisterSize=*/true);
+ /*MaxVFOnly=*/true, R);
Changed |= HaveVectorizedPhiNodes;
VisitedInstrs.insert(Incoming.begin(), Incoming.end());
} while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
- InstSetVector PostProcessInstructions;
- SmallDenseSet<Instruction *, 4> KeyNodes;
+ InstSetVector PostProcessInserts;
+ SmallSetVector<CmpInst *, 8> PostProcessCmps;
+ // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
+ // also vectorizes `PostProcessCmps`.
+ auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+ bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
+ if (VectorizeCmps) {
+ Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
+ PostProcessCmps.clear();
+ }
+ PostProcessInserts.clear();
+ return Changed;
+ };
+ // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
+ auto IsInPostProcessInstrs = [&](Instruction *I) {
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ return PostProcessCmps.contains(Cmp);
+ return isa<InsertElementInst, InsertValueInst>(I) &&
+ PostProcessInserts.contains(I);
+ };
+ // Returns true if `I` is an instruction without users, such as a terminator,
+ // a store, or a call with an ignored return value. Detection is based on the
+ // instruction type, except for CallInst and InvokeInst.
+ auto HasNoUsers = [](Instruction *I) {
+ return I->use_empty() &&
+ (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
+ };
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
@@ -13672,9 +14848,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
- if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator())) {
+ if (HasNoUsers(&*it) &&
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
@@ -13692,8 +14867,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
+ Instruction *Root = getReductionInstr(DT, P, BB, LI);
+ if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -13714,19 +14889,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
- PI && !PostProcessInstructions.contains(PI))
- Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ PI && !IsInPostProcessInstrs(PI))
+ Changed |= vectorizeRootInstruction(nullptr, PI,
P->getIncomingBlock(I), R, TTI);
}
continue;
}
- // Ran into an instruction without users, like terminator, or function call
- // with ignored return value, store. Ignore unused instructions (basing on
- // instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() &&
- (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
- KeyNodes.insert(&*it);
+ if (HasNoUsers(&*it)) {
bool OpsChanged = false;
auto *SI = dyn_cast<StoreInst>(it);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
@@ -13746,16 +14916,16 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *VI = dyn_cast<Instruction>(V);
- VI && !PostProcessInstructions.contains(VI))
+ VI && !IsInPostProcessInstrs(VI))
// Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
}
}
// Start vectorization of post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator());
+ OpsChanged |=
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -13766,8 +14936,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))
- PostProcessInstructions.insert(&*it);
+ if (isa<InsertElementInst, InsertValueInst>(it))
+ PostProcessInserts.insert(&*it);
+ else if (isa<CmpInst>(it))
+ PostProcessCmps.insert(cast<CmpInst>(&*it));
}
return Changed;
@@ -13928,10 +15100,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
- auto Limit = [&R, this](StoreInst *SI) {
- unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
- return R.getMinVF(EltSize);
- };
// Attempt to sort and vectorize each of the store-groups.
for (auto &Pair : Stores) {
@@ -13945,28 +15113,11 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
continue;
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, Limit, StoreSorter, AreCompatibleStores,
+ Pair.second, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},
- /*LimitForRegisterSize=*/false);
+ /*MaxVFOnly=*/false, R);
}
return Changed;
}
-
-char SLPVectorizer::ID = 0;
-
-static const char lv_name[] = "SLP Vectorizer";
-
-INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-
-Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 733d2e1c667b..1271d1424c03 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -95,7 +95,7 @@ class VPRecipeBuilder {
/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
/// decision from \p Range.Start to \p Range.End.
VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
- VFRange &Range) const;
+ VFRange &Range, VPlanPtr &Plan);
/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can. The function should only be called if the cost-model indicates
@@ -136,11 +136,11 @@ public:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+ VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
/// A helper function that computes the predicate of the edge between SRC
/// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);
/// Mark given ingredient for recording its recipe once one is created for
/// it.
@@ -159,19 +159,11 @@ public:
return Ingredient2Recipe[I];
}
- /// Create a replicating region for \p PredRecipe.
- VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
- VPlanPtr &Plan);
-
- /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan);
+ /// Build a VPReplicateRecipe for \p I. If it is predicated, add the mask as
+ /// the last operand. Range.End may be decreased to ensure same recipe behavior
+ /// from \p Range.Start to \p Range.End.
+ VPRecipeOrVPValueTy handleReplication(Instruction *I, VFRange &Range,
+ VPlan &Plan);
/// Add the incoming values from the backedge to reduction & first-order
/// recurrence cross-iteration phis.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index d554f438c804..e81b88fd8099 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
@@ -46,7 +47,10 @@
#include <vector>
using namespace llvm;
+
+namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
+}
#define DEBUG_TYPE "vplan"
@@ -160,8 +164,9 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
}
void VPBlockBase::setPlan(VPlan *ParentPlan) {
- assert(ParentPlan->getEntry() == this &&
- "Can only set plan on its entry block.");
+ assert(
+ (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) &&
+ "Can only set plan on its entry or preheader block.");
Plan = ParentPlan;
}
@@ -209,7 +214,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
}
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
- if (!Def->hasDefiningRecipe())
+ if (Def->isLiveIn())
return Def->getLiveInIRValue();
if (hasScalarValue(Def, Instance)) {
@@ -243,11 +248,19 @@ void VPTransformState::addNewMetadata(Instruction *To,
}
void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
+ // No source instruction to transfer metadata from?
+ if (!From)
+ return;
+
propagateMetadata(To, From);
addNewMetadata(To, From);
}
void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
+ // No source instruction to transfer metadata from?
+ if (!From)
+ return;
+
for (Value *V : To) {
if (Instruction *I = dyn_cast<Instruction>(V))
addMetadata(I, From);
@@ -265,7 +278,7 @@ void VPTransformState::setDebugLocFromInst(const Value *V) {
// When a FSDiscriminator is enabled, we don't need to add the multiply
// factors to the discriminators.
if (DIL && Inst->getFunction()->shouldEmitDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
+ !Inst->isDebugOrPseudoInst() && !EnableFSDiscriminator) {
// FIXME: For scalable vectors, assume vscale=1.
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
@@ -577,7 +590,9 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
#endif
VPlan::~VPlan() {
- clearLiveOuts();
+ for (auto &KV : LiveOuts)
+ delete KV.second;
+ LiveOuts.clear();
if (Entry) {
VPValue DummyValue;
@@ -585,15 +600,23 @@ VPlan::~VPlan() {
Block->dropAllReferences(&DummyValue);
VPBlockBase::deleteCFG(Entry);
+
+ Preheader->dropAllReferences(&DummyValue);
+ delete Preheader;
}
- for (VPValue *VPV : VPValuesToFree)
+ for (VPValue *VPV : VPLiveInsToFree)
delete VPV;
- if (TripCount)
- delete TripCount;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
- for (auto &P : VPExternalDefs)
- delete P.second;
+}
+
+VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
+ VPBasicBlock *Preheader = new VPBasicBlock("ph");
+ VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
+ auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
+ Plan->TripCount =
+ vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
+ return Plan;
}
VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
@@ -609,13 +632,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State,
bool IsEpilogueVectorization) {
-
- // Check if the trip count is needed, and if so build it.
- if (TripCount && TripCount->getNumUsers()) {
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(TripCount, TripCountV, Part);
- }
-
// Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
@@ -636,7 +652,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
// needs to be changed from zero to the value after the main vector loop.
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
if (CanonicalIVStartValue) {
- VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue);
+ VPValue *VPV = getVPValueOrAddLiveIn(CanonicalIVStartValue);
auto *IV = getCanonicalIV();
assert(all_of(IV->users(),
[](const VPUser *U) {
@@ -650,8 +666,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
VPInstruction::CanonicalIVIncrementNUW;
}) &&
"the canonical IV should only be used by its increments or "
- "ScalarIVSteps when "
- "resetting the start value");
+ "ScalarIVSteps when resetting the start value");
IV->setOperand(0, VPV);
}
}
@@ -748,13 +763,25 @@ void VPlan::print(raw_ostream &O) const {
if (VectorTripCount.getNumUsers() > 0) {
O << "\nLive-in ";
VectorTripCount.printAsOperand(O, SlotTracker);
- O << " = vector-trip-count\n";
+ O << " = vector-trip-count";
}
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
O << "\nLive-in ";
BackedgeTakenCount->printAsOperand(O, SlotTracker);
- O << " = backedge-taken count\n";
+ O << " = backedge-taken count";
+ }
+
+ O << "\n";
+ if (TripCount->isLiveIn())
+ O << "Live-in ";
+ TripCount->printAsOperand(O, SlotTracker);
+ O << " = original trip-count";
+ O << "\n";
+
+ if (!getPreheader()->empty()) {
+ O << "\n";
+ getPreheader()->print(O, "", SlotTracker);
}
for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) {
@@ -765,11 +792,7 @@ void VPlan::print(raw_ostream &O) const {
if (!LiveOuts.empty())
O << "\n";
for (const auto &KV : LiveOuts) {
- O << "Live-out ";
- KV.second->getPhi()->printAsOperand(O);
- O << " = ";
- KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
- O << "\n";
+ KV.second->print(O, SlotTracker);
}
O << "}\n";
@@ -882,6 +905,8 @@ void VPlanPrinter::dump() {
OS << "edge [fontname=Courier, fontsize=30]\n";
OS << "compound=true\n";
+ dumpBlock(Plan.getPreheader());
+
for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
dumpBlock(Block);
@@ -1086,26 +1111,27 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
}
void VPSlotTracker::assignSlot(const VPValue *V) {
- assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
+ assert(!Slots.contains(V) && "VPValue already has a slot!");
Slots[V] = NextSlot++;
}
void VPSlotTracker::assignSlots(const VPlan &Plan) {
-
- for (const auto &P : Plan.VPExternalDefs)
- assignSlot(P.second);
-
assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);
+ assignSlots(Plan.getPreheader());
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
for (const VPBasicBlock *VPBB :
VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
- for (const VPRecipeBase &Recipe : *VPBB)
- for (VPValue *Def : Recipe.definedValues())
- assignSlot(Def);
+ assignSlots(VPBB);
+}
+
+void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+ for (const VPRecipeBase &Recipe : *VPBB)
+ for (VPValue *Def : Recipe.definedValues())
+ assignSlot(Def);
}
bool vputils::onlyFirstLaneUsed(VPValue *Def) {
@@ -1115,13 +1141,17 @@ bool vputils::onlyFirstLaneUsed(VPValue *Def) {
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
ScalarEvolution &SE) {
+ if (auto *Expanded = Plan.getSCEVExpansion(Expr))
+ return Expanded;
+ VPValue *Expanded = nullptr;
if (auto *E = dyn_cast<SCEVConstant>(Expr))
- return Plan.getOrAddExternalDef(E->getValue());
- if (auto *E = dyn_cast<SCEVUnknown>(Expr))
- return Plan.getOrAddExternalDef(E->getValue());
-
- VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
- VPExpandSCEVRecipe *Step = new VPExpandSCEVRecipe(Expr, SE);
- Preheader->appendRecipe(Step);
- return Step;
+ Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
+ else if (auto *E = dyn_cast<SCEVUnknown>(Expr))
+ Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
+ else {
+ Expanded = new VPExpandSCEVRecipe(Expr, SE);
+ Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
+ }
+ Plan.addSCEVExpansion(Expr, Expanded);
+ return Expanded;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 986faaf99664..73313465adea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -25,7 +25,6 @@
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -33,11 +32,12 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/IR/Operator.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -47,11 +47,9 @@ namespace llvm {
class BasicBlock;
class DominatorTree;
-class InductionDescriptor;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
-class PredicateScalarEvolution;
class raw_ostream;
class RecurrenceDescriptor;
class SCEV;
@@ -62,6 +60,7 @@ class VPlan;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
+class LoopVersioning;
namespace Intrinsic {
typedef unsigned ID;
@@ -76,16 +75,17 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step);
-const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE);
+const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
+ Loop *CurLoop = nullptr);
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
-/// [1, 9) = {1, 2, 4, 8}
+/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
// A power of 2.
const ElementCount Start;
- // Need not be a power of 2. If End <= Start range is empty.
+ // A power of 2. If End <= Start range is empty.
ElementCount End;
bool isEmpty() const {
@@ -98,6 +98,33 @@ struct VFRange {
"Both Start and End should have the same scalable flag");
assert(isPowerOf2_32(Start.getKnownMinValue()) &&
"Expected Start to be a power of 2");
+ assert(isPowerOf2_32(End.getKnownMinValue()) &&
+ "Expected End to be a power of 2");
+ }
+
+ /// Iterator to iterate over vectorization factors in a VFRange.
+ class iterator
+ : public iterator_facade_base<iterator, std::forward_iterator_tag,
+ ElementCount> {
+ ElementCount VF;
+
+ public:
+ iterator(ElementCount VF) : VF(VF) {}
+
+ bool operator==(const iterator &Other) const { return VF == Other.VF; }
+
+ ElementCount operator*() const { return VF; }
+
+ iterator &operator++() {
+ VF *= 2;
+ return *this;
+ }
+ };
+
+ iterator begin() { return iterator(Start); }
+ iterator end() {
+ assert(isPowerOf2_32(End.getKnownMinValue()));
+ return iterator(End);
}
};
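A minimal sketch of iterating the range above (the helper name is made up; not from the patch):

SmallVector<ElementCount, 4> collectVFs(VFRange &Range) {
  SmallVector<ElementCount, 4> VFs;
  for (ElementCount VF : Range) // visits Start, 2*Start, ... excluding End
    VFs.push_back(VF);          // e.g. [1, 16) yields {1, 2, 4, 8}
  return VFs;
}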
@@ -248,7 +275,7 @@ struct VPTransformState {
}
bool hasAnyVectorValue(VPValue *Def) const {
- return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end();
+ return Data.PerPartOutput.contains(Def);
}
bool hasScalarValue(VPValue *Def, VPIteration Instance) {
@@ -370,10 +397,6 @@ struct VPTransformState {
/// Pointer to the VPlan code is generated for.
VPlan *Plan;
- /// Holds recipes that may generate a poison value that is used after
- /// vectorization, even when their operands are not poison.
- SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
-
/// The loop object for the current parent region, or nullptr.
Loop *CurrentVectorLoop = nullptr;
@@ -382,7 +405,11 @@ struct VPTransformState {
///
/// This is currently only used to add no-alias metadata based on the
/// memchecks. The actual versioning is performed manually.
- std::unique_ptr<LoopVersioning> LVer;
+ LoopVersioning *LVer = nullptr;
+
+ /// Map SCEVs to their expanded values. Populated when executing
+ /// VPExpandSCEVRecipes.
+ DenseMap<const SCEV *, Value *> ExpandedSCEVs;
};
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
@@ -639,6 +666,10 @@ public:
VPLiveOut(PHINode *Phi, VPValue *Op)
: VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
+ static inline bool classof(const VPUser *U) {
+ return U->getVPUserID() == VPUser::VPUserID::LiveOut;
+ }
+
/// Fixup the wrapped LCSSA phi node in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
@@ -654,6 +685,11 @@ public:
}
PHINode *getPhi() const { return Phi; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the VPLiveOut to \p O.
+ void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
+#endif
};
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -790,6 +826,7 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ CalculateTripCountMinusVF,
CanonicalIVIncrement,
CanonicalIVIncrementNUW,
// The next two are similar to the above, but instead increment the
@@ -810,8 +847,10 @@ private:
const std::string Name;
/// Utility method serving execute(): generates a single instance of the
- /// modeled instruction.
- void generateInstruction(VPTransformState &State, unsigned Part);
+ /// modeled instruction. \returns the generated value for \p Part.
+ /// In some cases an existing value is returned rather than a generated
+ /// one.
+ Value *generateInstruction(VPTransformState &State, unsigned Part);
protected:
void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
@@ -892,6 +931,7 @@ public:
default:
return false;
case VPInstruction::ActiveLaneMask:
+ case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrement:
case VPInstruction::CanonicalIVIncrementNUW:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -903,14 +943,169 @@ public:
}
};
+/// Class to record LLVM IR flag for a recipe along with it.
+class VPRecipeWithIRFlags : public VPRecipeBase {
+ enum class OperationType : unsigned char {
+ OverflowingBinOp,
+ PossiblyExactOp,
+ GEPOp,
+ FPMathOp,
+ Other
+ };
+ struct WrapFlagsTy {
+ char HasNUW : 1;
+ char HasNSW : 1;
+ };
+ struct ExactFlagsTy {
+ char IsExact : 1;
+ };
+ struct GEPFlagsTy {
+ char IsInBounds : 1;
+ };
+ struct FastMathFlagsTy {
+ char AllowReassoc : 1;
+ char NoNaNs : 1;
+ char NoInfs : 1;
+ char NoSignedZeros : 1;
+ char AllowReciprocal : 1;
+ char AllowContract : 1;
+ char ApproxFunc : 1;
+ };
+
+ OperationType OpType;
+
+ union {
+ WrapFlagsTy WrapFlags;
+ ExactFlagsTy ExactFlags;
+ GEPFlagsTy GEPFlags;
+ FastMathFlagsTy FMFs;
+ unsigned char AllFlags;
+ };
+
+public:
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands)
+ : VPRecipeBase(SC, Operands) {
+ OpType = OperationType::Other;
+ AllFlags = 0;
+ }
+
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands,
+ Instruction &I)
+ : VPRecipeWithIRFlags(SC, Operands) {
+ if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
+ OpType = OperationType::OverflowingBinOp;
+ WrapFlags.HasNUW = Op->hasNoUnsignedWrap();
+ WrapFlags.HasNSW = Op->hasNoSignedWrap();
+ } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
+ OpType = OperationType::PossiblyExactOp;
+ ExactFlags.IsExact = Op->isExact();
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ OpType = OperationType::GEPOp;
+ GEPFlags.IsInBounds = GEP->isInBounds();
+ } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
+ OpType = OperationType::FPMathOp;
+ FastMathFlags FMF = Op->getFastMathFlags();
+ FMFs.AllowReassoc = FMF.allowReassoc();
+ FMFs.NoNaNs = FMF.noNaNs();
+ FMFs.NoInfs = FMF.noInfs();
+ FMFs.NoSignedZeros = FMF.noSignedZeros();
+ FMFs.AllowReciprocal = FMF.allowReciprocal();
+ FMFs.AllowContract = FMF.allowContract();
+ FMFs.ApproxFunc = FMF.approxFunc();
+ }
+ }
+
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
+ R->getVPDefID() == VPRecipeBase::VPReplicateSC;
+ }
+
+ /// Drop all poison-generating flags.
+ void dropPoisonGeneratingFlags() {
+ // NOTE: This needs to be kept in-sync with
+ // Instruction::dropPoisonGeneratingFlags.
+ switch (OpType) {
+ case OperationType::OverflowingBinOp:
+ WrapFlags.HasNUW = false;
+ WrapFlags.HasNSW = false;
+ break;
+ case OperationType::PossiblyExactOp:
+ ExactFlags.IsExact = false;
+ break;
+ case OperationType::GEPOp:
+ GEPFlags.IsInBounds = false;
+ break;
+ case OperationType::FPMathOp:
+ FMFs.NoNaNs = false;
+ FMFs.NoInfs = false;
+ break;
+ case OperationType::Other:
+ break;
+ }
+ }
+
+ /// Set the IR flags for \p I.
+ void setFlags(Instruction *I) const {
+ switch (OpType) {
+ case OperationType::OverflowingBinOp:
+ I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
+ I->setHasNoSignedWrap(WrapFlags.HasNSW);
+ break;
+ case OperationType::PossiblyExactOp:
+ I->setIsExact(ExactFlags.IsExact);
+ break;
+ case OperationType::GEPOp:
+ cast<GetElementPtrInst>(I)->setIsInBounds(GEPFlags.IsInBounds);
+ break;
+ case OperationType::FPMathOp:
+ I->setHasAllowReassoc(FMFs.AllowReassoc);
+ I->setHasNoNaNs(FMFs.NoNaNs);
+ I->setHasNoInfs(FMFs.NoInfs);
+ I->setHasNoSignedZeros(FMFs.NoSignedZeros);
+ I->setHasAllowReciprocal(FMFs.AllowReciprocal);
+ I->setHasAllowContract(FMFs.AllowContract);
+ I->setHasApproxFunc(FMFs.ApproxFunc);
+ break;
+ case OperationType::Other:
+ break;
+ }
+ }
+
+ bool isInBounds() const {
+ assert(OpType == OperationType::GEPOp &&
+ "recipe doesn't have inbounds flag");
+ return GEPFlags.IsInBounds;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ FastMathFlags getFastMathFlags() const {
+ FastMathFlags Res;
+ Res.setAllowReassoc(FMFs.AllowReassoc);
+ Res.setNoNaNs(FMFs.NoNaNs);
+ Res.setNoInfs(FMFs.NoInfs);
+ Res.setNoSignedZeros(FMFs.NoSignedZeros);
+ Res.setAllowReciprocal(FMFs.AllowReciprocal);
+ Res.setAllowContract(FMFs.AllowContract);
+ Res.setApproxFunc(FMFs.ApproxFunc);
+ return Res;
+ }
+
+ void printFlags(raw_ostream &O) const;
+#endif
+};
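A hedged usage sketch (assumes an enclosing VPlan `Plan` and a scalar instruction `I`; `widenWithFlags` is hypothetical and not the patch's recipe-builder code) showing how the flag snapshot above is meant to flow, using the VPWidenRecipe declared next:

VPWidenRecipe *widenWithFlags(Instruction &I, VPlan &Plan) {
  // The VPRecipeWithIRFlags constructor snapshots nuw/nsw, exact, inbounds
  // or fast-math flags from I.
  auto *R = new VPWidenRecipe(I, Plan.mapToVPValues(I.operands()));
  // Relax poison-generating flags if masking may make a previously
  // poison-free path reachable.
  R->dropPoisonGeneratingFlags();
  return R; // During execute(), setFlags(NewI) replays the recorded flags.
}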
+
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
-class VPWidenRecipe : public VPRecipeBase, public VPValue {
+class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeBase(VPDef::VPWidenSC, Operands), VPValue(this, &I) {}
+ : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I) {}
~VPWidenRecipe() override = default;
@@ -926,18 +1121,62 @@ public:
#endif
};
+/// VPWidenCastRecipe is a recipe to create vector cast instructions.
+class VPWidenCastRecipe : public VPRecipeBase, public VPValue {
+ /// Cast instruction opcode.
+ Instruction::CastOps Opcode;
+
+ /// Result type for the cast.
+ Type *ResultTy;
+
+public:
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ CastInst *UI = nullptr)
+ : VPRecipeBase(VPDef::VPWidenCastSC, Op), VPValue(this, UI),
+ Opcode(Opcode), ResultTy(ResultTy) {
+ assert((!UI || UI->getOpcode() == Opcode) &&
+ "opcode of underlying cast doesn't match");
+ assert((!UI || UI->getType() == ResultTy) &&
+ "result type of underlying cast doesn't match");
+ }
+
+ ~VPWidenCastRecipe() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
+
+ /// Produce widened copies of the cast.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Instruction::CastOps getOpcode() const { return Opcode; }
+
+ /// Returns the result type of the cast.
+ Type *getResultType() const { return ResultTy; }
+};
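For illustration (hypothetical helper; not part of the patch), a cast recipe can be created directly from an existing scalar cast:

VPWidenCastRecipe *widenCast(CastInst *CI, VPValue *Op) {
  // Keep the underlying CastInst so metadata and debug info can be reused.
  return new VPWidenCastRecipe(CI->getOpcode(), Op, CI->getType(), CI);
}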
+
/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
/// ID of the vector intrinsic to call when widening the call. If set the
/// Intrinsic::not_intrinsic, a library call will be used instead.
Intrinsic::ID VectorIntrinsicID;
+ /// If this recipe represents a library call, Variant stores a pointer to
+ /// the chosen function. There is a 1:1 mapping between a given VF and the
+ /// chosen vectorized variant, so there will be a different vplan for each
+ /// VF with a valid variant.
+ Function *Variant;
public:
template <typename IterT>
VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
- Intrinsic::ID VectorIntrinsicID)
+ Intrinsic::ID VectorIntrinsicID,
+ Function *Variant = nullptr)
: VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),
- VectorIntrinsicID(VectorIntrinsicID) {}
+ VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}
~VPWidenCallRecipe() override = default;
@@ -954,17 +1193,10 @@ public:
};
/// A recipe for widening select instructions.
-class VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
-
- /// Is the condition of the select loop invariant?
- bool InvariantCond;
-
-public:
+struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
template <typename IterT>
- VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
- bool InvariantCond)
- : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I),
- InvariantCond(InvariantCond) {}
+ VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
+ : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I) {}
~VPWidenSelectRecipe() override = default;
@@ -978,29 +1210,38 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ VPValue *getCond() const {
+ return getOperand(0);
+ }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideVectorRegions();
+ }
};
/// A recipe for handling GEP instructions.
-class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
- bool IsPtrLoopInvariant;
- SmallBitVector IsIndexLoopInvariant;
+class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
+ bool isPointerLoopInvariant() const {
+ return getOperand(0)->isDefinedOutsideVectorRegions();
+ }
+
+ bool isIndexLoopInvariant(unsigned I) const {
+ return getOperand(I + 1)->isDefinedOutsideVectorRegions();
+ }
+
+ bool areAllOperandsInvariant() const {
+ return all_of(operands(), [](VPValue *Op) {
+ return Op->isDefinedOutsideVectorRegions();
+ });
+ }
public:
template <typename IterT>
VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
- : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP),
- IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
+ : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP),
+ VPValue(this, GEP) {}
- template <typename IterT>
- VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
- Loop *OrigLoop)
- : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP),
- IsIndexLoopInvariant(GEP->getNumIndices(), false) {
- IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
- for (auto Index : enumerate(GEP->indices()))
- IsIndexLoopInvariant[Index.index()] =
- OrigLoop->isLoopInvariant(Index.value().get());
- }
~VPWidenGEPRecipe() override = default;
VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
@@ -1015,78 +1256,6 @@ public:
#endif
};
-/// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
- PHINode *IV;
- const InductionDescriptor &IndDesc;
- bool NeedsVectorIV;
-
-public:
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
- const InductionDescriptor &IndDesc,
- bool NeedsVectorIV)
- : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}),
- VPValue(this, IV), IV(IV), IndDesc(IndDesc),
- NeedsVectorIV(NeedsVectorIV) {}
-
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
- const InductionDescriptor &IndDesc,
- TruncInst *Trunc, bool NeedsVectorIV)
- : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}),
- VPValue(this, Trunc), IV(IV), IndDesc(IndDesc),
- NeedsVectorIV(NeedsVectorIV) {}
-
- ~VPWidenIntOrFpInductionRecipe() override = default;
-
- VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
-
- /// Generate the vectorized and scalarized versions of the phi node as
- /// needed by their users.
- void execute(VPTransformState &State) override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// Returns the start value of the induction.
- VPValue *getStartValue() { return getOperand(0); }
- const VPValue *getStartValue() const { return getOperand(0); }
-
- /// Returns the step value of the induction.
- VPValue *getStepValue() { return getOperand(1); }
- const VPValue *getStepValue() const { return getOperand(1); }
-
- /// Returns the first defined value as TruncInst, if it is one or nullptr
- /// otherwise.
- TruncInst *getTruncInst() {
- return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
- }
- const TruncInst *getTruncInst() const {
- return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
- }
-
- PHINode *getPHINode() { return IV; }
-
- /// Returns the induction descriptor for the recipe.
- const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-
- /// Returns true if the induction is canonical, i.e. starting at 0 and
- /// incremented by UF * VF (= the original IV is incremented by 1).
- bool isCanonical() const;
-
- /// Returns the scalar type of the induction.
- const Type *getScalarType() const {
- const TruncInst *TruncI = getTruncInst();
- return TruncI ? TruncI->getType() : IV->getType();
- }
-
- /// Returns true if a vector phi needs to be created for the induction.
- bool needsVectorIV() const { return NeedsVectorIV; }
-};
-
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
@@ -1112,9 +1281,9 @@ public:
/// per-lane based on the canonical induction.
class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
protected:
- VPHeaderPHIRecipe(unsigned char VPDefID, PHINode *Phi,
+ VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
VPValue *Start = nullptr)
- : VPRecipeBase(VPDefID, {}), VPValue(this, Phi) {
+ : VPRecipeBase(VPDefID, {}), VPValue(this, UnderlyingInstr) {
if (Start)
addOperand(Start);
}
@@ -1125,12 +1294,12 @@ public:
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *B) {
return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC &&
- B->getVPDefID() <= VPDef::VPLastPHISC;
+ B->getVPDefID() <= VPDef::VPLastHeaderPHISC;
}
static inline bool classof(const VPValue *V) {
auto *B = V->getDefiningRecipe();
return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
- B->getVPDefID() <= VPRecipeBase::VPLastPHISC;
+ B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC;
}
/// Generate the phi nodes.
@@ -1154,17 +1323,92 @@ public:
void setStartValue(VPValue *V) { setOperand(0, V); }
/// Returns the incoming value from the loop backedge.
- VPValue *getBackedgeValue() {
+ virtual VPValue *getBackedgeValue() {
return getOperand(1);
}
/// Returns the backedge value as a recipe. The backedge value is guaranteed
/// to be a recipe.
- VPRecipeBase &getBackedgeRecipe() {
+ virtual VPRecipeBase &getBackedgeRecipe() {
return *getBackedgeValue()->getDefiningRecipe();
}
};
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector values.
+class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
+ PHINode *IV;
+ TruncInst *Trunc;
+ const InductionDescriptor &IndDesc;
+
+public:
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
+ const InductionDescriptor &IndDesc)
+ : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
+ Trunc(nullptr), IndDesc(IndDesc) {
+ addOperand(Step);
+ }
+
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
+ const InductionDescriptor &IndDesc,
+ TruncInst *Trunc)
+ : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
+ IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
+ addOperand(Step);
+ }
+
+ ~VPWidenIntOrFpInductionRecipe() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ VPValue *getBackedgeValue() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+
+ VPRecipeBase &getBackedgeRecipe() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+
+ /// Returns the step value of the induction.
+ VPValue *getStepValue() { return getOperand(1); }
+ const VPValue *getStepValue() const { return getOperand(1); }
+
+ /// Returns the first defined value as TruncInst, if it is one or nullptr
+ /// otherwise.
+ TruncInst *getTruncInst() { return Trunc; }
+ const TruncInst *getTruncInst() const { return Trunc; }
+
+ PHINode *getPHINode() { return IV; }
+
+ /// Returns the induction descriptor for the recipe.
+ const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
+
+ /// Returns true if the induction is canonical, i.e. starting at 0 and
+ /// incremented by UF * VF (= the original IV is incremented by 1).
+ bool isCanonical() const;
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ return Trunc ? Trunc->getType() : IV->getType();
+ }
+};
+
class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
const InductionDescriptor &IndDesc;
@@ -1374,12 +1618,20 @@ public:
class VPInterleaveRecipe : public VPRecipeBase {
const InterleaveGroup<Instruction> *IG;
+ /// Indicates if the interleave group is in a conditional block and requires a
+ /// mask.
bool HasMask = false;
+ /// Indicates if gaps between members of the group need to be masked out or if
+ /// unused gaps can be loaded speculatively.
+ bool NeedsMaskForGaps = false;
+
public:
VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues, VPValue *Mask)
- : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG) {
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps)
+ : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
+ NeedsMaskForGaps(NeedsMaskForGaps) {
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i)) {
if (I->getType()->isVoidTy())
@@ -1490,28 +1742,21 @@ public:
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform, only one copy, per lane zero, will be generated.
-class VPReplicateRecipe : public VPRecipeBase, public VPValue {
+class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue {
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
/// Indicator if the replicas are also predicated.
bool IsPredicated;
- /// Indicator if the scalar values should also be packed into a vector.
- bool AlsoPack;
-
public:
template <typename IterT>
VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
- bool IsUniform, bool IsPredicated = false)
- : VPRecipeBase(VPDef::VPReplicateSC, Operands), VPValue(this, I),
- IsUniform(IsUniform), IsPredicated(IsPredicated) {
- // Retain the previous behavior of predicateInstructions(), where an
- // insert-element of a predicated instruction got hoisted into the
- // predicated basic block iff it was its only user. This is achieved by
- // having predicated instructions also pack their values into a vector by
- // default unless they have a replicated user which uses their scalar value.
- AlsoPack = IsPredicated && !I->use_empty();
+ bool IsUniform, VPValue *Mask = nullptr)
+ : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
+ VPValue(this, I), IsUniform(IsUniform), IsPredicated(Mask) {
+ if (Mask)
+ addOperand(Mask);
}
~VPReplicateRecipe() override = default;
@@ -1523,8 +1768,6 @@ public:
/// the \p State.
void execute(VPTransformState &State) override;
- void setAlsoPack(bool Pack) { AlsoPack = Pack; }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -1533,8 +1776,6 @@ public:
bool isUniform() const { return IsUniform; }
- bool isPacked() const { return AlsoPack; }
-
bool isPredicated() const { return IsPredicated; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
@@ -1550,6 +1791,17 @@ public:
"Op must be an operand of the recipe");
return true;
}
+
+ /// Returns true if the recipe is used by a widened recipe via an intervening
+ /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
+ /// in a vector.
+ bool shouldPack() const;
+
+ /// Return the mask of a predicated VPReplicateRecipe.
+ VPValue *getMask() {
+    assert(isPredicated() && "Trying to get the mask of an unpredicated recipe");
+ return getOperand(getNumOperands() - 1);
+ }
};
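A small sketch (illustrative names; assumes the mask was computed by createBlockInMask) of how a predicated replicate recipe now carries its mask as the trailing operand:

VPReplicateRecipe *replicatePredicated(Instruction *I, VPlan &Plan,
                                       VPValue *BlockInMask) {
  auto *R = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
                                  /*IsUniform=*/false, BlockInMask);
  assert(R->isPredicated() && R->getMask() == BlockInMask);
  return R;
}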
/// A recipe for generating conditional branches on the bits of a mask.
@@ -1791,9 +2043,11 @@ public:
return true;
}
- /// Check if the induction described by \p ID is canonical, i.e. has the same
- /// start, step (of 1), and type as the canonical IV.
- bool isCanonical(const InductionDescriptor &ID, Type *Ty) const;
+ /// Check if the induction described by \p Kind, \p Start and \p Step is
+ /// canonical, i.e. has the same start, step (of 1), and type as the
+ /// canonical IV.
+ bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
+ VPValue *Step, Type *Ty) const;
};
/// A recipe for generating the active lane mask for the vector loop that is
@@ -2156,13 +2410,19 @@ public:
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
-/// VPBlock.
+/// VPBasicBlock.
class VPlan {
friend class VPlanPrinter;
friend class VPSlotTracker;
- /// Hold the single entry to the Hierarchical CFG of the VPlan.
- VPBlockBase *Entry;
+ /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
+ /// preheader of the vector loop.
+ VPBasicBlock *Entry;
+
+ /// VPBasicBlock corresponding to the original preheader. Used to place
+ /// VPExpandSCEV recipes for expressions used during skeleton creation and the
+ /// rest of VPlan execution.
+ VPBasicBlock *Preheader;
/// Holds the VFs applicable to this VPlan.
SmallSetVector<ElementCount, 2> VFs;
@@ -2174,10 +2434,6 @@ class VPlan {
/// Holds the name of the VPlan, for printing.
std::string Name;
- /// Holds all the external definitions created for this VPlan. External
- /// definitions must be immutable and hold a pointer to their underlying IR.
- DenseMap<Value *, VPValue *> VPExternalDefs;
-
/// Represents the trip count of the original loop, for folding
/// the tail.
VPValue *TripCount = nullptr;
@@ -2193,9 +2449,9 @@ class VPlan {
/// VPlan.
Value2VPValueTy Value2VPValue;
- /// Contains all VPValues that been allocated by addVPValue directly and need
- /// to be free when the plan's destructor is called.
- SmallVector<VPValue *, 16> VPValuesToFree;
+ /// Contains all the external definitions created for this VPlan. External
+ /// definitions are VPValues that hold a pointer to their underlying IR.
+ SmallVector<VPValue *, 16> VPLiveInsToFree;
/// Indicates whether it is safe use the Value2VPValue mapping or if the
/// mapping cannot be used any longer, because it is stale.
@@ -2204,14 +2460,41 @@ class VPlan {
/// Values used outside the plan.
MapVector<PHINode *, VPLiveOut *> LiveOuts;
+ /// Mapping from SCEVs to the VPValues representing their expansions.
+ /// NOTE: This mapping is temporary and will be removed once all users have
+ /// been modeled in VPlan directly.
+ DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
+
public:
- VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
- if (Entry)
- Entry->setPlan(this);
+ /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
+ /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to
+ /// be disconnected, as the bypass blocks between them are not yet modeled in
+ /// VPlan.
+ VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry)
+ : VPlan(Preheader, Entry) {
+ TripCount = TC;
+ }
+
+ /// Construct a VPlan with original preheader \p Preheader and \p Entry to
+ /// the plan. At the moment, \p Preheader and \p Entry need to be
+ /// disconnected, as the bypass blocks between them are not yet modeled in
+ /// VPlan.
+ VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry)
+ : Entry(Entry), Preheader(Preheader) {
+ Entry->setPlan(this);
+ Preheader->setPlan(this);
+ assert(Preheader->getNumSuccessors() == 0 &&
+ Preheader->getNumPredecessors() == 0 &&
+ "preheader must be disconnected");
}
~VPlan();
+ /// Create an initial VPlan with preheader and entry blocks. Creates a
+ /// VPExpandSCEVRecipe for \p TripCount and uses it as plan's trip count.
+ static VPlanPtr createInitialVPlan(const SCEV *TripCount,
+ ScalarEvolution &PSE);
+
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
Value *CanonicalIVStartValue, VPTransformState &State,
@@ -2220,19 +2503,12 @@ public:
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
- VPBlockBase *getEntry() { return Entry; }
- const VPBlockBase *getEntry() const { return Entry; }
-
- VPBlockBase *setEntry(VPBlockBase *Block) {
- Entry = Block;
- Block->setPlan(this);
- return Entry;
- }
+ VPBasicBlock *getEntry() { return Entry; }
+ const VPBasicBlock *getEntry() const { return Entry; }
/// The trip count of the original loop.
- VPValue *getOrCreateTripCount() {
- if (!TripCount)
- TripCount = new VPValue();
+ VPValue *getTripCount() const {
+ assert(TripCount && "trip count needs to be set before accessing it");
return TripCount;
}
@@ -2275,50 +2551,35 @@ public:
void setName(const Twine &newName) { Name = newName.str(); }
- /// Get the existing or add a new external definition for \p V.
- VPValue *getOrAddExternalDef(Value *V) {
- auto I = VPExternalDefs.insert({V, nullptr});
- if (I.second)
- I.first->second = new VPValue(V);
- return I.first->second;
- }
-
- void addVPValue(Value *V) {
- assert(Value2VPValueEnabled &&
- "IR value to VPValue mapping may be out of date!");
- assert(V && "Trying to add a null Value to VPlan");
- assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
- VPValue *VPV = new VPValue(V);
- Value2VPValue[V] = VPV;
- VPValuesToFree.push_back(VPV);
- }
-
void addVPValue(Value *V, VPValue *VPV) {
- assert(Value2VPValueEnabled && "Value2VPValue mapping may be out of date!");
+ assert((Value2VPValueEnabled || VPV->isLiveIn()) &&
+ "Value2VPValue mapping may be out of date!");
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
Value2VPValue[V] = VPV;
}
/// Returns the VPValue for \p V. \p OverrideAllowed can be used to disable
- /// checking whether it is safe to query VPValues using IR Values.
+ /// checking whether it is safe to query VPValues using IR Values.
VPValue *getVPValue(Value *V, bool OverrideAllowed = false) {
- assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
- "Value2VPValue mapping may be out of date!");
assert(V && "Trying to get the VPValue of a null Value");
assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+ assert((Value2VPValueEnabled || OverrideAllowed ||
+ Value2VPValue[V]->isLiveIn()) &&
+ "Value2VPValue mapping may be out of date!");
return Value2VPValue[V];
}
- /// Gets the VPValue or adds a new one (if none exists yet) for \p V. \p
- /// OverrideAllowed can be used to disable checking whether it is safe to
- /// query VPValues using IR Values.
- VPValue *getOrAddVPValue(Value *V, bool OverrideAllowed = false) {
- assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
- "Value2VPValue mapping may be out of date!");
+ /// Gets the VPValue for \p V or adds a new live-in (if none exists yet) for
+ /// \p V.
+ VPValue *getVPValueOrAddLiveIn(Value *V) {
assert(V && "Trying to get or add the VPValue of a null Value");
- if (!Value2VPValue.count(V))
- addVPValue(V);
+ if (!Value2VPValue.count(V)) {
+ VPValue *VPV = new VPValue(V);
+ VPLiveInsToFree.push_back(VPV);
+ addVPValue(V, VPV);
+ }
+
return getVPValue(V);
}
@@ -2344,7 +2605,7 @@ public:
iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
mapToVPValues(User::op_range Operands) {
std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
- return getOrAddVPValue(Op);
+ return getVPValueOrAddLiveIn(Op);
};
return map_range(Operands, Fn);
}
@@ -2373,12 +2634,6 @@ public:
void addLiveOut(PHINode *PN, VPValue *V);
- void clearLiveOuts() {
- for (auto &KV : LiveOuts)
- delete KV.second;
- LiveOuts.clear();
- }
-
void removeLiveOut(PHINode *PN) {
delete LiveOuts[PN];
LiveOuts.erase(PN);
@@ -2388,6 +2643,19 @@ public:
return LiveOuts;
}
+ VPValue *getSCEVExpansion(const SCEV *S) const {
+ return SCEVToExpansion.lookup(S);
+ }
+
+ void addSCEVExpansion(const SCEV *S, VPValue *V) {
+ assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
+ SCEVToExpansion[S] = V;
+ }
+
+ /// \return The block corresponding to the original preheader.
+ VPBasicBlock *getPreheader() { return Preheader; }
+ const VPBasicBlock *getPreheader() const { return Preheader; }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -2709,6 +2977,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
assert(Def && "Must have definition for value defined inside vector region");
if (auto Rep = dyn_cast<VPReplicateRecipe>(Def))
return Rep->isUniform();
+ if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
+ return all_of(GEP->operands(), isUniformAfterVectorization);
return false;
}
} // end namespace vputils
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index f790f7e73e11..89e2e7514dac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -13,6 +13,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H
#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 952ce72e36c1..f6e3a2a16db8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -73,9 +73,8 @@ public:
PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
: TheLoop(Lp), LI(LI), Plan(P) {}
- /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected
- /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG.
- VPBasicBlock *buildPlainCFG();
+ /// Build plain CFG for TheLoop and connect it to Plan's entry.
+ void buildPlainCFG();
};
} // anonymous namespace
@@ -196,7 +195,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
// A and B: Create VPValue and add it to the pool of external definitions and
// to the Value->VPValue map.
- VPValue *NewVPVal = Plan.getOrAddExternalDef(IRVal);
+ VPValue *NewVPVal = Plan.getVPValueOrAddLiveIn(IRVal);
IRDef2VPValue[IRVal] = NewVPVal;
return NewVPVal;
}
@@ -254,7 +253,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-VPBasicBlock *PlainCFGBuilder::buildPlainCFG() {
+void PlainCFGBuilder::buildPlainCFG() {
// 1. Scan the body of the loop in a topological order to visit each basic
// block after having visited its predecessor basic blocks. Create a VPBB for
// each BB and link it to its successor and predecessor VPBBs. Note that
@@ -267,12 +266,13 @@ VPBasicBlock *PlainCFGBuilder::buildPlainCFG() {
BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
"Unexpected loop preheader");
- VPBasicBlock *ThePreheaderVPBB = getOrCreateVPBB(ThePreheaderBB);
+ VPBasicBlock *ThePreheaderVPBB = Plan.getEntry();
+ BB2VPBB[ThePreheaderBB] = ThePreheaderVPBB;
ThePreheaderVPBB->setName("vector.ph");
for (auto &I : *ThePreheaderBB) {
if (I.getType()->isVoidTy())
continue;
- IRDef2VPValue[&I] = Plan.getOrAddExternalDef(&I);
+ IRDef2VPValue[&I] = Plan.getVPValueOrAddLiveIn(&I);
}
// Create empty VPBB for Loop H so that we can link PH->H.
VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
@@ -371,20 +371,17 @@ VPBasicBlock *PlainCFGBuilder::buildPlainCFG() {
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
// VPlan operands.
fixPhiNodes();
-
- return ThePreheaderVPBB;
}
-VPBasicBlock *VPlanHCFGBuilder::buildPlainCFG() {
+void VPlanHCFGBuilder::buildPlainCFG() {
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
- return PCFGBuilder.buildPlainCFG();
+ PCFGBuilder.buildPlainCFG();
}
// Public interface to build a H-CFG.
void VPlanHCFGBuilder::buildHierarchicalCFG() {
- // Build Top Region enclosing the plain CFG and set it as VPlan entry.
- VPBasicBlock *EntryVPBB = buildPlainCFG();
- Plan.setEntry(EntryVPBB);
+ // Build Top Region enclosing the plain CFG.
+ buildPlainCFG();
LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 2d52990af268..299ae36155cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -57,9 +57,8 @@ private:
// are introduced.
VPDominatorTree VPDomTree;
- /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected
- /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG.
- VPBasicBlock *buildPlainCFG();
+ /// Build plain CFG for TheLoop and connect it to Plan's entry.
+ void buildPlainCFG();
public:
VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4e9be35001ad..26c309eed800 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -34,7 +34,9 @@ using namespace llvm;
using VectorParts = SmallVector<Value *, 2>;
+namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
+}
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -50,14 +52,16 @@ bool VPRecipeBase::mayWriteToMemory() const {
->mayWriteToMemory();
case VPBranchOnMaskSC:
case VPScalarIVStepsSC:
+ case VPPredInstPHISC:
return false;
- case VPWidenIntOrFpInductionSC:
+ case VPBlendSC:
+ case VPReductionSC:
case VPWidenCanonicalIVSC:
+ case VPWidenCastSC:
+ case VPWidenGEPSC:
+ case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
- case VPBlendSC:
case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -82,14 +86,16 @@ bool VPRecipeBase::mayReadFromMemory() const {
->mayReadFromMemory();
case VPBranchOnMaskSC:
case VPScalarIVStepsSC:
+ case VPPredInstPHISC:
return false;
- case VPWidenIntOrFpInductionSC:
+ case VPBlendSC:
+ case VPReductionSC:
case VPWidenCanonicalIVSC:
+ case VPWidenCastSC:
+ case VPWidenGEPSC:
+ case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
- case VPBlendSC:
case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -108,16 +114,20 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPPredInstPHISC:
return false;
- case VPWidenIntOrFpInductionSC:
- case VPWidenPointerInductionSC:
+ case VPWidenCallSC:
+ return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
+ ->mayHaveSideEffects();
+ case VPBlendSC:
+ case VPReductionSC:
+ case VPScalarIVStepsSC:
case VPWidenCanonicalIVSC:
+ case VPWidenCastSC:
+ case VPWidenGEPSC:
+ case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
- case VPBlendSC:
+ case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
- case VPWidenSelectSC:
- case VPScalarIVStepsSC: {
+ case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -125,6 +135,13 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPWidenMemoryInstructionSC:
+ assert(cast<VPWidenMemoryInstructionRecipe>(this)
+ ->getIngredient()
+ .mayHaveSideEffects() == mayWriteToMemory() &&
+ "mayHaveSideffects result for ingredient differs from this "
+ "implementation");
+ return mayWriteToMemory();
case VPReplicateSC: {
auto *R = cast<VPReplicateRecipe>(this);
return R->getUnderlyingInstr()->mayHaveSideEffects();
@@ -143,6 +160,16 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
State.Builder.GetInsertBlock());
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
+ O << "Live-out ";
+ getPhi()->printAsOperand(O);
+ O << " = ";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ O << "\n";
+}
+#endif
+
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
assert(!Parent && "Recipe already in some VPBasicBlock");
assert(InsertPos->getParent() &&
@@ -189,55 +216,44 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
-void VPInstruction::generateInstruction(VPTransformState &State,
- unsigned Part) {
+Value *VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
IRBuilderBase &Builder = State.Builder;
Builder.SetCurrentDebugLocation(DL);
if (Instruction::isBinaryOp(getOpcode())) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
- Value *V =
- Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
- State.set(this, V, Part);
- return;
+ return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
}
switch (getOpcode()) {
case VPInstruction::Not: {
Value *A = State.get(getOperand(0), Part);
- Value *V = Builder.CreateNot(A, Name);
- State.set(this, V, Part);
- break;
+ return Builder.CreateNot(A, Name);
}
case VPInstruction::ICmpULE: {
Value *IV = State.get(getOperand(0), Part);
Value *TC = State.get(getOperand(1), Part);
- Value *V = Builder.CreateICmpULE(IV, TC, Name);
- State.set(this, V, Part);
- break;
+ return Builder.CreateICmpULE(IV, TC, Name);
}
case Instruction::Select: {
Value *Cond = State.get(getOperand(0), Part);
Value *Op1 = State.get(getOperand(1), Part);
Value *Op2 = State.get(getOperand(2), Part);
- Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name);
- State.set(this, V, Part);
- break;
+ return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
// Get the original loop tripcount.
- Value *ScalarTC = State.get(getOperand(1), Part);
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = VectorType::get(Int1Ty, State.VF);
- Instruction *Call = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
- {VIVElem0, ScalarTC}, nullptr, Name);
- State.set(this, Call, Part);
- break;
+ return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+ {PredTy, ScalarTC->getType()},
+ {VIVElem0, ScalarTC}, nullptr, Name);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
@@ -255,18 +271,22 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// For the first part, use the recurrence phi (v1), otherwise v2.
auto *V1 = State.get(getOperand(0), 0);
Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
- if (!PartMinus1->getType()->isVectorTy()) {
- State.set(this, PartMinus1, Part);
- } else {
- Value *V2 = State.get(getOperand(1), Part);
- State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name),
- Part);
- }
- break;
+ if (!PartMinus1->getType()->isVectorTy())
+ return PartMinus1;
+ Value *V2 = State.get(getOperand(1), Part);
+ return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
+ }
+ case VPInstruction::CalculateTripCountMinusVF: {
+ Value *ScalarTC = State.get(getOperand(0), {0, 0});
+ Value *Step =
+ createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
+ Value *Sub = Builder.CreateSub(ScalarTC, Step);
+ Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
+ Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
+ return Builder.CreateSelect(Cmp, Sub, Zero);
}
case VPInstruction::CanonicalIVIncrement:
case VPInstruction::CanonicalIVIncrementNUW: {
- Value *Next = nullptr;
if (Part == 0) {
bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
auto *Phi = State.get(getOperand(0), 0);
@@ -274,34 +294,26 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// elements) times the unroll factor (num of SIMD instructions).
Value *Step =
createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
- Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
- } else {
- Next = State.get(this, 0);
+ return Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
}
-
- State.set(this, Next, Part);
- break;
+ return State.get(this, 0);
}
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::CanonicalIVIncrementForPartNUW: {
bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
- if (Part == 0) {
- State.set(this, IV, Part);
- break;
- }
+ if (Part == 0)
+ return IV;
// The canonical IV is incremented by the vectorization factor (num of SIMD
// elements) times the unroll part.
Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
- Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false);
- State.set(this, Next, Part);
- break;
+ return Builder.CreateAdd(IV, Step, Name, IsNUW, false);
}
case VPInstruction::BranchOnCond: {
if (Part != 0)
- break;
+ return nullptr;
Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
VPRegionBlock *ParentRegion = getParent()->getParent();
@@ -318,11 +330,11 @@ void VPInstruction::generateInstruction(VPTransformState &State,
CondBr->setSuccessor(0, nullptr);
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
- break;
+ return CondBr;
}
case VPInstruction::BranchOnCount: {
if (Part != 0)
- break;
+ return nullptr;
// First create the compare.
Value *IV = State.get(getOperand(0), Part);
Value *TC = State.get(getOperand(1), Part);
@@ -342,7 +354,7 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.CFG.VPBB2IRBB[Header]);
CondBr->setSuccessor(0, nullptr);
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
- break;
+ return CondBr;
}
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -353,8 +365,13 @@ void VPInstruction::execute(VPTransformState &State) {
assert(!State.Instance && "VPInstruction executing an Instance");
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
State.Builder.setFastMathFlags(FMF);
- for (unsigned Part = 0; Part < State.UF; ++Part)
- generateInstruction(State, Part);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *GeneratedValue = generateInstruction(State, Part);
+ if (!hasResult())
+ continue;
+ assert(GeneratedValue && "generateInstruction must produce a value");
+ State.set(this, GeneratedValue, Part);
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
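With generateInstruction now returning the generated value, execute() owns the per-part bookkeeping: it records one value per unroll part, but only for opcodes that actually produce a result (branches return nullptr and are skipped). A simplified standalone sketch of that control flow, using toy types in place of VPTransformState:

#include <cassert>
#include <optional>
#include <vector>

struct ToyState {
  std::vector<int> PerPart; // one generated value per unroll part
  void set(unsigned Part, int V) { PerPart[Part] = V; }
};

// Stand-in for generateInstruction: value-less opcodes produce nothing.
static std::optional<int> generatePart(unsigned Part, bool HasResult) {
  if (!HasResult)
    return std::nullopt;
  return static_cast<int>(Part) * 10; // placeholder per-part computation
}

static void execute(ToyState &S, unsigned UF, bool HasResult) {
  S.PerPart.assign(UF, 0);
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::optional<int> V = generatePart(Part, HasResult);
    if (!HasResult)
      continue; // e.g. branch-on-cond: nothing to record
    assert(V && "a value-producing opcode must return a value");
    S.set(Part, *V);
  }
}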
@@ -400,6 +417,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCond:
O << "branch-on-cond";
break;
+ case VPInstruction::CalculateTripCountMinusVF:
+ O << "TC > VF ? TC - VF : 0";
+ break;
case VPInstruction::CanonicalIVIncrementForPart:
O << "VF * Part + ";
break;
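The CalculateTripCountMinusVF opcode printed above computes the select shown in the string: the trip count minus one full vector step (VF times UF), clamped at zero so it never wraps for short trip counts. A small self-contained sketch with worked numbers (the VF, UF and TC values are just examples):

#include <cassert>
#include <cstdint>

// TC > Step ? TC - Step : 0, where Step = VF * UF is the number of scalar
// iterations consumed by one iteration of the unrolled vector loop.
static uint64_t tripCountMinusVF(uint64_t TC, uint64_t VF, uint64_t UF) {
  uint64_t Step = VF * UF;
  return TC > Step ? TC - Step : 0;
}

int main() {
  assert(tripCountMinusVF(1000, 4, 2) == 992); // 1000 - 8
  assert(tripCountMinusVF(5, 4, 2) == 0);      // fewer than one vector step left
}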
@@ -438,18 +458,19 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
}
void VPWidenCallRecipe::execute(VPTransformState &State) {
+ assert(State.VF.isVector() && "not widening");
auto &CI = *cast<CallInst>(getUnderlyingInstr());
assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFromInst(&CI);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.args())
- Tys.push_back(
- ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue()));
-
for (unsigned Part = 0; Part < State.UF; ++Part) {
- SmallVector<Type *, 2> TysForDecl = {CI.getType()};
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) {
+ TysForDecl.push_back(
+ VectorType::get(CI.getType()->getScalarType(), State.VF));
+ }
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -468,21 +489,16 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
Function *VectorF;
if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
// Use vector version of the intrinsic.
- if (State.VF.isVector())
- TysForDecl[0] =
- VectorType::get(CI.getType()->getScalarType(), State.VF);
Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
- // Use vector version of the function call.
- const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
- assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
- "Can't create vector function.");
+ assert(Variant != nullptr && "Can't create vector function.");
#endif
- VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+ VectorF = Variant;
}
+
SmallVector<OperandBundleDef, 1> OpBundles;
CI.getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
@@ -514,8 +530,12 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
if (VectorIntrinsicID)
O << " (using vector intrinsic)";
- else
- O << " (using library function)";
+ else {
+ O << " (using library function";
+ if (Variant->hasName())
+ O << ": " << Variant->getName();
+ O << ")";
+ }
}
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
@@ -528,7 +548,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(1)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
- O << (InvariantCond ? " (condition is loop invariant)" : "");
+ O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif
@@ -541,10 +561,10 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
auto *InvarCond =
- InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
+ isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
+ Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
Value *Op0 = State.get(getOperand(1), Part);
Value *Op1 = State.get(getOperand(2), Part);
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
@@ -553,6 +573,33 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
+ switch (OpType) {
+ case OperationType::PossiblyExactOp:
+ if (ExactFlags.IsExact)
+ O << " exact";
+ break;
+ case OperationType::OverflowingBinOp:
+ if (WrapFlags.HasNUW)
+ O << " nuw";
+ if (WrapFlags.HasNSW)
+ O << " nsw";
+ break;
+ case OperationType::FPMathOp:
+ getFastMathFlags().print(O);
+ break;
+ case OperationType::GEPOp:
+ if (GEPFlags.IsInBounds)
+ O << " inbounds";
+ break;
+ case OperationType::Other:
+ break;
+ }
+ O << " ";
+}
+#endif
+
void VPWidenRecipe::execute(VPTransformState &State) {
auto &I = *cast<Instruction>(getUnderlyingValue());
auto &Builder = State.Builder;
@@ -592,17 +639,8 @@ void VPWidenRecipe::execute(VPTransformState &State) {
Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
- if (auto *VecOp = dyn_cast<Instruction>(V)) {
- VecOp->copyIRFlags(&I);
-
- // If the instruction is vectorized and was in a basic block that needed
- // predication, we can't propagate poison-generating flags (nuw/nsw,
- // exact, etc.). The control flow has been linearized and the
- // instruction is no longer guarded by the predicate, which could make
- // the flag properties to no longer hold.
- if (State.MayGeneratePoisonRecipes.contains(this))
- VecOp->dropPoisonGeneratingFlags();
- }
+ if (auto *VecOp = dyn_cast<Instruction>(V))
+ setFlags(VecOp);
// Use this vector value for all users of the original instruction.
State.set(this, V, Part);
@@ -646,35 +684,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {
break;
}
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- State.setDebugLocFromInst(CI);
-
- /// Vectorize casts.
- Type *DestTy = (State.VF.isScalar())
- ? CI->getType()
- : VectorType::get(CI->getType(), State.VF);
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *A = State.get(getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- State.set(this, Cast, Part);
- State.addMetadata(Cast, &I);
- }
- break;
- }
default:
// This instruction is not vectorized by simple widening.
LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
@@ -687,10 +696,39 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
const Instruction *UI = getUnderlyingInstr();
- O << " = " << UI->getOpcodeName() << " ";
+ O << " = " << UI->getOpcodeName();
+ printFlags(O);
if (auto *Cmp = dyn_cast<CmpInst>(UI))
- O << CmpInst::getPredicateName(Cmp->getPredicate()) << " ";
+ O << Cmp->getPredicate() << " ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenCastRecipe::execute(VPTransformState &State) {
+ auto *I = cast_or_null<Instruction>(getUnderlyingValue());
+ if (I)
+ State.setDebugLocFromInst(I);
+ auto &Builder = State.Builder;
+ /// Vectorize casts.
+ assert(State.VF.isVector() && "Not vectorizing?");
+ Type *DestTy = VectorType::get(getResultType(), State.VF);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
+ State.set(this, Cast, Part);
+ State.addMetadata(Cast, I);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-CAST ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(Opcode) << " ";
printOperands(O, SlotTracker);
+ O << " to " << *getResultType();
}
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
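The cast handling removed from VPWidenRecipe above now lives in the dedicated VPWidenCastRecipe: for each unroll part it applies the scalar cast lane-wise, producing a <VF x DestTy> value. A fixed-width illustration of the effect for a sign extension (VF hard-coded to 4 for the example):

#include <array>
#include <cstdint>

constexpr unsigned VF = 4;
using VecI32 = std::array<int32_t, VF>;
using VecI64 = std::array<int64_t, VF>;

// Lane-wise view of the widened form of "sext i32 %x to i64": every lane of
// the operand vector is extended, yielding one <4 x i64> value per part.
static VecI64 widenSExt(const VecI32 &Op) {
  VecI64 Out{};
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Out[Lane] = static_cast<int64_t>(Op[Lane]);
  return Out;
}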
@@ -710,8 +748,13 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
+ // The step may be defined by a recipe in the preheader (e.g. if it requires
+ // SCEV expansion), but for the canonical induction the step is required to be
+ // 1, which is represented as live-in.
+ if (getStepValue()->getDefiningRecipe())
+ return false;
+ auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
return StartC && StartC->isZero() && StepC && StepC->isOne();
}
@@ -743,6 +786,7 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenGEPRecipe::execute(VPTransformState &State) {
+ assert(State.VF.isVector() && "not widening");
auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
@@ -750,7 +794,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.
- if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ if (areAllOperandsInvariant()) {
// If we are vectorizing, but the GEP has only loop-invariant operands,
// the GEP we build (by only using vector-typed operands for
// loop-varying values) would be a scalar pointer. Thus, to ensure we
@@ -763,9 +807,15 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// required. We would add the scalarization decision to
// collectLoopScalars() and teach getVectorValue() to broadcast
// the lane-zero scalar value.
- auto *Clone = State.Builder.Insert(GEP->clone());
+ SmallVector<Value *> Ops;
+ for (unsigned I = 0, E = getNumOperands(); I != E; I++)
+ Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
+
+ auto *NewGEP =
+ State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
+ ArrayRef(Ops).drop_front(), "", isInBounds());
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
+ Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
State.set(this, EntryPart, Part);
State.addMetadata(EntryPart, GEP);
}
@@ -780,7 +830,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
+ auto *Ptr = isPointerLoopInvariant()
? State.get(getOperand(0), VPIteration(0, 0))
: State.get(getOperand(0), Part);
@@ -789,24 +839,16 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
SmallVector<Value *, 4> Indices;
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
VPValue *Operand = getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
+ if (isIndexLoopInvariant(I - 1))
Indices.push_back(State.get(Operand, VPIteration(0, 0)));
else
Indices.push_back(State.get(Operand, Part));
}
- // If the GEP instruction is vectorized and was in a basic block that
- // needed predication, we can't propagate the poison-generating 'inbounds'
- // flag. The control flow has been linearized and the GEP is no longer
- // guarded by the predicate, which could make the 'inbounds' properties to
- // no longer hold.
- bool IsInBounds =
- GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
-
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
- Indices, "", IsInBounds);
+ Indices, "", isInBounds());
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP, Part);
@@ -819,14 +861,14 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
- O << (IsPtrLoopInvariant ? "Inv" : "Var");
- size_t IndicesNumber = IsIndexLoopInvariant.size();
- for (size_t I = 0; I < IndicesNumber; ++I)
- O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
+ O << (isPointerLoopInvariant() ? "Inv" : "Var");
+ for (size_t I = 0; I < getNumOperands() - 1; ++I)
+ O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
O << " ";
printAsOperand(O, SlotTracker);
- O << " = getelementptr ";
+ O << " = getelementptr";
+ printFlags(O);
printOperands(O, SlotTracker);
}
#endif
@@ -911,7 +953,21 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " (with final reduction value stored in invariant address sank "
"outside of loop)";
}
+#endif
+
+bool VPReplicateRecipe::shouldPack() const {
+ // Find if the recipe is used by a widened recipe via an intervening
+ // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
+ return any_of(users(), [](const VPUser *U) {
+ if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
+ return any_of(PredR->users(), [PredR](const VPUser *U) {
+ return !U->usesScalars(PredR);
+ });
+ return false;
+ });
+}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
@@ -921,18 +977,21 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
O << " = ";
}
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
- O << "call @" << CB->getCalledFunction()->getName() << "(";
+ O << "call";
+ printFlags(O);
+ O << "@" << CB->getCalledFunction()->getName() << "(";
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
O << ")";
} else {
- O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
+ O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
+ printFlags(O);
printOperands(O, SlotTracker);
}
- if (AlsoPack)
+ if (shouldPack())
O << " (S->V)";
}
#endif
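shouldPack() above derives the need for packing from the users: if a widened recipe consumes the replicated value through a VPPredInstPHIRecipe, the per-lane scalar results must be gathered back into a single vector. A minimal sketch of that packing step (morally a chain of insertelement operations), with an arbitrary element type:

#include <array>
#include <cstddef>

constexpr std::size_t VF = 4;

// Collect the per-lane results of a replicated (scalarized) instruction into
// one VF-wide value so a vector user can consume it.
static std::array<float, VF> packScalars(const float (&LaneResults)[VF]) {
  std::array<float, VF> Packed{};
  for (std::size_t Lane = 0; Lane < VF; ++Lane)
    Packed[Lane] = LaneResults[Lane];
  return Packed;
}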
@@ -1053,20 +1112,22 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool VPCanonicalIVPHIRecipe::isCanonical(const InductionDescriptor &ID,
- Type *Ty) const {
- if (Ty != getScalarType())
+bool VPCanonicalIVPHIRecipe::isCanonical(
+ InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
+ Type *Ty) const {
+ // The types must match and it must be an integer induction.
+ if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
return false;
- // The start value of ID must match the start value of this canonical
- // induction.
- if (getStartValue()->getLiveInIRValue() != ID.getStartValue())
+ // Start must match the start value of this canonical induction.
+ if (Start != getStartValue())
return false;
- ConstantInt *Step = ID.getConstIntStepValue();
- // ID must also be incremented by one. IK_IntInduction always increment the
- // induction by Step, but the binary op may not be set.
- return ID.getKind() == InductionDescriptor::IK_IntInduction && Step &&
- Step->isOne();
+ // If the step is defined by a recipe, it is not a ConstantInt.
+ if (Step->getDefiningRecipe())
+ return false;
+
+ ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
+ return StepC && StepC->isOne();
}
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
@@ -1092,9 +1153,11 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) {
Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
&*State.Builder.GetInsertPoint());
-
+ assert(!State.ExpandedSCEVs.contains(Expr) &&
+ "Same SCEV expanded multiple times");
+ State.ExpandedSCEVs[Expr] = Res;
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(this, Res, Part);
+ State.set(this, Res, {Part, 0});
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cbf111b00e3d..83bfdfd09d19 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "VPlanTransforms.h"
+#include "VPlanDominatorTree.h"
+#include "VPRecipeBuilder.h"
#include "VPlanCFG.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -22,11 +24,10 @@
using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
+ VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
- SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE,
- const TargetLibraryInfo &TLI) {
+ ScalarEvolution &SE, const TargetLibraryInfo &TLI) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan->getEntry());
@@ -39,22 +40,15 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPValue *VPV = Ingredient.getVPSingleValue();
Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
- if (DeadInstructions.count(Inst)) {
- VPValue DummyValue;
- VPV->replaceAllUsesWith(&DummyValue);
- Ingredient.eraseFromParent();
- continue;
- }
VPRecipeBase *NewRecipe = nullptr;
if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
- VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
+ VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue());
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
- NewRecipe =
- new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true);
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
@@ -66,28 +60,25 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
// Create VPWidenMemoryInstructionRecipe for loads and stores.
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
+ *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
+ false /*Consecutive*/, false /*Reverse*/);
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/);
+ *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
+ nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
- NewRecipe = new VPWidenGEPRecipe(
- GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+ NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
NewRecipe =
- new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()),
+ new VPWidenCallRecipe(*CI, drop_end(Ingredient.operands()),
getVectorIntrinsicIDForCall(CI, &TLI));
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
- bool InvariantCond =
- SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop);
- NewRecipe = new VPWidenSelectRecipe(
- *SI, Plan->mapToVPValues(SI->operands()), InvariantCond);
+ NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
+ } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
+ NewRecipe = new VPWidenCastRecipe(
+ CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI);
} else {
- NewRecipe =
- new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+ NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
}
}
@@ -98,15 +89,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
assert(NewRecipe->getNumDefinedValues() == 0 &&
"Only recpies with zero or one defined values expected");
Ingredient.eraseFromParent();
- Plan->removeVPValueFor(Inst);
- for (auto *Def : NewRecipe->definedValues()) {
- Plan->addVPValue(Inst, Def);
- }
}
}
}
-bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
+static bool sinkScalarOperands(VPlan &Plan) {
auto Iter = vp_depth_first_deep(Plan.getEntry());
bool Changed = false;
// First, collect the operands of all recipes in replicate blocks as seeds for
@@ -167,8 +154,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
continue;
Instruction *I = cast<Instruction>(
cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
- auto *Clone =
- new VPReplicateRecipe(I, SinkCandidate->operands(), true, false);
+ auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
@@ -224,7 +210,10 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
return nullptr;
}
-bool VPlanTransforms::mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
+// Merge replicate regions in their successor region, if a replicate region
+// is connected to a successor replicate region with the same predicate by a
+// single, empty VPBasicBlock.
+static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
SetVector<VPRegionBlock *> DeletedRegions;
// Collect replicate regions followed by an empty block, followed by another
@@ -312,6 +301,81 @@ bool VPlanTransforms::mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
return !DeletedRegions.empty();
}
+static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
+ VPlan &Plan) {
+ Instruction *Instr = PredRecipe->getUnderlyingInstr();
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BlockInMask = PredRecipe->getMask();
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+
+ // Replace predicated replicate recipe with a replicate recipe without a
+ // mask but in the replicate region.
+ auto *RecipeWithoutMask = new VPReplicateRecipe(
+ PredRecipe->getUnderlyingInstr(),
+ make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
+ PredRecipe->isUniform());
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+
+ VPPredInstPHIRecipe *PHIRecipe = nullptr;
+ if (PredRecipe->getNumUsers() != 0) {
+ PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
+ PredRecipe->replaceAllUsesWith(PHIRecipe);
+ PHIRecipe->setOperand(0, RecipeWithoutMask);
+ }
+ PredRecipe->eraseFromParent();
+ auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exiting);
+
+ return Region;
+}
+
+static void addReplicateRegions(VPlan &Plan) {
+ SmallVector<VPReplicateRecipe *> WorkList;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getEntry()))) {
+ for (VPRecipeBase &R : *VPBB)
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+ if (RepR->isPredicated())
+ WorkList.push_back(RepR);
+ }
+ }
+
+ unsigned BBNum = 0;
+ for (VPReplicateRecipe *RepR : WorkList) {
+ VPBasicBlock *CurrentBlock = RepR->getParent();
+ VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
+
+ BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
+ SplitBlock->setName(
+ OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
+ // Record predicated instructions for above packing optimizations.
+ VPBlockBase *Region = createReplicateRegion(RepR, Plan);
+ Region->setParent(CurrentBlock->getParent());
+ VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
+ VPBlockUtils::connectBlocks(CurrentBlock, Region);
+ VPBlockUtils::connectBlocks(Region, SplitBlock);
+ }
+}
+
+void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
+ // Convert masked VPReplicateRecipes to if-then region blocks.
+ addReplicateRegions(Plan);
+
+ bool ShouldSimplify = true;
+ while (ShouldSimplify) {
+ ShouldSimplify = sinkScalarOperands(Plan);
+ ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
+ ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
+ }
+}
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
SmallVector<VPBasicBlock *> WorkList;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -395,7 +459,10 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
// everything WidenNewIV's users need. That is, WidenOriginalIV will
// generate a vector phi or all users of WidenNewIV demand the first lane
// only.
- if (WidenOriginalIV->needsVectorIV() ||
+ if (any_of(WidenOriginalIV->users(),
+ [WidenOriginalIV](VPUser *U) {
+ return !U->usesScalars(WidenOriginalIV);
+ }) ||
vputils::onlyFirstLaneUsed(WidenNewIV)) {
WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
WidenNewIV->eraseFromParent();
@@ -440,10 +507,10 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
if (Instruction *TruncI = WideIV->getTruncInst())
ResultTy = TruncI->getType();
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *Step =
- vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE);
+ VPValue *Step = WideIV->getStepValue();
VPValue *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(ID, ResultTy)) {
+ if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step,
+ ResultTy)) {
BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV,
Step, ResultTy);
HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
@@ -522,9 +589,9 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
return;
LLVMContext &Ctx = SE.getContext();
- auto *BOC =
- new VPInstruction(VPInstruction::BranchOnCond,
- {Plan.getOrAddExternalDef(ConstantInt::getTrue(Ctx))});
+ auto *BOC = new VPInstruction(
+ VPInstruction::BranchOnCond,
+ {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))});
Term->eraseFromParent();
ExitingVPBB->appendRecipe(BOC);
Plan.setVF(BestVF);
@@ -533,3 +600,181 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// 1. Replace inductions with constants.
// 2. Replace vector loop region with VPBasicBlock.
}
+
+#ifndef NDEBUG
+static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
+ auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ if (Region && Region->isReplicator()) {
+ assert(Region->getNumSuccessors() == 1 &&
+ Region->getNumPredecessors() == 1 && "Expected SESE region!");
+ assert(R->getParent()->size() == 1 &&
+ "A recipe in an original replicator region must be the only "
+ "recipe in its block");
+ return Region;
+ }
+ return nullptr;
+}
+#endif
+
+static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
+ VPDominatorTree &VPDT) {
+ if (A == B)
+ return false;
+
+ auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
+ for (auto &R : *A->getParent()) {
+ if (&R == A)
+ return true;
+ if (&R == B)
+ return false;
+ }
+ llvm_unreachable("recipe not found");
+ };
+ const VPBlockBase *ParentA = A->getParent();
+ const VPBlockBase *ParentB = B->getParent();
+ if (ParentA == ParentB)
+ return LocalComesBefore(A, B);
+
+ assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
+ "No replicate regions expected at this point");
+ assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
+ "No replicate regions expected at this point");
+ return VPDT.properlyDominates(ParentA, ParentB);
+}
+
+/// Sink users of \p FOR after the recipe defining the previous value \p
+/// Previous of the recurrence. \returns true if all users of \p FOR could be
+/// re-arranged as needed or false if it is not possible.
+static bool
+sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
+ VPRecipeBase *Previous,
+ VPDominatorTree &VPDT) {
+ // Collect recipes that need sinking.
+ SmallVector<VPRecipeBase *> WorkList;
+ SmallPtrSet<VPRecipeBase *, 8> Seen;
+ Seen.insert(Previous);
+ auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
+ // The previous value must not depend on the users of the recurrence phi. In
+ // that case, FOR is not a fixed order recurrence.
+ if (SinkCandidate == Previous)
+ return false;
+
+ if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
+ !Seen.insert(SinkCandidate).second ||
+ properlyDominates(Previous, SinkCandidate, VPDT))
+ return true;
+
+ if (SinkCandidate->mayHaveSideEffects())
+ return false;
+
+ WorkList.push_back(SinkCandidate);
+ return true;
+ };
+
+ // Recursively sink users of FOR after Previous.
+ WorkList.push_back(FOR);
+ for (unsigned I = 0; I != WorkList.size(); ++I) {
+ VPRecipeBase *Current = WorkList[I];
+ assert(Current->getNumDefinedValues() == 1 &&
+ "only recipes with a single defined value expected");
+
+ for (VPUser *User : Current->getVPSingleValue()->users()) {
+ if (auto *R = dyn_cast<VPRecipeBase>(User))
+ if (!TryToPushSinkCandidate(R))
+ return false;
+ }
+ }
+
+ // Keep recipes to sink ordered by dominance so earlier instructions are
+ // processed first.
+ sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
+ return properlyDominates(A, B, VPDT);
+ });
+
+ for (VPRecipeBase *SinkCandidate : WorkList) {
+ if (SinkCandidate == FOR)
+ continue;
+
+ SinkCandidate->moveAfter(Previous);
+ Previous = SinkCandidate;
+ }
+ return true;
+}
+
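sinkRecurrenceUsersAfterPrevious rearranges users of a fixed-order recurrence phi so they run after the recipe producing the previous value; the FirstOrderRecurrenceSplice introduced below then stitches the previous and current vector parts together. A scalar loop exhibiting such a recurrence, and a fixed-width sketch of the offset -1 splice it lowers to (array sizes and names are illustrative):

#include <array>
#include <cstddef>

// Scalar fixed-order recurrence: each iteration reads the value produced by
// the previous one, e.g. Y[I] = X[I] + X[I-1].
static void scalarRecurrence(const int *X, int *Y, std::size_t N, int Init) {
  int Prev = Init;
  for (std::size_t I = 0; I < N; ++I) {
    Y[I] = X[I] + Prev;
    Prev = X[I];
  }
}

// Vector view: the splice with offset -1 takes the last lane of the previous
// part followed by the first VF-1 lanes of the current part, i.e. the values
// "Prev" would have held for each lane.
template <std::size_t VF>
static std::array<int, VF> spliceMinusOne(const std::array<int, VF> &PrevPart,
                                          const std::array<int, VF> &CurPart) {
  std::array<int, VF> Out{};
  Out[0] = PrevPart[VF - 1];
  for (std::size_t Lane = 1; Lane < VF; ++Lane)
    Out[Lane] = CurPart[Lane - 1];
  return Out;
}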
+bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
+ VPBuilder &Builder) {
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
+
+ SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
+ if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+ RecurrencePhis.push_back(FOR);
+
+ for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
+ SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
+ VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
+ // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
+ // to terminate.
+ while (auto *PrevPhi =
+ dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
+ assert(PrevPhi->getParent() == FOR->getParent());
+ assert(SeenPhis.insert(PrevPhi).second);
+ Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
+ }
+
+ if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
+ return false;
+
+ // Introduce a recipe to combine the incoming and previous values of a
+ // fixed-order recurrence.
+ VPBasicBlock *InsertBlock = Previous->getParent();
+ if (isa<VPHeaderPHIRecipe>(Previous))
+ Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
+ else
+ Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));
+
+ auto *RecurSplice = cast<VPInstruction>(
+ Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
+ {FOR, FOR->getBackedgeValue()}));
+
+ FOR->replaceAllUsesWith(RecurSplice);
+ // Set the first operand of RecurSplice to FOR again, after replacing
+ // all users.
+ RecurSplice->setOperand(0, FOR);
+ }
+ return true;
+}
+
+void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR)
+ continue;
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RK != RecurKind::Add && RK != RecurKind::Mul)
+ continue;
+
+ SmallSetVector<VPValue *, 8> Worklist;
+ Worklist.insert(PhiR);
+
+ for (unsigned I = 0; I != Worklist.size(); ++I) {
+ VPValue *Cur = Worklist[I];
+ if (auto *RecWithFlags =
+ dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
+ RecWithFlags->dropPoisonGeneratingFlags();
+ }
+
+ for (VPUser *U : Cur->users()) {
+ auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!UserRecipe)
+ continue;
+ for (VPValue *V : UserRecipe->definedValues())
+ Worklist.insert(V);
+ }
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index be0d8e76d809..3eccf6e9600d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -25,23 +25,23 @@ class ScalarEvolution;
class Loop;
class PredicatedScalarEvolution;
class TargetLibraryInfo;
+class VPBuilder;
+class VPRecipeBuilder;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes.
static void
- VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan,
+ VPInstructionsToVPRecipes(VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
- SmallPtrSetImpl<Instruction *> &DeadInstructions,
ScalarEvolution &SE, const TargetLibraryInfo &TLI);
- static bool sinkScalarOperands(VPlan &Plan);
-
- /// Merge replicate regions in their successor region, if a replicate region
- /// is connected to a successor replicate region with the same predicate by a
- /// single, empty VPBasicBlock.
- static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan);
+ /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
+ /// region block and remove the mask operand. Optimize the created regions by
+ /// iteratively sinking scalar operands into the region, followed by merging
+ /// regions until no further improvements remain.
+ static void createAndOptimizeReplicateRegions(VPlan &Plan);
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
@@ -71,6 +71,19 @@ struct VPlanTransforms {
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan);
+ /// Sink users of fixed-order recurrences after the recipe defining their
+ /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions
+ /// to combine the value from the recurrence phis and previous values. The
+ /// current implementation assumes all users can be sunk after the previous
+ /// value, which is enforced by earlier legality checks.
+ /// \returns true if all users of fixed-order recurrences could be re-arranged
+ /// as needed or false if it is not possible. In the latter case, \p Plan is
+ /// not valid.
+ static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
+
+ /// Clear NSW/NUW flags from reduction instructions if necessary.
+ static void clearReductionWrapFlags(VPlan &Plan);
+
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 62ec65cbfe5d..ac110bb3b0ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -171,16 +171,19 @@ public:
/// Returns true if this VPValue is defined by a recipe.
bool hasDefiningRecipe() const { return getDefiningRecipe(); }
+ /// Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
+ bool isLiveIn() const { return !hasDefiningRecipe(); }
+
/// Returns the underlying IR value, if this VPValue is defined outside the
/// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef
/// inside a VPlan.
Value *getLiveInIRValue() {
- assert(!hasDefiningRecipe() &&
+ assert(isLiveIn() &&
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
const Value *getLiveInIRValue() const {
- assert(!hasDefiningRecipe() &&
+ assert(isLiveIn() &&
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
@@ -342,15 +345,16 @@ public:
VPScalarIVStepsSC,
VPWidenCallSC,
VPWidenCanonicalIVSC,
+ VPWidenCastSC,
VPWidenGEPSC,
VPWidenMemoryInstructionSC,
VPWidenSC,
VPWidenSelectSC,
-
- // Phi-like recipes. Need to be kept together.
+ // START: Phi-like recipes. Need to be kept together.
VPBlendSC,
VPPredInstPHISC,
- // Header-phi recipes. Need to be kept together.
+ // START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
+ // VPHeaderPHIRecipes need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
VPFirstOrderRecurrencePHISC,
@@ -358,8 +362,11 @@ public:
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
VPReductionPHISC,
+ // END: SubclassID for recipes that inherit VPHeaderPHIRecipe
+ // END: Phi-like recipes
VPFirstPHISC = VPBlendSC,
VPFirstHeaderPHISC = VPCanonicalIVPHISC,
+ VPLastHeaderPHISC = VPReductionPHISC,
VPLastPHISC = VPReductionPHISC,
};
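Bracketing the header-phi IDs between VPFirstHeaderPHISC and the new VPLastHeaderPHISC keeps the whole family checkable with a single range compare, the usual SubclassID idiom. A cut-down standalone version of that idiom (the enum values are stand-ins, not the real recipe IDs):

// Keeping a subclass family contiguous lets classof() be one range check.
enum RecipeID {
  BlendID,
  PredInstPHIID,
  // START: header phis, kept contiguous.
  CanonicalIVPHIID,
  ActiveLaneMaskPHIID,
  ReductionPHIID,
  // END: header phis.
  FirstHeaderPHIID = CanonicalIVPHIID,
  LastHeaderPHIID = ReductionPHIID,
};

static bool isHeaderPHI(RecipeID ID) {
  return ID >= FirstHeaderPHIID && ID <= LastHeaderPHIID;
}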
@@ -434,6 +441,7 @@ class VPSlotTracker {
void assignSlot(const VPValue *V);
void assignSlots(const VPlan &Plan);
+ void assignSlots(const VPBasicBlock *VPBB);
public:
VPSlotTracker(const VPlan *Plan = nullptr) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 18125cebed33..d6b81543dbc9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -15,6 +15,7 @@
#include "VPlanVerifier.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Support/CommandLine.h"
@@ -189,9 +190,8 @@ static bool verifyPhiRecipes(const VPBasicBlock *VPBB) {
return true;
}
-static bool
-verifyVPBasicBlock(const VPBasicBlock *VPBB,
- DenseMap<const VPBlockBase *, unsigned> &BlockNumbering) {
+static bool verifyVPBasicBlock(const VPBasicBlock *VPBB,
+ VPDominatorTree &VPDT) {
if (!verifyPhiRecipes(VPBB))
return false;
@@ -206,7 +206,8 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB,
for (const VPValue *V : R.definedValues()) {
for (const VPUser *U : V->users()) {
auto *UI = dyn_cast<VPRecipeBase>(U);
- if (!UI || isa<VPHeaderPHIRecipe>(UI))
+ // TODO: check dominance of incoming values for phis properly.
+ if (!UI || isa<VPHeaderPHIRecipe>(UI) || isa<VPPredInstPHIRecipe>(UI))
continue;
// If the user is in the same block, check it comes after R in the
@@ -219,27 +220,7 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB,
continue;
}
- // Skip blocks outside any region for now and blocks outside
- // replicate-regions.
- auto *ParentR = VPBB->getParent();
- if (!ParentR || !ParentR->isReplicator())
- continue;
-
- // For replicators, verify that VPPRedInstPHIRecipe defs are only used
- // in subsequent blocks.
- if (isa<VPPredInstPHIRecipe>(&R)) {
- auto I = BlockNumbering.find(UI->getParent());
- unsigned BlockNumber = I == BlockNumbering.end() ? std::numeric_limits<unsigned>::max() : I->second;
- if (BlockNumber < BlockNumbering[ParentR]) {
- errs() << "Use before def!\n";
- return false;
- }
- continue;
- }
-
- // All non-VPPredInstPHIRecipe recipes in the block must be used in
- // the replicate region only.
- if (UI->getParent()->getParent() != ParentR) {
+ if (!VPDT.dominates(VPBB, UI->getParent())) {
errs() << "Use before def!\n";
return false;
}
@@ -250,15 +231,13 @@ verifyVPBasicBlock(const VPBasicBlock *VPBB,
}
bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
- DenseMap<const VPBlockBase *, unsigned> BlockNumbering;
- unsigned Cnt = 0;
+ VPDominatorTree VPDT;
+ VPDT.recalculate(const_cast<VPlan &>(Plan));
+
auto Iter = vp_depth_first_deep(Plan.getEntry());
- for (const VPBlockBase *VPB : Iter) {
- BlockNumbering[VPB] = Cnt++;
- auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
- if (!VPBB)
- continue;
- if (!verifyVPBasicBlock(VPBB, BlockNumbering))
+ for (const VPBasicBlock *VPBB :
+ VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) {
+ if (!verifyVPBasicBlock(VPBB, VPDT))
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2e489757ebc1..13464c9d3496 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -25,11 +25,8 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
#include <numeric>
#define DEBUG_TYPE "vector-combine"
@@ -247,7 +244,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// still need a shuffle to change the vector size.
auto *Ty = cast<FixedVectorType>(I.getType());
unsigned OutputNumElts = Ty->getNumElements();
- SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+ SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
Mask[0] = OffsetEltIndex;
if (OffsetEltIndex)
@@ -460,9 +457,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// If we are extracting from 2 different indexes, then one operand must be
// shuffled before performing the vector operation. The shuffle mask is
- // undefined except for 1 lane that is being translated to the remaining
+ // poison except for 1 lane that is being translated to the remaining
// extraction lane. Therefore, it is a splat shuffle. Ex:
- // ShufMask = { undef, undef, 0, undef }
+ // ShufMask = { poison, poison, 0, poison }
// TODO: The cost model has an option for a "broadcast" shuffle
// (splat-from-element-0), but no option for a more general splat.
NewCost +=
@@ -479,11 +476,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
unsigned NewIndex, IRBuilder<> &Builder) {
- // The shuffle mask is undefined except for 1 lane that is being translated
+ // The shuffle mask is poison except for 1 lane that is being translated
// to the new element index. Example for OldIndex == 2 and NewIndex == 0:
- // ShufMask = { 2, undef, undef, undef }
+ // ShufMask = { 2, poison, poison, poison }
auto *VecTy = cast<FixedVectorType>(Vec->getType());
- SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
ShufMask[NewIndex] = OldIndex;
return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}
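The shift shuffle built above is poison in every lane except NewIndex, which reads element OldIndex from the source vector; with poison mask elements spelled as -1, moving lane 2 to lane 0 of a 4-wide vector uses the mask {2, -1, -1, -1}. A self-contained sketch of the mask construction:

#include <cassert>
#include <vector>

constexpr int PoisonLane = -1; // poison mask elements are conventionally -1

// Only NewIndex is defined in the result; it reads element OldIndex.
static std::vector<int> shiftShuffleMask(unsigned NumElts, unsigned OldIndex,
                                         unsigned NewIndex) {
  std::vector<int> Mask(NumElts, PoisonLane);
  Mask[NewIndex] = static_cast<int>(OldIndex);
  return Mask;
}

int main() {
  // OldIndex == 2, NewIndex == 0: { 2, poison, poison, poison }.
  assert(shiftShuffleMask(4, 2, 0) == (std::vector<int>{2, -1, -1, -1}));
}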
@@ -917,7 +914,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
InstructionCost NewCost = TTI.getCmpSelInstrCost(
CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
- SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
ShufMask[CheapIndex] = ExpensiveIndex;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
ShufMask);
@@ -932,7 +929,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
// Create a vector constant from the 2 scalar constants.
SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
- UndefValue::get(VecTy->getElementType()));
+ PoisonValue::get(VecTy->getElementType()));
CmpC[Index0] = C0;
CmpC[Index1] = C1;
Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
@@ -1565,7 +1562,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// Calculate our ReconstructMasks from the OrigReconstructMasks and the
// modified order of the input shuffles.
SmallVector<SmallVector<int>> ReconstructMasks;
- for (auto Mask : OrigReconstructMasks) {
+ for (const auto &Mask : OrigReconstructMasks) {
SmallVector<int> ReconstructMask;
for (int M : Mask) {
auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
@@ -1596,12 +1593,12 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
}
while (V1A.size() < NumElts) {
- V1A.push_back(UndefMaskElem);
- V1B.push_back(UndefMaskElem);
+ V1A.push_back(PoisonMaskElem);
+ V1B.push_back(PoisonMaskElem);
}
while (V2A.size() < NumElts) {
- V2A.push_back(UndefMaskElem);
- V2B.push_back(UndefMaskElem);
+ V2A.push_back(PoisonMaskElem);
+ V2B.push_back(PoisonMaskElem);
}
auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
@@ -1660,16 +1657,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
return SSV->getOperand(Op);
return SV->getOperand(Op);
};
- Builder.SetInsertPoint(SVI0A->getNextNode());
+ Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef());
Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
GetShuffleOperand(SVI0A, 1), V1A);
- Builder.SetInsertPoint(SVI0B->getNextNode());
+ Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef());
Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
GetShuffleOperand(SVI0B, 1), V1B);
- Builder.SetInsertPoint(SVI1A->getNextNode());
+ Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef());
Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
GetShuffleOperand(SVI1A, 1), V2A);
- Builder.SetInsertPoint(SVI1B->getNextNode());
+ Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef());
Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
GetShuffleOperand(SVI1B, 1), V2B);
Builder.SetInsertPoint(Op0);
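// Note (not part of the patch itself): getNextNode() is only a safe insertion
// point when the defining instruction is an ordinary instruction in the middle
// of a block. getInsertionPointAfterDef() also covers the awkward cases, e.g.
// after a PHI the valid point is past all PHIs in the block, and an invoke's
// result is only defined in its normal destination. A hedged sketch of the
// guarded form, assuming it returns a (possibly null) Instruction* in this
// LLVM revision; the defs here are plain shuffles, so a valid point exists:
//
//   if (Instruction *IP = SVI0A->getInsertionPointAfterDef())
//     Builder.SetInsertPoint(IP);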
@@ -1811,54 +1808,6 @@ bool VectorCombine::run() {
return MadeChange;
}
-// Pass manager boilerplate below here.
-
-namespace {
-class VectorCombineLegacyPass : public FunctionPass {
-public:
- static char ID;
- VectorCombineLegacyPass() : FunctionPass(ID) {
- initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.setPreservesCFG();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- VectorCombine Combiner(F, TTI, DT, AA, AC, false);
- return Combiner.run();
- }
-};
-} // namespace
-
-char VectorCombineLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false, false)
-Pass *llvm::createVectorCombinePass() {
- return new VectorCombineLegacyPass();
-}
-
PreservedAnalyses VectorCombinePass::run(Function &F,
FunctionAnalysisManager &FAM) {
auto &AC = FAM.getResult<AssumptionAnalysis>(F);
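// Note (not part of the patch itself): with the legacy wrapper pass deleted,
// vector-combine is reachable only through the new pass manager, e.g.
// "opt -passes=vector-combine", or programmatically roughly like this sketch
// (analysis-manager setup and cross-registration elided):
//
//   #include "llvm/Transforms/Vectorize/VectorCombine.h"
//   FunctionPassManager FPM;
//   FPM.addPass(VectorCombinePass());
//   PreservedAnalyses PA = FPM.run(F, FAM);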
diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 208e5eeea864..2f5048d2a664 100644
--- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -12,10 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Vectorize.h"
-#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"
@@ -23,20 +19,5 @@ using namespace llvm;
/// Initialize all passes linked into the Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
- initializeLoopVectorizePass(Registry);
- initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerLegacyPassPass(Registry);
- initializeVectorCombineLegacyPassPass(Registry);
-}
-
-void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
- initializeVectorization(*unwrap(R));
-}
-
-void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopVectorizePass());
-}
-
-void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSLPVectorizerPass());
}
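// Note (not part of the patch itself): only the LoadStoreVectorizer keeps a
// legacy wrapper to initialize here; the deleted C bindings
// (LLVMAddLoopVectorizePass / LLVMAddSLPVectorizePass) have no legacy-PM
// replacement. A new-pass-manager sketch of the equivalent pipeline, assuming
// an already-initialized FunctionAnalysisManager FAM:
//
//   #include "llvm/Transforms/Vectorize/LoopVectorize.h"
//   #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
//   FunctionPassManager FPM;
//   FPM.addPass(LoopVectorizePass());
//   FPM.addPass(SLPVectorizerPass());
//   FPM.run(F, FAM);
//   // or from the command line: opt -passes='loop-vectorize,slp-vectorizer'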