diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000 |
commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) | |
tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | |
parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) | |
download | src-7fa27ce4a07f19b07799a767fc29416f3b625afb.tar.gz src-7fa27ce4a07f19b07799a767fc29416f3b625afb.zip |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2049 |
1 files changed, 1112 insertions, 937 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 0b7fc853dc1b..260d7889906b 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -37,13 +37,34 @@ // multiple scalar registers, similar to a GPU vectorized load. In theory ARM // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. +// +// Overview of the algorithm and terminology in this pass: +// +// - Break up each basic block into pseudo-BBs, composed of instructions which +// are guaranteed to transfer control to their successors. +// - Within a single pseudo-BB, find all loads, and group them into +// "equivalence classes" according to getUnderlyingObject() and loaded +// element size. Do the same for stores. +// - For each equivalence class, greedily build "chains". Each chain has a +// leader instruction, and every other member of the chain has a known +// constant offset from the first instr in the chain. +// - Break up chains so that they contain only contiguous accesses of legal +// size with no intervening may-alias instrs. +// - Convert each chain to vector instructions. +// +// The O(n^2) behavior of this pass comes from initially building the chains. +// In the worst case we have to compare each new instruction to all of those +// that came before. To limit this, we only calculate the offset to the leaders +// of the N most recently-used chains. #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -57,6 +78,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -67,23 +89,33 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> +#include <cstdint> #include <cstdlib> +#include <iterator> +#include <limits> +#include <numeric> +#include <optional> #include <tuple> +#include <type_traits> #include <utility> +#include <vector> using namespace llvm; @@ -92,21 +124,115 @@ using namespace llvm; STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); +namespace { + +// Equivalence class key, the initial tuple by which we group loads/stores. +// Loads/stores with different EqClassKeys are never merged. +// +// (We could in theory remove element-size from the this tuple. We'd just need +// to fix up the vector packing/unpacking code.) +using EqClassKey = + std::tuple<const Value * /* result of getUnderlyingObject() */, + unsigned /* AddrSpace */, + unsigned /* Load/Store element size bits */, + char /* IsLoad; char b/c bool can't be a DenseMap key */ + >; +[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const EqClassKey &K) { + const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K; + return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject + << " of element size " << ElementSize << " bits in addrspace " + << AddrSpace; +} + +// A Chain is a set of instructions such that: +// - All instructions have the same equivalence class, so in particular all are +// loads, or all are stores. +// - We know the address accessed by the i'th chain elem relative to the +// chain's leader instruction, which is the first instr of the chain in BB +// order. +// +// Chains have two canonical orderings: +// - BB order, sorted by Instr->comesBefore. +// - Offset order, sorted by OffsetFromLeader. +// This pass switches back and forth between these orders. +struct ChainElem { + Instruction *Inst; + APInt OffsetFromLeader; +}; +using Chain = SmallVector<ChainElem, 1>; + +void sortChainInBBOrder(Chain &C) { + sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); }); +} + +void sortChainInOffsetOrder(Chain &C) { + sort(C, [](const auto &A, const auto &B) { + if (A.OffsetFromLeader != B.OffsetFromLeader) + return A.OffsetFromLeader.slt(B.OffsetFromLeader); + return A.Inst->comesBefore(B.Inst); // stable tiebreaker + }); +} + +[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) { + for (const auto &E : C) { + dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n"; + } +} + +using EquivalenceClassMap = + MapVector<EqClassKey, SmallVector<Instruction *, 8>>; + // FIXME: Assuming stack alignment of 4 is always good enough -static const unsigned StackAdjustedAlignment = 4; +constexpr unsigned StackAdjustedAlignment = 4; -namespace { +Instruction *propagateMetadata(Instruction *I, const Chain &C) { + SmallVector<Value *, 8> Values; + for (const ChainElem &E : C) + Values.push_back(E.Inst); + return propagateMetadata(I, Values); +} -/// ChainID is an arbitrary token that is allowed to be different only for the -/// accesses that are guaranteed to be considered non-consecutive by -/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions -/// together and reducing the number of instructions the main search operates on -/// at a time, i.e. this is to reduce compile time and nothing else as the main -/// search has O(n^2) time complexity. The underlying type of ChainID should not -/// be relied upon. -using ChainID = const Value *; -using InstrList = SmallVector<Instruction *, 8>; -using InstrListMap = MapVector<ChainID, InstrList>; +bool isInvariantLoad(const Instruction *I) { + const LoadInst *LI = dyn_cast<LoadInst>(I); + return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load); +} + +/// Reorders the instructions that I depends on (the instructions defining its +/// operands), to ensure they dominate I. +void reorder(Instruction *I) { + SmallPtrSet<Instruction *, 16> InstructionsToMove; + SmallVector<Instruction *, 16> Worklist; + + Worklist.push_back(I); + while (!Worklist.empty()) { + Instruction *IW = Worklist.pop_back_val(); + int NumOperands = IW->getNumOperands(); + for (int i = 0; i < NumOperands; i++) { + Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); + if (!IM || IM->getOpcode() == Instruction::PHI) + continue; + + // If IM is in another BB, no need to move it, because this pass only + // vectorizes instructions within one BB. + if (IM->getParent() != I->getParent()) + continue; + + if (!IM->comesBefore(I)) { + InstructionsToMove.insert(IM); + Worklist.push_back(IM); + } + } + } + + // All instructions to move should follow I. Start from I, not from begin(). + for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) { + Instruction *IM = &*(BBI++); + if (!InstructionsToMove.count(IM)) + continue; + IM->moveBefore(I); + } +} class Vectorizer { Function &F; @@ -118,6 +244,12 @@ class Vectorizer { const DataLayout &DL; IRBuilder<> Builder; + // We could erase instrs right after vectorizing them, but that can mess up + // our BB iterators, and also can make the equivalence class keys point to + // freed memory. This is fixable, but it's simpler just to wait until we're + // done with the BB and erase all at once. + SmallVector<Instruction *, 128> ToErase; + public: Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) @@ -127,70 +259,83 @@ public: bool run(); private: - unsigned getPointerAddressSpace(Value *I); - static const unsigned MaxDepth = 3; - bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth = 0) const; - bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth) const; - bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, - unsigned Depth) const; - - /// After vectorization, reorder the instructions that I depends on - /// (the instructions defining its operands), to ensure they dominate I. - void reorder(Instruction *I); - - /// Returns the first and the last instructions in Chain. - std::pair<BasicBlock::iterator, BasicBlock::iterator> - getBoundaryInstrs(ArrayRef<Instruction *> Chain); - - /// Erases the original instructions after vectorizing. - void eraseInstructions(ArrayRef<Instruction *> Chain); - - /// "Legalize" the vector type that would be produced by combining \p - /// ElementSizeBits elements in \p Chain. Break into two pieces such that the - /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is - /// expected to have more than 4 elements. - std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> - splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); - - /// Finds the largest prefix of Chain that's vectorizable, checking for - /// intervening instructions which may affect the memory accessed by the - /// instructions within Chain. + /// Runs the vectorizer on a "pseudo basic block", which is a range of + /// instructions [Begin, End) within one BB all of which have + /// isGuaranteedToTransferExecutionToSuccessor(I) == true. + bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End); + + /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores + /// in the same BB with the same value for getUnderlyingObject() etc. + bool runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass); + + /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class + /// where all instructions access a known, constant offset from the first + /// instruction. + bool runOnChain(Chain &C); + + /// Splits the chain into subchains of instructions which read/write a + /// contiguous block of memory. Discards any length-1 subchains (because + /// there's nothing to vectorize in there). + std::vector<Chain> splitChainByContiguity(Chain &C); + + /// Splits the chain into subchains where it's safe to hoist loads up to the + /// beginning of the sub-chain and it's safe to sink loads up to the end of + /// the sub-chain. Discards any length-1 subchains. + std::vector<Chain> splitChainByMayAliasInstrs(Chain &C); + + /// Splits the chain into subchains that make legal, aligned accesses. + /// Discards any length-1 subchains. + std::vector<Chain> splitChainByAlignment(Chain &C); + + /// Converts the instrs in the chain into a single vectorized load or store. + /// Adds the old scalar loads/stores to ToErase. + bool vectorizeChain(Chain &C); + + /// Tries to compute the offset in bytes PtrB - PtrA. + std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth = 0); + std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth); + std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth); + + /// Gets the element type of the vector that the chain will load or store. + /// This is nontrivial because the chain may contain elements of different + /// types; e.g. it's legal to have a chain that contains both i32 and float. + Type *getChainElemTy(const Chain &C); + + /// Determines whether ChainElem can be moved up (if IsLoad) or down (if + /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias + /// instructions. + /// + /// The map ChainElemOffsets must contain all of the elements in + /// [ChainBegin, ChainElem] and their offsets from some arbitrary base + /// address. It's ok if it contains additional entries. + template <bool IsLoadChain> + bool isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); + + /// Collects loads and stores grouped by "equivalence class", where: + /// - all elements in an eq class are a load or all are a store, + /// - they all load/store the same element size (it's OK to have e.g. i8 and + /// <4 x i8> in the same class, but not i32 and <4 x i8>), and + /// - they all have the same value for getUnderlyingObject(). + EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End); + + /// Partitions Instrs into "chains" where every instruction has a known + /// constant offset from the first instr in the chain. /// - /// The elements of \p Chain must be all loads or all stores and must be in - /// address order. - ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); - - /// Collects load and store instructions to vectorize. - std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); - - /// Processes the collected instructions, the \p Map. The values of \p Map - /// should be all loads or all stores. - bool vectorizeChains(InstrListMap &Map); - - /// Finds the load/stores to consecutive memory addresses and vectorizes them. - bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); - - /// Vectorizes the load instructions in Chain. - bool - vectorizeLoadChain(ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed); - - /// Vectorizes the store instructions in Chain. - bool - vectorizeStoreChain(ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed); - - /// Check if this load/store access is misaligned accesses. - /// Returns a \p RelativeSpeed of an operation if allowed suitable to - /// compare to another result for the same \p AddressSpace and potentially - /// different \p Alignment and \p SzInBytes. - bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed); + /// Postcondition: For all i, ret[i][0].second == 0, because the first instr + /// in the chain is the leader, and an instr touches distance 0 from itself. + std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -198,7 +343,8 @@ public: static char ID; LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { - initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); + initializeLoadStoreVectorizerLegacyPassPass( + *PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -250,11 +396,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { AssumptionCache &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - return V.run(); + return Vectorizer(F, AA, AC, DT, SE, TTI).run(); } -PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { +PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, + FunctionAnalysisManager &AM) { // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return PreservedAnalyses::all(); @@ -265,125 +411,681 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - bool Changed = V.run(); + bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); return Changed ? PA : PreservedAnalyses::all(); } -// The real propagateMetadata expects a SmallVector<Value*>, but we deal in -// vectors of Instructions. -static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { - SmallVector<Value *, 8> VL(IL.begin(), IL.end()); - propagateMetadata(I, VL); -} - -// Vectorizer Implementation bool Vectorizer::run() { bool Changed = false; - - // Scan the blocks in the function in post order. + // Break up the BB if there are any instrs which aren't guaranteed to transfer + // execution to their successor. + // + // Consider, for example: + // + // def assert_arr_len(int n) { if (n < 2) exit(); } + // + // load arr[0] + // call assert_array_len(arr.length) + // load arr[1] + // + // Even though assert_arr_len does not read or write any memory, we can't + // speculate the second load before the call. More info at + // https://github.com/llvm/llvm-project/issues/52950. for (BasicBlock *BB : post_order(&F)) { - InstrListMap LoadRefs, StoreRefs; - std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); - Changed |= vectorizeChains(LoadRefs); - Changed |= vectorizeChains(StoreRefs); + // BB must at least have a terminator. + assert(!BB->empty()); + + SmallVector<BasicBlock::iterator, 8> Barriers; + Barriers.push_back(BB->begin()); + for (Instruction &I : *BB) + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + Barriers.push_back(I.getIterator()); + Barriers.push_back(BB->end()); + + for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End; + ++It) + Changed |= runOnPseudoBB(*It, *std::next(It)); + + for (Instruction *I : ToErase) { + auto *PtrOperand = getLoadStorePointerOperand(I); + if (I->use_empty()) + I->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + ToErase.clear(); } return Changed; } -unsigned Vectorizer::getPointerAddressSpace(Value *I) { - if (LoadInst *L = dyn_cast<LoadInst>(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast<StoreInst>(I)) - return S->getPointerAddressSpace(); - return -1; +bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... "; + if (End != Begin->getParent()->end()) + dbgs() << *End; + else + dbgs() << "<BB end>"; + dbgs() << ")\n"; + }); + + bool Changed = false; + for (const auto &[EqClassKey, EqClass] : + collectEquivalenceClasses(Begin, End)) + Changed |= runOnEquivalenceClass(EqClassKey, EqClass); + + return Changed; } -// FIXME: Merge with llvm::isConsecutiveAccess -bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getLoadStorePointerOperand(A); - Value *PtrB = getLoadStorePointerOperand(B); - unsigned ASA = getPointerAddressSpace(A); - unsigned ASB = getPointerAddressSpace(B); +bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass) { + bool Changed = false; - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; + LLVM_DEBUG({ + dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() + << " keyed on " << EqClassKey << ":\n"; + for (Instruction *I : EqClass) + dbgs() << " " << *I << "\n"; + }); - // Make sure that A and B are different pointers of the same size type. - Type *PtrATy = getLoadStoreType(A); - Type *PtrBTy = getLoadStoreType(B); - if (PtrA == PtrB || - PtrATy->isVectorTy() != PtrBTy->isVectorTy() || - DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || - DL.getTypeStoreSize(PtrATy->getScalarType()) != - DL.getTypeStoreSize(PtrBTy->getScalarType())) - return false; + std::vector<Chain> Chains = gatherChains(EqClass); + LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() + << " nontrivial chains.\n";); + for (Chain &C : Chains) + Changed |= runOnChain(C); + return Changed; +} - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); +bool Vectorizer::runOnChain(Chain &C) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; + dumpChain(C); + }); - return areConsecutivePointers(PtrA, PtrB, Size); + // Split up the chain into increasingly smaller chains, until we can finally + // vectorize the chains. + // + // (Don't be scared by the depth of the loop nest here. These operations are + // all at worst O(n lg n) in the number of instructions, and splitting chains + // doesn't change the number of instrs. So the whole loop nest is O(n lg n).) + bool Changed = false; + for (auto &C : splitChainByMayAliasInstrs(C)) + for (auto &C : splitChainByContiguity(C)) + for (auto &C : splitChainByAlignment(C)) + Changed |= vectorizeChain(C); + return Changed; } -bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - APInt PtrDelta, unsigned Depth) const { - unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); - APInt OffsetA(PtrBitWidth, 0); - APInt OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); +std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) { + if (C.empty()) + return {}; - unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + sortChainInBBOrder(C); - if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n"; + dumpChain(C); + }); + + // We know that elements in the chain with nonverlapping offsets can't + // alias, but AA may not be smart enough to figure this out. Use a + // hashtable. + DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets; + for (const auto &E : C) + ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader}); + + // Loads get hoisted up to the first load in the chain. Stores get sunk + // down to the last store in the chain. Our algorithm for loads is: + // + // - Take the first element of the chain. This is the start of a new chain. + // - Take the next element of `Chain` and check for may-alias instructions + // up to the start of NewChain. If no may-alias instrs, add it to + // NewChain. Otherwise, start a new NewChain. + // + // For stores it's the same except in the reverse direction. + // + // We expect IsLoad to be an std::bool_constant. + auto Impl = [&](auto IsLoad) { + // MSVC is unhappy if IsLoad is a capture, so pass it as an arg. + auto [ChainBegin, ChainEnd] = [&](auto IsLoad) { + if constexpr (IsLoad()) + return std::make_pair(C.begin(), C.end()); + else + return std::make_pair(C.rbegin(), C.rend()); + }(IsLoad); + assert(ChainBegin != ChainEnd); + + std::vector<Chain> Chains; + SmallVector<ChainElem, 1> NewChain; + NewChain.push_back(*ChainBegin); + for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) { + if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst, + ChainOffsets)) { + LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst + << "\n"); + NewChain.push_back(*ChainIt); + } else { + LLVM_DEBUG( + dbgs() << "LSV: Found intervening may-alias instrs; cannot merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n"); + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + + // Start a new chain. + NewChain = SmallVector<ChainElem, 1>({*ChainIt}); + } + } + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + return Chains; + }; + + if (isa<LoadInst>(C[0].Inst)) + return Impl(/*IsLoad=*/std::bool_constant<true>()); + + assert(isa<StoreInst>(C[0].Inst)); + return Impl(/*IsLoad=*/std::bool_constant<false>()); +} + +std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) { + if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByContiguity considering chain:\n"; + dumpChain(C); + }); + + std::vector<Chain> Ret; + Ret.push_back({C.front()}); + + for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) { + // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd). + auto &CurChain = Ret.back(); + const ChainElem &Prev = CurChain.back(); + unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); + assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " + "collectEquivalenceClass"); + APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; + + // Add this instruction to the end of the current chain, or start a new one. + bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; + LLVM_DEBUG(dbgs() << "LSV: Instructions are " + << (AreContiguous ? "" : "not ") << "contiguous: " + << *Prev.Inst << " (ends at offset " << PrevReadEnd + << ") -> " << *It->Inst << " (starts at offset " + << It->OffsetFromLeader << ")\n"); + if (AreContiguous) + CurChain.push_back(*It); + else + Ret.push_back({*It}); + } + + // Filter out length-1 chains, these are uninteresting. + llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); + return Ret; +} + +Type *Vectorizer::getChainElemTy(const Chain &C) { + assert(!C.empty()); + // The rules are: + // - If there are any pointer types in the chain, use an integer type. + // - Prefer an integer type if it appears in the chain. + // - Otherwise, use the first type in the chain. + // + // The rule about pointer types is a simplification when we merge e.g. a load + // of a ptr and a double. There's no direct conversion from a ptr to a + // double; it requires a ptrtoint followed by a bitcast. + // + // It's unclear to me if the other rules have any practical effect, but we do + // it to match this pass's previous behavior. + if (any_of(C, [](const ChainElem &E) { + return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); + })) { + return Type::getIntNTy( + F.getContext(), + DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); + } + + for (const ChainElem &E : C) + if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy()) + return T; + return getLoadStoreType(C[0].Inst)->getScalarType(); +} + +std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) { + // We use a simple greedy algorithm. + // - Given a chain of length N, find all prefixes that + // (a) are not longer than the max register length, and + // (b) are a power of 2. + // - Starting from the longest prefix, try to create a vector of that length. + // - If one of them works, great. Repeat the algorithm on any remaining + // elements in the chain. + // - If none of them work, discard the first element and repeat on a chain + // of length N-1. + if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByAlignment considering chain:\n"; + dumpChain(C); + }); + + bool IsLoadChain = isa<LoadInst>(C[0].Inst); + auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize, + unsigned ChainSizeBytes, VectorType *VecTy) { + return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy) + : TTI.getStoreVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy); + }; + +#ifndef NDEBUG + for (const auto &E : C) { + Type *Ty = getLoadStoreType(E.Inst)->getScalarType(); + assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) && + "Should have filtered out non-power-of-two elements in " + "collectEquivalenceClasses."); + } +#endif + + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + + std::vector<Chain> Ret; + for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { + // Find candidate chains of size not greater than the largest vector reg. + // These chains are over the closed interval [CBegin, CEnd]. + SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8> + CandidateChains; + for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) { + APInt Sz = C[CEnd].OffsetFromLeader + + DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) - + C[CBegin].OffsetFromLeader; + if (Sz.sgt(VecRegBytes)) + break; + CandidateChains.push_back( + {CEnd, static_cast<unsigned>(Sz.getLimitedValue())}); + } + + // Consider the longest chain first. + for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend(); + It != End; ++It) { + auto [CEnd, SizeBytes] = *It; + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment considering candidate chain [" + << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n"); + + Type *VecElemTy = getChainElemTy(C); + // Note, VecElemTy is a power of 2, but might be less than one byte. For + // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case + // VecElemTy would be i4. + unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy); + + // SizeBytes and VecElemBits are powers of 2, so they divide evenly. + assert((8 * SizeBytes) % VecElemBits == 0); + unsigned NumVecElems = 8 * SizeBytes / VecElemBits; + FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems); + unsigned VF = 8 * VecRegBytes / VecElemBits; + + // Check that TTI is happy with this vectorization factor. + unsigned TargetVF = getVectorFactor(VF, VecElemBits, + VecElemBits * NumVecElems / 8, VecTy); + if (TargetVF != VF && TargetVF < NumVecElems) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because TargetVF=" + << TargetVF << " != VF=" << VF + << " and TargetVF < NumVecElems=" << NumVecElems << "\n"); + continue; + } + + // Is a load/store with this alignment allowed by TTI and at least as fast + // as an unvectorized load/store? + // + // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). + auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, + &F = F](Align Alignment) { + if (Alignment.value() % SizeBytes == 0) + return true; + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". Therefore this access won't be vectorized.\n"); + return false; + } + return true; + }; + + // If we're loading/storing from an alloca, align it if possible. + // + // FIXME: We eagerly upgrade the alignment, regardless of whether TTI + // tells us this is beneficial. This feels a bit odd, but it matches + // existing tests. This isn't *so* bad, because at most we align to 4 + // bytes (current value of StackAdjustedAlignment). + // + // FIXME: We will upgrade the alignment of the alloca even if it turns out + // we can't vectorize for some other reason. + Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst); + bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() && + isa<AllocaInst>(PtrOperand->stripPointerCasts()); + Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); + Align PrefAlign = Align(StackAdjustedAlignment); + if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && + IsAllowedAndFast(PrefAlign)) { + Align NewAlign = getOrEnforceKnownAlignment( + PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); + if (NewAlign >= Alignment) { + LLVM_DEBUG(dbgs() + << "LSV: splitByChain upgrading alloca alignment from " + << Alignment.value() << " to " << NewAlign.value() + << "\n"); + Alignment = NewAlign; + } + } + + if (!IsAllowedAndFast(Alignment)) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because its alignment is not AllowedAndFast: " + << Alignment.value() << "\n"); + continue; + } + + if ((IsLoadChain && + !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) || + (!IsLoadChain && + !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because !isLegalToVectorizeLoad/StoreChain."); + continue; + } + + // Hooray, we can vectorize this chain! + Chain &NewChain = Ret.emplace_back(); + for (unsigned I = CBegin; I <= CEnd; ++I) + NewChain.push_back(C[I]); + CBegin = CEnd; // Skip over the instructions we've added to the chain. + break; + } + } + return Ret; +} + +bool Vectorizer::vectorizeChain(Chain &C) { + if (C.size() < 2) return false; - // In case if we have to shrink the pointer - // stripAndAccumulateInBoundsConstantOffsets should properly handle a - // possible overflow and the value should fit into a smallest data type - // used in the cast/gep chain. - assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && - OffsetB.getMinSignedBits() <= NewPtrBitWidth); + sortChainInOffsetOrder(C); - OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); - OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); - PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + LLVM_DEBUG({ + dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n"; + dumpChain(C); + }); - APInt OffsetDelta = OffsetB - OffsetA; + Type *VecElemTy = getChainElemTy(C); + bool IsLoadChain = isa<LoadInst>(C[0].Inst); + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned ChainBytes = std::accumulate( + C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) { + return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst)); + }); + assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0); + // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller + // than 1 byte (e.g. VecTy == <32 x i1>). + Type *VecTy = FixedVectorType::get( + VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy)); + + Align Alignment = getLoadStoreAlignment(C[0].Inst); + // If this is a load/store of an alloca, we might have upgraded the alloca's + // alignment earlier. Get the new alignment. + if (AS == DL.getAllocaAddrSpace()) { + Alignment = std::max( + Alignment, + getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst), + MaybeAlign(), DL, C[0].Inst, nullptr, &DT)); + } - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == PtrDelta; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the pointer delta requested. - APInt BaseDelta = PtrDelta - OffsetDelta; - - // Compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE.getSCEV(PtrA); - const SCEV *PtrSCEVB = SE.getSCEV(PtrB); - const SCEV *C = SE.getConstant(BaseDelta); - const SCEV *X = SE.getAddExpr(PtrSCEVA, C); - if (X == PtrSCEVB) + // All elements of the chain must have the same scalar-type size. +#ifndef NDEBUG + for (const ChainElem &E : C) + assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) == + DL.getTypeStoreSize(VecElemTy)); +#endif + + Instruction *VecInst; + if (IsLoadChain) { + // Loads get hoisted to the location of the first load in the chain. We may + // also need to hoist the (transitive) operands of the loads. + Builder.SetInsertPoint( + std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + Value *Bitcast = Builder.CreateBitCast( + getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS)); + VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + + unsigned VecIdx = 0; + for (const ChainElem &E : C) { + Instruction *I = E.Inst; + Value *V; + Type *T = getLoadStoreType(I); + if (auto *VT = dyn_cast<FixedVectorType>(T)) { + auto Mask = llvm::to_vector<8>( + llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements())); + V = Builder.CreateShuffleVector(VecInst, Mask, I->getName()); + VecIdx += VT->getNumElements(); + } else { + V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx), + I->getName()); + ++VecIdx; + } + if (V->getType() != I->getType()) + V = Builder.CreateBitOrPointerCast(V, I->getType()); + I->replaceAllUsesWith(V); + } + + // Finally, we need to reorder the instrs in the BB so that the (transitive) + // operands of VecInst appear before it. To see why, suppose we have + // vectorized the following code: + // + // ptr1 = gep a, 1 + // load1 = load i32 ptr1 + // ptr0 = gep a, 0 + // load0 = load i32 ptr0 + // + // We will put the vectorized load at the location of the earliest load in + // the BB, i.e. load1. We get: + // + // ptr1 = gep a, 1 + // loadv = load <2 x i32> ptr0 + // load0 = extractelement loadv, 0 + // load1 = extractelement loadv, 1 + // ptr0 = gep a, 0 + // + // Notice that loadv uses ptr0, which is defined *after* it! + reorder(VecInst); + } else { + // Stores get sunk to the location of the last store in the chain. + Builder.SetInsertPoint( + std::max_element(C.begin(), C.end(), [](auto &A, auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Build the vector to store. + Value *Vec = PoisonValue::get(VecTy); + unsigned VecIdx = 0; + auto InsertElem = [&](Value *V) { + if (V->getType() != VecElemTy) + V = Builder.CreateBitOrPointerCast(V, VecElemTy); + Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++)); + }; + for (const ChainElem &E : C) { + auto I = cast<StoreInst>(E.Inst); + if (FixedVectorType *VT = + dyn_cast<FixedVectorType>(getLoadStoreType(I))) { + for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { + InsertElem(Builder.CreateExtractElement(I->getValueOperand(), + Builder.getInt32(J))); + } + } else { + InsertElem(I->getValueOperand()); + } + } + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedStore( + Vec, + Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst), + VecTy->getPointerTo(AS)), + Alignment); + } + + propagateMetadata(VecInst, C); + + for (const ChainElem &E : C) + ToErase.push_back(E.Inst); + + ++NumVectorInstructions; + NumScalarsVectorized += C.size(); + return true; +} + +template <bool IsLoadChain> +bool Vectorizer::isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) { + LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> " + << *ChainBegin << ")\n"); + + assert(isa<LoadInst>(ChainElem) == IsLoadChain); + if (ChainElem == ChainBegin) return true; - // The above check will not catch the cases where one of the pointers is - // factorized but the other one is not, such as (C + (S * (A + B))) vs - // (AS + BS). Get the minus scev. That will allow re-combining the expresions - // and getting the simplified difference. - const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); - if (C == Dist) + // Invariant loads can always be reordered; by definition they are not + // clobbered by stores. + if (isInvariantLoad(ChainElem)) return true; - // Sometimes even this doesn't work, because SCEV can't always see through - // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking - // things the hard way. - return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); + auto BBIt = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainElem); + else + return BasicBlock::iterator(ChainElem); + }()); + auto BBItEnd = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainBegin); + else + return BasicBlock::iterator(ChainBegin); + }()); + + const APInt &ChainElemOffset = ChainOffsets.at(ChainElem); + const unsigned ChainElemSize = + DL.getTypeStoreSize(getLoadStoreType(ChainElem)); + + for (; BBIt != BBItEnd; ++BBIt) { + Instruction *I = &*BBIt; + + if (!I->mayReadOrWriteMemory()) + continue; + + // Loads can be reordered with other loads. + if (IsLoadChain && isa<LoadInst>(I)) + continue; + + // Stores can be sunk below invariant loads. + if (!IsLoadChain && isInvariantLoad(I)) + continue; + + // If I is in the chain, we can tell whether it aliases ChainIt by checking + // what offset ChainIt accesses. This may be better than AA is able to do. + // + // We should really only have duplicate offsets for stores (the duplicate + // loads should be CSE'ed), but in case we have a duplicate load, we'll + // split the chain so we don't have to handle this case specially. + if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) { + // I and ChainElem overlap if: + // - I and ChainElem have the same offset, OR + // - I's offset is less than ChainElem's, but I touches past the + // beginning of ChainElem, OR + // - ChainElem's offset is less than I's, but ChainElem touches past the + // beginning of I. + const APInt &IOffset = OffsetIt->second; + unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I)); + if (IOffset == ChainElemOffset || + (IOffset.sle(ChainElemOffset) && + (IOffset + IElemSize).sgt(ChainElemOffset)) || + (ChainElemOffset.sle(IOffset) && + (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) { + LLVM_DEBUG({ + // Double check that AA also sees this alias. If not, we probably + // have a bug. + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)); + dbgs() << "LSV: Found alias in chain: " << *I << "\n"; + }); + return false; // We found an aliasing instruction; bail. + } + + continue; // We're confident there's no alias. + } + + LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n"); + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) { + LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n" + << " Aliasing instruction:\n" + << " " << *I << '\n' + << " Aliased instruction and pointer:\n" + << " " << *ChainElem << '\n' + << " " << *getLoadStorePointerOperand(ChainElem) + << '\n'); + + return false; + } + } + return true; } static bool checkNoWrapFlags(Instruction *I, bool Signed) { @@ -395,10 +1097,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) { static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed) { - // If both OpA and OpB is an add with NSW/NUW and with - // one of the operands being the same, we can guarantee that the - // transformation is safe if we can prove that OpA won't overflow when - // IdxDiff added to the other operand of OpA. + LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff + << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA=" + << MatchingOpIdxA << ", AddOpB=" << *AddOpB + << ", MatchingOpIdxB=" << MatchingOpIdxB + << ", Signed=" << Signed << "\n"); + // If both OpA and OpB are adds with NSW/NUW and with one of the operands + // being the same, we can guarantee that the transformation is safe if we can + // prove that OpA won't overflow when Ret added to the other operand of OpA. // For example: // %tmp7 = add nsw i32 %tmp2, %v0 // %tmp8 = sext i32 %tmp7 to i64 @@ -407,10 +1113,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, // %tmp12 = add nsw i32 %tmp2, %tmp11 // %tmp13 = sext i32 %tmp12 to i64 // - // Both %tmp7 and %tmp2 has the nsw flag and the first operand - // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow - // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the - // nsw flag. + // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2. + // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds + // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag. assert(AddOpA->getOpcode() == Instruction::Add && AddOpB->getOpcode() == Instruction::Add && checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed)); @@ -461,24 +1166,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, return false; } -bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, - APInt PtrDelta, - unsigned Depth) const { +std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs( + Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA + << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst + << " Depth=" << Depth << "\n"); auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); if (!GEPA || !GEPB) - return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); + return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth); // Look through GEPs after checking they're the same except for the last // index. if (GEPA->getNumOperands() != GEPB->getNumOperands() || GEPA->getPointerOperand() != GEPB->getPointerOperand()) - return false; + return std::nullopt; gep_type_iterator GTIA = gep_type_begin(GEPA); gep_type_iterator GTIB = gep_type_begin(GEPB); for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { if (GTIA.getOperand() != GTIB.getOperand()) - return false; + return std::nullopt; ++GTIA; ++GTIB; } @@ -487,23 +1194,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) - return false; + return std::nullopt; - if (PtrDelta.isNegative()) { - if (PtrDelta.isMinSignedValue()) - return false; - PtrDelta.negate(); - std::swap(OpA, OpB); - } uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); - if (PtrDelta.urem(Stride) != 0) - return false; - unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); - APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth); // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) - return false; + return std::nullopt; bool Signed = isa<SExtInst>(OpA); @@ -511,7 +1208,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Value *ValA = OpA->getOperand(0); OpB = dyn_cast<Instruction>(OpB->getOperand(0)); if (!OpB || ValA->getType() != OpB->getType()) - return false; + return std::nullopt; + + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); + const SCEV *OffsetSCEVB = SE.getSCEV(OpB); + const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); + if (IdxDiffSCEV == SE.getCouldNotCompute()) + return std::nullopt; + + ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV); + if (!IdxDiffRange.isSingleElement()) + return std::nullopt; + APInt IdxDiff = *IdxDiffRange.getSingleElement(); + + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff + << "\n"); // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; @@ -530,10 +1241,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) && checkNoWrapFlags(OpB, Signed)) { - // In the checks below a matching operand in OpA and OpB is - // an operand which is the same in those two instructions. - // Below we account for possible orders of the operands of - // these add instructions. + // In the checks below a matching operand in OpA and OpB is an operand which + // is the same in those two instructions. Below we account for possible + // orders of the operands of these add instructions. for (unsigned MatchingOpIdxA : {0, 1}) for (unsigned MatchingOpIdxB : {0, 1}) if (!Safe) @@ -544,802 +1254,267 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Third attempt: - // If all set bits of IdxDiff or any higher order bit other than the sign bit - // are known to be zero in ValA, we can add Diff to it while guaranteeing no - // overflow of any sort. + // + // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher + // order bit other than the sign bit are known to be zero in ValA, we can add + // Diff to it while guaranteeing no overflow of any sort. + // + // If IdxDiff is negative, do the same, but swap ValA and ValB. if (!Safe) { + // When computing known bits, use the GEPs as context instructions, since + // they likely are in the same BB as the load/store. KnownBits Known(BitWidth); - computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT); + computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC, + ContextInst, &DT); APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); if (Signed) BitsAllowedToBeSet.clearBit(BitWidth - 1); - if (BitsAllowedToBeSet.ult(IdxDiff)) - return false; + if (BitsAllowedToBeSet.ult(IdxDiff.abs())) + return std::nullopt; + Safe = true; } - const SCEV *OffsetSCEVA = SE.getSCEV(ValA); - const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); - const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); - return X == OffsetSCEVB; + if (Safe) + return IdxDiff * Stride; + return std::nullopt; } -bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { +std::optional<APInt> Vectorizer::getConstantOffsetSelects( + Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) { if (Depth++ == MaxDepth) - return false; + return std::nullopt; if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { - return SelectA->getCondition() == SelectB->getCondition() && - areConsecutivePointers(SelectA->getTrueValue(), - SelectB->getTrueValue(), PtrDelta, Depth) && - areConsecutivePointers(SelectA->getFalseValue(), - SelectB->getFalseValue(), PtrDelta, Depth); + if (SelectA->getCondition() != SelectB->getCondition()) + return std::nullopt; + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", ContextInst=" + << *ContextInst << ", Depth=" << Depth << "\n"); + std::optional<APInt> TrueDiff = getConstantOffset( + SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth); + if (!TrueDiff.has_value()) + return std::nullopt; + std::optional<APInt> FalseDiff = + getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(), + ContextInst, Depth); + if (TrueDiff == FalseDiff) + return TrueDiff; } } - return false; + return std::nullopt; } -void Vectorizer::reorder(Instruction *I) { - SmallPtrSet<Instruction *, 16> InstructionsToMove; - SmallVector<Instruction *, 16> Worklist; - - Worklist.push_back(I); - while (!Worklist.empty()) { - Instruction *IW = Worklist.pop_back_val(); - int NumOperands = IW->getNumOperands(); - for (int i = 0; i < NumOperands; i++) { - Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); - if (!IM || IM->getOpcode() == Instruction::PHI) - continue; - - // If IM is in another BB, no need to move it, because this pass only - // vectorizes instructions within one BB. - if (IM->getParent() != I->getParent()) - continue; - - if (!IM->comesBefore(I)) { - InstructionsToMove.insert(IM); - Worklist.push_back(IM); - } +EquivalenceClassMap +Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + EquivalenceClassMap Ret; + + auto getUnderlyingObject = [](const Value *Ptr) -> const Value * { + const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); + if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { + // The select's themselves are distinct instructions even if they share + // the same condition and evaluate to consecutive pointers for true and + // false values of the condition. Therefore using the select's themselves + // for grouping instructions would put consecutive accesses into different + // lists and they won't be even checked for being consecutive, and won't + // be vectorized. + return Sel->getCondition(); } - } + return ObjPtr; + }; - // All instructions to move should follow I. Start from I, not from begin(). - for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; - ++BBI) { - if (!InstructionsToMove.count(&*BBI)) + for (Instruction &I : make_range(Begin, End)) { + auto *LI = dyn_cast<LoadInst>(&I); + auto *SI = dyn_cast<StoreInst>(&I); + if (!LI && !SI) continue; - Instruction *IM = &*BBI; - --BBI; - IM->removeFromParent(); - IM->insertBefore(I); - } -} - -std::pair<BasicBlock::iterator, BasicBlock::iterator> -Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { - Instruction *C0 = Chain[0]; - BasicBlock::iterator FirstInstr = C0->getIterator(); - BasicBlock::iterator LastInstr = C0->getIterator(); - BasicBlock *BB = C0->getParent(); - unsigned NumFound = 0; - for (Instruction &I : *BB) { - if (!is_contained(Chain, &I)) + if ((LI && !LI->isSimple()) || (SI && !SI->isSimple())) continue; - ++NumFound; - if (NumFound == 1) { - FirstInstr = I.getIterator(); - } - if (NumFound == Chain.size()) { - LastInstr = I.getIterator(); - break; - } - } - - // Range is [first, last). - return std::make_pair(FirstInstr, ++LastInstr); -} - -void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { - SmallVector<Instruction *, 16> Instrs; - for (Instruction *I : Chain) { - Value *PtrOperand = getLoadStorePointerOperand(I); - assert(PtrOperand && "Instruction must have a pointer operand."); - Instrs.push_back(I); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) - Instrs.push_back(GEP); - } - - // Erase instructions. - for (Instruction *I : Instrs) - if (I->use_empty()) - I->eraseFromParent(); -} - -std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> -Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, - unsigned ElementSizeBits) { - unsigned ElementSizeBytes = ElementSizeBits / 8; - unsigned SizeBytes = ElementSizeBytes * Chain.size(); - unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; - if (NumLeft == Chain.size()) { - if ((NumLeft & 1) == 0) - NumLeft /= 2; // Split even in half - else - --NumLeft; // Split off last element - } else if (NumLeft == 0) - NumLeft = 1; - return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); -} - -ArrayRef<Instruction *> -Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { - // These are in BB order, unlike Chain, which is in address order. - SmallVector<Instruction *, 16> MemoryInstrs; - SmallVector<Instruction *, 16> ChainInstrs; - - bool IsLoadChain = isa<LoadInst>(Chain[0]); - LLVM_DEBUG({ - for (Instruction *I : Chain) { - if (IsLoadChain) - assert(isa<LoadInst>(I) && - "All elements of Chain must be loads, or all must be stores."); - else - assert(isa<StoreInst>(I) && - "All elements of Chain must be loads, or all must be stores."); - } - }); - - for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { - if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) { - ChainInstrs.push_back(&I); + if ((LI && !TTI.isLegalToVectorizeLoad(LI)) || + (SI && !TTI.isLegalToVectorizeStore(SI))) continue; - } - if (!isGuaranteedToTransferExecutionToSuccessor(&I)) { - LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: " - << I << '\n'); - break; - } - if (I.mayReadOrWriteMemory()) - MemoryInstrs.push_back(&I); - } - - // Loop until we find an instruction in ChainInstrs that we can't vectorize. - unsigned ChainInstrIdx = 0; - Instruction *BarrierMemoryInstr = nullptr; - - for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { - Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; - - // If a barrier memory instruction was found, chain instructions that follow - // will not be added to the valid prefix. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr)) - break; - // Check (in BB order) if any instruction prevents ChainInstr from being - // vectorized. Find and store the first such "conflicting" instruction. - for (Instruction *MemInstr : MemoryInstrs) { - // If a barrier memory instruction was found, do not check past it. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr)) - break; - - auto *MemLoad = dyn_cast<LoadInst>(MemInstr); - auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); - if (MemLoad && ChainLoad) - continue; - - // We can ignore the alias if the we have a load store pair and the load - // is known to be invariant. The load cannot be clobbered by the store. - auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->hasMetadata(LLVMContext::MD_invariant_load); - }; - - if (IsLoadChain) { - // We can ignore the alias as long as the load comes before the store, - // because that means we won't be moving the load past the store to - // vectorize it (the vectorized load is inserted at the location of the - // first load in the chain). - if (ChainInstr->comesBefore(MemInstr) || - (ChainLoad && IsInvariantLoad(ChainLoad))) - continue; - } else { - // Same case, but in reverse. - if (MemInstr->comesBefore(ChainInstr) || - (MemLoad && IsInvariantLoad(MemLoad))) - continue; - } - - ModRefInfo MR = - AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr)); - if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) { - LLVM_DEBUG({ - dbgs() << "LSV: Found alias:\n" - " Aliasing instruction:\n" - << " " << *MemInstr << '\n' - << " Aliased instruction and pointer:\n" - << " " << *ChainInstr << '\n' - << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; - }); - // Save this aliasing memory instruction as a barrier, but allow other - // instructions that precede the barrier to be vectorized with this one. - BarrierMemoryInstr = MemInstr; - break; - } - } - // Continue the search only for store chains, since vectorizing stores that - // precede an aliasing load is valid. Conversely, vectorizing loads is valid - // up to an aliasing store, but should not pull loads from further down in - // the basic block. - if (IsLoadChain && BarrierMemoryInstr) { - // The BarrierMemoryInstr is a store that precedes ChainInstr. - assert(BarrierMemoryInstr->comesBefore(ChainInstr)); - break; - } - } - - // Find the largest prefix of Chain whose elements are all in - // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of - // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB - // order.) - SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( - ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); - unsigned ChainIdx = 0; - for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { - if (!VectorizableChainInstrs.count(Chain[ChainIdx])) - break; - } - return Chain.slice(0, ChainIdx); -} - -static ChainID getChainID(const Value *Ptr) { - const Value *ObjPtr = getUnderlyingObject(Ptr); - if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { - // The select's themselves are distinct instructions even if they share the - // same condition and evaluate to consecutive pointers for true and false - // values of the condition. Therefore using the select's themselves for - // grouping instructions would put consecutive accesses into different lists - // and they won't be even checked for being consecutive, and won't be - // vectorized. - return Sel->getCondition(); - } - return ObjPtr; -} - -std::pair<InstrListMap, InstrListMap> -Vectorizer::collectInstructions(BasicBlock *BB) { - InstrListMap LoadRefs; - InstrListMap StoreRefs; - - for (Instruction &I : *BB) { - if (!I.mayReadOrWriteMemory()) + Type *Ty = getLoadStoreType(&I); + if (!VectorType::isValidElementType(Ty->getScalarType())) continue; - if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (!LI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeLoad(LI)) - continue; - - Type *Ty = LI->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. - unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - Value *Ptr = LI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast<VectorType>(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save the load locations. - const ChainID ID = getChainID(Ptr); - LoadRefs[ID].push_back(LI); - } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { - if (!SI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeStore(SI)) - continue; - - Type *Ty = SI->getValueOperand()->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. - unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - Value *Ptr = SI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast<VectorType>(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save store location. - const ChainID ID = getChainID(Ptr); - StoreRefs[ID].push_back(SI); - } - } - - return {LoadRefs, StoreRefs}; -} - -bool Vectorizer::vectorizeChains(InstrListMap &Map) { - bool Changed = false; - - for (const std::pair<ChainID, InstrList> &Chain : Map) { - unsigned Size = Chain.second.size(); - if (Size < 2) + // Skip weird non-byte sizes. They probably aren't worth the effort of + // handling correctly. + unsigned TySize = DL.getTypeSizeInBits(Ty); + if ((TySize % 8) != 0) continue; - LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); - - // Process the stores in chunks of 64. - for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { - unsigned Len = std::min<unsigned>(CE - CI, 64); - ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); - Changed |= vectorizeInstructions(Chunk); - } - } - - return Changed; -} - -bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { - LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() - << " instructions.\n"); - SmallVector<int, 16> Heads, Tails; - int ConsecutiveChain[64]; - - // Do a quadratic search on all of the given loads/stores and find all of the - // pairs of loads/stores that follow each other. - for (int i = 0, e = Instrs.size(); i < e; ++i) { - ConsecutiveChain[i] = -1; - for (int j = e - 1; j >= 0; --j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Instrs[i], Instrs[j])) { - if (ConsecutiveChain[i] != -1) { - int CurDistance = std::abs(ConsecutiveChain[i] - i); - int NewDistance = std::abs(ConsecutiveChain[i] - j); - if (j < i || NewDistance > CurDistance) - continue; // Should not insert. - } + // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain + // functions are currently using an integer type for the vectorized + // load/store, and does not support casting between the integer type and a + // vector of pointers (e.g. i64 to <2 x i16*>) + if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) + continue; - Tails.push_back(j); - Heads.push_back(i); - ConsecutiveChain[i] = j; - } - } - } + Value *Ptr = getLoadStorePointerOperand(&I); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - bool Changed = false; - SmallPtrSet<Instruction *, 16> InstructionsProcessed; + unsigned VF = VecRegSize / TySize; + VectorType *VecTy = dyn_cast<VectorType>(Ty); - for (int Head : Heads) { - if (InstructionsProcessed.count(Instrs[Head])) + // Only handle power-of-two sized elements. + if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) || + (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType())))) continue; - bool LongerChainExists = false; - for (unsigned TIt = 0; TIt < Tails.size(); TIt++) - if (Head == Tails[TIt] && - !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - LongerChainExists = true; - break; - } - if (LongerChainExists) - continue; - - // We found an instr that starts a chain. Now follow the chain and try to - // vectorize it. - SmallVector<Instruction *, 16> Operands; - int I = Head; - while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { - if (InstructionsProcessed.count(Instrs[I])) - break; - - Operands.push_back(Instrs[I]); - I = ConsecutiveChain[I]; - } - bool Vectorized = false; - if (isa<LoadInst>(*Operands.begin())) - Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); - else - Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); + // No point in looking at these if they're too big to vectorize. + if (TySize > VecRegSize / 2 || + (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) + continue; - Changed |= Vectorized; + Ret[{getUnderlyingObject(Ptr), AS, + DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), + /*IsLoad=*/LI != nullptr}] + .push_back(&I); } - return Changed; + return Ret; } -bool Vectorizer::vectorizeStoreChain( - ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { - StoreInst *S0 = cast<StoreInst>(Chain[0]); - - // If the vector has an int element, default to int for the whole store. - Type *StoreTy = nullptr; - for (Instruction *I : Chain) { - StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); - if (StoreTy->isIntOrIntVectorTy()) - break; - - if (StoreTy->isPtrOrPtrVectorTy()) { - StoreTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(StoreTy)); - break; - } - } - assert(StoreTy && "Failed to find store type"); +std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) { + if (Instrs.empty()) + return {}; - unsigned Sz = DL.getTypeSizeInBits(StoreTy); - unsigned AS = S0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = S0->getAlign(); + unsigned AS = getLoadStoreAddressSpace(Instrs[0]); + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; +#ifndef NDEBUG + // Check that Instrs is in BB order and all have the same addr space. + for (size_t I = 1; I < Instrs.size(); ++I) { + assert(Instrs[I - 1]->comesBefore(Instrs[I])); + assert(getLoadStoreAddressSpace(Instrs[I]) == AS); } +#endif - ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } - - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. - unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - - FixedVectorType *VecTy; - auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); - if (VecStoreTy) - VecTy = FixedVectorType::get(StoreTy->getScalarType(), - Chain.size() * VecStoreTy->getNumElements()); - else - VecTy = FixedVectorType::get(StoreTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; - } - - LLVM_DEBUG({ - dbgs() << "LSV: Stores to vectorize:\n"; - for (Instruction *I : Chain) - dbgs() << " " << *I << "\n"; - }); - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the store is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; + // Machinery to build an MRU-hashtable of Chains. + // + // (Ideally this could be done with MapVector, but as currently implemented, + // moving an element to the front of a MapVector is O(n).) + struct InstrListElem : ilist_node<InstrListElem>, + std::pair<Instruction *, Chain> { + explicit InstrListElem(Instruction *I) + : std::pair<Instruction *, Chain>(I, {}) {} + }; + struct InstrListElemDenseMapInfo { + using PtrInfo = DenseMapInfo<InstrListElem *>; + using IInfo = DenseMapInfo<Instruction *>; + static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); } + static InstrListElem *getTombstoneKey() { + return PtrInfo::getTombstoneKey(); } - - Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, S0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; - } - - if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; - } - - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*Last); - - Value *Vec = PoisonValue::get(VecTy); - - if (VecStoreTy) { - unsigned VecWidth = VecStoreTy->getNumElements(); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast<StoreInst>(Chain[I]); - for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { - unsigned NewIdx = J + I * VecWidth; - Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), - Builder.getInt32(J)); - if (Extract->getType() != StoreTy->getScalarType()) - Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); - Vec = Insert; - } + static unsigned getHashValue(const InstrListElem *E) { + return IInfo::getHashValue(E->first); } - } else { - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast<StoreInst>(Chain[I]); - Value *Extract = Store->getValueOperand(); - if (Extract->getType() != StoreTy->getScalarType()) - Extract = - Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); - Vec = Insert; + static bool isEqual(const InstrListElem *A, const InstrListElem *B) { + if (A == getEmptyKey() || B == getEmptyKey()) + return A == getEmptyKey() && B == getEmptyKey(); + if (A == getTombstoneKey() || B == getTombstoneKey()) + return A == getTombstoneKey() && B == getTombstoneKey(); + return IInfo::isEqual(A->first, B->first); } - } - - StoreInst *SI = Builder.CreateAlignedStore( - Vec, - Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), - Alignment); - propagateMetadata(SI, Chain); - - eraseInstructions(Chain); - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::vectorizeLoadChain( - ArrayRef<Instruction *> Chain, - SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { - LoadInst *L0 = cast<LoadInst>(Chain[0]); - - // If the vector has an int element, default to int for the whole load. - Type *LoadTy = nullptr; - for (const auto &V : Chain) { - LoadTy = cast<LoadInst>(V)->getType(); - if (LoadTy->isIntOrIntVectorTy()) - break; - - if (LoadTy->isPtrOrPtrVectorTy()) { - LoadTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(LoadTy)); - break; + }; + SpecificBumpPtrAllocator<InstrListElem> Allocator; + simple_ilist<InstrListElem> MRU; + DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains; + + // Compare each instruction in `instrs` to leader of the N most recently-used + // chains. This limits the O(n^2) behavior of this pass while also allowing + // us to build arbitrarily long chains. + for (Instruction *I : Instrs) { + constexpr int MaxChainsToTry = 64; + + bool MatchFound = false; + auto ChainIter = MRU.begin(); + for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end(); + ++J, ++ChainIter) { + std::optional<APInt> Offset = getConstantOffset( + getLoadStorePointerOperand(ChainIter->first), + getLoadStorePointerOperand(I), + /*ContextInst=*/ + (ChainIter->first->comesBefore(I) ? I : ChainIter->first)); + if (Offset.has_value()) { + // `Offset` might not have the expected number of bits, if e.g. AS has a + // different number of bits than opaque pointers. + ChainIter->second.push_back(ChainElem{I, Offset.value()}); + // Move ChainIter to the front of the MRU list. + MRU.remove(*ChainIter); + MRU.push_front(*ChainIter); + MatchFound = true; + break; + } } - } - assert(LoadTy && "Can't determine LoadInst type from chain"); - - unsigned Sz = DL.getTypeSizeInBits(LoadTy); - unsigned AS = L0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = L0->getAlign(); - - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - - ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. - unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - VectorType *VecTy; - auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); - if (VecLoadTy) - VecTy = FixedVectorType::get(LoadTy->getScalarType(), - Chain.size() * VecLoadTy->getNumElements()); - else - VecTy = FixedVectorType::get(LoadTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; - } - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the load is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; + if (!MatchFound) { + APInt ZeroOffset(ASPtrBits, 0); + InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I); + E->second.push_back(ChainElem{I, ZeroOffset}); + MRU.push_front(*E); + Chains.insert(E); } - - Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, L0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; } - if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; - } + std::vector<Chain> Ret; + Ret.reserve(Chains.size()); + // Iterate over MRU rather than Chains so the order is deterministic. + for (auto &E : MRU) + if (E.second.size() > 1) + Ret.push_back(std::move(E.second)); + return Ret; +} - LLVM_DEBUG({ - dbgs() << "LSV: Loads to vectorize:\n"; - for (Instruction *I : Chain) - I->dump(); - }); +std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, + Instruction *ContextInst, + unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst + << ", Depth=" << Depth << "\n"); + // We'll ultimately return a value of this bit width, even if computations + // happen in a different width. + unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); + APInt OffsetA(OrigBitWidth, 0); + APInt OffsetB(OrigBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return std::nullopt; - // getVectorizablePrefix already computed getBoundaryInstrs. The value of - // Last may have changed since then, but the value of First won't have. If it - // matters, we could compute getBoundaryInstrs only once and reuse it here. - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*First); - - Value *Bitcast = - Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); - LoadInst *LI = - Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); - propagateMetadata(LI, Chain); - - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *CV = Chain[I]; - Value *V; - if (VecLoadTy) { - // Extract a subvector using shufflevector. - unsigned VecWidth = VecLoadTy->getNumElements(); - auto Mask = - llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth)); - V = Builder.CreateShuffleVector(LI, Mask, CV->getName()); - } else { - V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); - } + // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets + // should properly handle a possible overflow and the value should fit into + // the smallest data type used in the cast/gep chain. + assert(OffsetA.getSignificantBits() <= NewPtrBitWidth && + OffsetB.getSignificantBits() <= NewPtrBitWidth); - if (V->getType() != CV->getType()) { - V = Builder.CreateBitOrPointerCast(V, CV->getType()); + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + if (PtrA == PtrB) + return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth); + + // Try to compute B - A. + const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA)); + if (DistScev != SE.getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n"); + ConstantRange DistRange = SE.getSignedRange(DistScev); + if (DistRange.isSingleElement()) { + // Handle index width (the width of Dist) != pointer width (the width of + // the Offset*s at this point). + APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth); + return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth); } - - // Replace the old instruction. - CV->replaceAllUsesWith(V); } - - // Since we might have opaque pointers we might end up using the pointer - // operand of the first load (wrt. memory loaded) for the vector load. Since - // this first load might not be the first in the block we potentially need to - // reorder the pointer operand (and its operands). If we have a bitcast though - // it might be before the load and should be the reorder start instruction. - // "Might" because for opaque pointers the "bitcast" is just the first loads - // pointer operand, as oppposed to something we inserted at the right position - // ourselves. - Instruction *BCInst = dyn_cast<Instruction>(Bitcast); - reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI); - - eraseInstructions(Chain); - - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed) { - RelativeSpeed = 0; - if (Alignment.value() % SzInBytes == 0) - return false; - - bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), - SzInBytes * 8, AddressSpace, - Alignment, &RelativeSpeed); - LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows - << " with relative speed = " << RelativeSpeed << '\n';); - return !Allows || !RelativeSpeed; + std::optional<APInt> Diff = + getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth); + if (Diff.has_value()) + return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth())) + .sextOrTrunc(OrigBitWidth); + return std::nullopt; } |