author    Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:03:47 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:04:23 +0000
commit    7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree      27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
parent    e3b557809604d036af6e00c60f012c2025b59a5e (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  2049
1 file changed, 1112 insertions(+), 937 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 0b7fc853dc1b..260d7889906b 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -37,13 +37,34 @@
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
+//
+// Overview of the algorithm and terminology in this pass:
+//
+// - Break up each basic block into pseudo-BBs, composed of instructions which
+// are guaranteed to transfer control to their successors.
+// - Within a single pseudo-BB, find all loads, and group them into
+// "equivalence classes" according to getUnderlyingObject() and loaded
+// element size. Do the same for stores.
+// - For each equivalence class, greedily build "chains". Each chain has a
+//   leader instruction, which is the first instr of the chain in BB order;
+//   every other member has a known constant offset from the leader.
+// - Break up chains so that they contain only contiguous accesses of legal
+// size with no intervening may-alias instrs.
+// - Convert each chain to vector instructions.
+//
+// The O(n^2) behavior of this pass comes from initially building the chains.
+// In the worst case we have to compare each new instruction to all of those
+// that came before. To limit this, we only calculate the offset to the leaders
+// of the N most recently-used chains.
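As a standalone illustration of the equivalence-class step above (not part of this patch), here is a minimal sketch in plain STL types; the Access struct and groupIntoEqClasses are hypothetical stand-ins for the pass's actual classes:

    #include <map>
    #include <tuple>
    #include <vector>

    struct Access {               // stand-in for one load or store
      const void *UnderlyingObj;  // models getUnderlyingObject(Ptr)
      unsigned AddrSpace;
      unsigned ElemSizeBits;      // load/store element size, in bits
      bool IsLoad;
    };

    using Key = std::tuple<const void *, unsigned, unsigned, bool>;

    // Accesses with different keys are never merged; each class is later
    // split into chains and vectorized independently.
    std::map<Key, std::vector<Access>>
    groupIntoEqClasses(const std::vector<Access> &PseudoBB) {
      std::map<Key, std::vector<Access>> Classes;
      for (const Access &A : PseudoBB)
        Classes[{A.UnderlyingObj, A.AddrSpace, A.ElemSizeBits, A.IsLoad}]
            .push_back(A);
      return Classes;
    }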
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -57,6 +78,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -67,23 +89,33 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <optional>
#include <tuple>
+#include <type_traits>
#include <utility>
+#include <vector>
using namespace llvm;
@@ -92,21 +124,115 @@ using namespace llvm;
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+namespace {
+
+// Equivalence class key, the initial tuple by which we group loads/stores.
+// Loads/stores with different EqClassKeys are never merged.
+//
+// (We could in theory remove element-size from this tuple. We'd just need
+// to fix up the vector packing/unpacking code.)
+using EqClassKey =
+ std::tuple<const Value * /* result of getUnderlyingObject() */,
+ unsigned /* AddrSpace */,
+ unsigned /* Load/Store element size bits */,
+ char /* IsLoad; char b/c bool can't be a DenseMap key */
+ >;
+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const EqClassKey &K) {
+ const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
+ return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
+ << " of element size " << ElementSize << " bits in addrspace "
+ << AddrSpace;
+}
+
+// A Chain is a set of instructions such that:
+// - All instructions have the same equivalence class, so in particular all are
+// loads, or all are stores.
+// - We know the address accessed by the i'th chain elem relative to the
+// chain's leader instruction, which is the first instr of the chain in BB
+// order.
+//
+// Chains have two canonical orderings:
+// - BB order, sorted by Instr->comesBefore.
+// - Offset order, sorted by OffsetFromLeader.
+// This pass switches back and forth between these orders.
+struct ChainElem {
+ Instruction *Inst;
+ APInt OffsetFromLeader;
+};
+using Chain = SmallVector<ChainElem, 1>;
+
+void sortChainInBBOrder(Chain &C) {
+ sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
+}
+
+void sortChainInOffsetOrder(Chain &C) {
+ sort(C, [](const auto &A, const auto &B) {
+ if (A.OffsetFromLeader != B.OffsetFromLeader)
+ return A.OffsetFromLeader.slt(B.OffsetFromLeader);
+ return A.Inst->comesBefore(B.Inst); // stable tiebreaker
+ });
+}
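A tiny usage example of the two orders (illustrative only, not the pass's code; an int position stands in for the Instruction*):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Elem { int BBPos; long OffsetFromLeader; };

    int main() {
      // Leader first (offset 0 from itself), then accesses at +8 and +4.
      std::vector<Elem> C{{0, 0}, {1, 8}, {2, 4}};
      // Offset order gives {0, 4, 8}; re-sorting by BBPos would restore
      // the BB order {0, 8, 4}.
      std::sort(C.begin(), C.end(), [](const Elem &A, const Elem &B) {
        return A.OffsetFromLeader < B.OffsetFromLeader;
      });
      assert(C[1].OffsetFromLeader == 4 && C[2].OffsetFromLeader == 8);
    }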
+
+[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
+ for (const auto &E : C) {
+ dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
+ }
+}
+
+using EquivalenceClassMap =
+ MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
+
// FIXME: Assuming stack alignment of 4 is always good enough
-static const unsigned StackAdjustedAlignment = 4;
+constexpr unsigned StackAdjustedAlignment = 4;
-namespace {
+Instruction *propagateMetadata(Instruction *I, const Chain &C) {
+ SmallVector<Value *, 8> Values;
+ for (const ChainElem &E : C)
+ Values.push_back(E.Inst);
+ return propagateMetadata(I, Values);
+}
-/// ChainID is an arbitrary token that is allowed to be different only for the
-/// accesses that are guaranteed to be considered non-consecutive by
-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
-/// together and reducing the number of instructions the main search operates on
-/// at a time, i.e. this is to reduce compile time and nothing else as the main
-/// search has O(n^2) time complexity. The underlying type of ChainID should not
-/// be relied upon.
-using ChainID = const Value *;
-using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<ChainID, InstrList>;
+bool isInvariantLoad(const Instruction *I) {
+ const LoadInst *LI = dyn_cast<LoadInst>(I);
+ return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
+}
+
+/// Reorders the instructions that I depends on (the instructions defining its
+/// operands), to ensure they dominate I.
+void reorder(Instruction *I) {
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!IM->comesBefore(I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
+ Instruction *IM = &*(BBI++);
+ if (!InstructionsToMove.count(IM))
+ continue;
+ IM->moveBefore(I);
+ }
+}
class Vectorizer {
Function &F;
@@ -118,6 +244,12 @@ class Vectorizer {
const DataLayout &DL;
IRBuilder<> Builder;
+ // We could erase instrs right after vectorizing them, but that can mess up
+ // our BB iterators, and also can make the equivalence class keys point to
+ // freed memory. This is fixable, but it's simpler just to wait until we're
+ // done with the BB and erase all at once.
+ SmallVector<Instruction *, 128> ToErase;
+
public:
Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -127,70 +259,83 @@ public:
bool run();
private:
- unsigned getPointerAddressSpace(Value *I);
-
static const unsigned MaxDepth = 3;
- bool isConsecutiveAccess(Value *A, Value *B);
- bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth = 0) const;
- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth) const;
- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
- unsigned Depth) const;
-
- /// After vectorization, reorder the instructions that I depends on
- /// (the instructions defining its operands), to ensure they dominate I.
- void reorder(Instruction *I);
-
- /// Returns the first and the last instructions in Chain.
- std::pair<BasicBlock::iterator, BasicBlock::iterator>
- getBoundaryInstrs(ArrayRef<Instruction *> Chain);
-
- /// Erases the original instructions after vectorizing.
- void eraseInstructions(ArrayRef<Instruction *> Chain);
-
- /// "Legalize" the vector type that would be produced by combining \p
- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
- /// expected to have more than 4 elements.
- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
-
- /// Finds the largest prefix of Chain that's vectorizable, checking for
- /// intervening instructions which may affect the memory accessed by the
- /// instructions within Chain.
+ /// Runs the vectorizer on a "pseudo basic block", which is a range of
+ /// instructions [Begin, End) within one BB all of which have
+ /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
+ bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
+
+ /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
+ /// in the same BB with the same value for getUnderlyingObject() etc.
+ bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass);
+
+ /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
+ /// where all instructions access a known, constant offset from the first
+ /// instruction.
+ bool runOnChain(Chain &C);
+
+ /// Splits the chain into subchains of instructions which read/write a
+ /// contiguous block of memory. Discards any length-1 subchains (because
+ /// there's nothing to vectorize in there).
+ std::vector<Chain> splitChainByContiguity(Chain &C);
+
+  /// Splits the chain into subchains where it's safe to hoist loads up to the
+  /// beginning of the sub-chain and it's safe to sink stores down to the end
+  /// of the sub-chain. Discards any length-1 subchains.
+ std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
+
+ /// Splits the chain into subchains that make legal, aligned accesses.
+ /// Discards any length-1 subchains.
+ std::vector<Chain> splitChainByAlignment(Chain &C);
+
+ /// Converts the instrs in the chain into a single vectorized load or store.
+ /// Adds the old scalar loads/stores to ToErase.
+ bool vectorizeChain(Chain &C);
+
+ /// Tries to compute the offset in bytes PtrB - PtrA.
+ std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth = 0);
+ std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth);
+ std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth);
+
+ /// Gets the element type of the vector that the chain will load or store.
+ /// This is nontrivial because the chain may contain elements of different
+ /// types; e.g. it's legal to have a chain that contains both i32 and float.
+ Type *getChainElemTy(const Chain &C);
+
+ /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
+ /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
+ /// instructions.
+ ///
+  /// The map ChainOffsets must contain all of the elements in
+ /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
+ /// address. It's ok if it contains additional entries.
+ template <bool IsLoadChain>
+ bool isSafeToMove(
+ Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+
+ /// Collects loads and stores grouped by "equivalence class", where:
+ /// - all elements in an eq class are a load or all are a store,
+ /// - they all load/store the same element size (it's OK to have e.g. i8 and
+ /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
+ /// - they all have the same value for getUnderlyingObject().
+ EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
+ BasicBlock::iterator End);
+
+ /// Partitions Instrs into "chains" where every instruction has a known
+ /// constant offset from the first instr in the chain.
///
- /// The elements of \p Chain must be all loads or all stores and must be in
- /// address order.
- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
-
- /// Collects load and store instructions to vectorize.
- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
-
- /// Processes the collected instructions, the \p Map. The values of \p Map
- /// should be all loads or all stores.
- bool vectorizeChains(InstrListMap &Map);
-
- /// Finds the load/stores to consecutive memory addresses and vectorizes them.
- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
-
- /// Vectorizes the load instructions in Chain.
- bool
- vectorizeLoadChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Vectorizes the store instructions in Chain.
- bool
- vectorizeStoreChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Check if this load/store access is misaligned accesses.
- /// Returns a \p RelativeSpeed of an operation if allowed suitable to
- /// compare to another result for the same \p AddressSpace and potentially
- /// different \p Alignment and \p SzInBytes.
- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- Align Alignment, unsigned &RelativeSpeed);
+  /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0; the first
+  /// instr in the chain is the leader, and an instr is at distance 0 from
+  /// itself.
+ std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -198,7 +343,8 @@ public:
static char ID;
LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ initializeLoadStoreVectorizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -250,11 +396,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- Vectorizer V(F, AA, AC, DT, SE, TTI);
- return V.run();
+ return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}
-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return PreservedAnalyses::all();
@@ -265,125 +411,681 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- Vectorizer V(F, AA, AC, DT, SE, TTI);
- bool Changed = V.run();
+ bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return Changed ? PA : PreservedAnalyses::all();
}
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
- SmallVector<Value *, 8> VL(IL.begin(), IL.end());
- propagateMetadata(I, VL);
-}
-
-// Vectorizer Implementation
bool Vectorizer::run() {
bool Changed = false;
-
- // Scan the blocks in the function in post order.
+ // Break up the BB if there are any instrs which aren't guaranteed to transfer
+ // execution to their successor.
+ //
+ // Consider, for example:
+ //
+ // def assert_arr_len(int n) { if (n < 2) exit(); }
+ //
+ // load arr[0]
+  //   call assert_arr_len(arr.length)
+ // load arr[1]
+ //
+ // Even though assert_arr_len does not read or write any memory, we can't
+ // speculate the second load before the call. More info at
+ // https://github.com/llvm/llvm-project/issues/52950.
for (BasicBlock *BB : post_order(&F)) {
- InstrListMap LoadRefs, StoreRefs;
- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
- Changed |= vectorizeChains(LoadRefs);
- Changed |= vectorizeChains(StoreRefs);
+ // BB must at least have a terminator.
+ assert(!BB->empty());
+
+ SmallVector<BasicBlock::iterator, 8> Barriers;
+ Barriers.push_back(BB->begin());
+ for (Instruction &I : *BB)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ Barriers.push_back(I.getIterator());
+ Barriers.push_back(BB->end());
+
+ for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
+ ++It)
+ Changed |= runOnPseudoBB(*It, *std::next(It));
+
+ for (Instruction *I : ToErase) {
+ auto *PtrOperand = getLoadStorePointerOperand(I);
+ if (I->use_empty())
+ I->eraseFromParent();
+ RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+ }
+ ToErase.clear();
}
return Changed;
}
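The barrier split above, modeled standalone over indices rather than iterators (all names here are hypothetical; TransfersToSuccessor stands in for isGuaranteedToTransferExecutionToSuccessor):

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct Instr { bool TransfersToSuccessor; };

    // Returns [begin, end) index ranges, one per pseudo-BB. A barrier
    // starts its own range, mirroring Barriers.push_back(I.getIterator()).
    std::vector<std::pair<std::size_t, std::size_t>>
    splitIntoPseudoBBs(const std::vector<Instr> &BB) {
      std::vector<std::size_t> Barriers{0};
      for (std::size_t I = 0; I != BB.size(); ++I)
        if (!BB[I].TransfersToSuccessor)
          Barriers.push_back(I);
      Barriers.push_back(BB.size());
      std::vector<std::pair<std::size_t, std::size_t>> Ranges;
      for (std::size_t I = 0; I + 1 != Barriers.size(); ++I)
        Ranges.push_back({Barriers[I], Barriers[I + 1]});
      return Ranges; // e.g. [load, call, load] -> [0, 1) and [1, 3)
    }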
-unsigned Vectorizer::getPointerAddressSpace(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
+bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
+ BasicBlock::iterator End) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
+ if (End != Begin->getParent()->end())
+ dbgs() << *End;
+ else
+ dbgs() << "<BB end>";
+ dbgs() << ")\n";
+ });
+
+ bool Changed = false;
+ for (const auto &[EqClassKey, EqClass] :
+ collectEquivalenceClasses(Begin, End))
+ Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+
+ return Changed;
}
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getLoadStorePointerOperand(A);
- Value *PtrB = getLoadStorePointerOperand(B);
- unsigned ASA = getPointerAddressSpace(A);
- unsigned ASB = getPointerAddressSpace(B);
+bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass) {
+ bool Changed = false;
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+ << " keyed on " << EqClassKey << ":\n";
+ for (Instruction *I : EqClass)
+ dbgs() << " " << *I << "\n";
+ });
- // Make sure that A and B are different pointers of the same size type.
- Type *PtrATy = getLoadStoreType(A);
- Type *PtrBTy = getLoadStoreType(B);
- if (PtrA == PtrB ||
- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
- DL.getTypeStoreSize(PtrATy->getScalarType()) !=
- DL.getTypeStoreSize(PtrBTy->getScalarType()))
- return false;
+ std::vector<Chain> Chains = gatherChains(EqClass);
+ LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
+ << " nontrivial chains.\n";);
+ for (Chain &C : Chains)
+ Changed |= runOnChain(C);
+ return Changed;
+}
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+bool Vectorizer::runOnChain(Chain &C) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
+ dumpChain(C);
+ });
- return areConsecutivePointers(PtrA, PtrB, Size);
+ // Split up the chain into increasingly smaller chains, until we can finally
+ // vectorize the chains.
+ //
+ // (Don't be scared by the depth of the loop nest here. These operations are
+ // all at worst O(n lg n) in the number of instructions, and splitting chains
+ // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
+ bool Changed = false;
+ for (auto &C : splitChainByMayAliasInstrs(C))
+ for (auto &C : splitChainByContiguity(C))
+ for (auto &C : splitChainByAlignment(C))
+ Changed |= vectorizeChain(C);
+ return Changed;
}
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
- APInt PtrDelta, unsigned Depth) const {
- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
- APInt OffsetA(PtrBitWidth, 0);
- APInt OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
+ if (C.empty())
+ return {};
- unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+ sortChainInBBOrder(C);
- if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
+ dumpChain(C);
+ });
+
+  // We know that elements in the chain with non-overlapping offsets can't
+  // alias, but AA may not be smart enough to figure this out. Record each
+  // element's offset in a hashtable so we can check this ourselves.
+ DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
+ for (const auto &E : C)
+ ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
+
+ // Loads get hoisted up to the first load in the chain. Stores get sunk
+ // down to the last store in the chain. Our algorithm for loads is:
+ //
+ // - Take the first element of the chain. This is the start of a new chain.
+ // - Take the next element of `Chain` and check for may-alias instructions
+ // up to the start of NewChain. If no may-alias instrs, add it to
+ // NewChain. Otherwise, start a new NewChain.
+ //
+ // For stores it's the same except in the reverse direction.
+ //
+  // We expect IsLoad to be a std::bool_constant.
+ auto Impl = [&](auto IsLoad) {
+ // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
+ auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
+ if constexpr (IsLoad())
+ return std::make_pair(C.begin(), C.end());
+ else
+ return std::make_pair(C.rbegin(), C.rend());
+ }(IsLoad);
+ assert(ChainBegin != ChainEnd);
+
+ std::vector<Chain> Chains;
+ SmallVector<ChainElem, 1> NewChain;
+ NewChain.push_back(*ChainBegin);
+ for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
+ if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
+ ChainOffsets)) {
+ LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
+ << *ChainIt->Inst << " into " << *ChainBegin->Inst
+ << "\n");
+ NewChain.push_back(*ChainIt);
+ } else {
+        LLVM_DEBUG(
+            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
+                   << *ChainIt->Inst << " into " << *NewChain.front().Inst
+                   << "\n");
+ if (NewChain.size() > 1) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+ dumpChain(NewChain);
+ });
+ Chains.push_back(std::move(NewChain));
+ }
+
+ // Start a new chain.
+ NewChain = SmallVector<ChainElem, 1>({*ChainIt});
+ }
+ }
+ if (NewChain.size() > 1) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+ dumpChain(NewChain);
+ });
+ Chains.push_back(std::move(NewChain));
+ }
+ return Chains;
+ };
+
+ if (isa<LoadInst>(C[0].Inst))
+ return Impl(/*IsLoad=*/std::bool_constant<true>());
+
+ assert(isa<StoreInst>(C[0].Inst));
+ return Impl(/*IsLoad=*/std::bool_constant<false>());
+}
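The same greedy split as a standalone sketch (elements are assumed to be in BB order, as in the load case above; the MayAlias callback is an assumed oracle standing in for the isSafeToMove scan):

    #include <cstddef>
    #include <functional>
    #include <vector>

    struct Elem { int Id; };
    using MayAliasFn = std::function<bool(const Elem &, const Elem &From)>;

    std::vector<std::vector<Elem>>
    splitByMayAlias(const std::vector<Elem> &C, const MayAliasFn &MayAlias) {
      std::vector<std::vector<Elem>> Chains;
      std::vector<Elem> New{C.front()}; // C must be non-empty
      for (std::size_t I = 1; I != C.size(); ++I) {
        if (!MayAlias(C[I], New.front())) {
          New.push_back(C[I]); // safe to hoist/sink to the subchain start
        } else {
          if (New.size() > 1) // length-1 subchains are discarded
            Chains.push_back(std::move(New));
          New = {C[I]};       // start a new subchain at the blocker
        }
      }
      if (New.size() > 1)
        Chains.push_back(std::move(New));
      return Chains;
    }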
+
+std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
+ if (C.empty())
+ return {};
+
+ sortChainInOffsetOrder(C);
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByContiguity considering chain:\n";
+ dumpChain(C);
+ });
+
+ std::vector<Chain> Ret;
+ Ret.push_back({C.front()});
+
+ for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
+    // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
+ auto &CurChain = Ret.back();
+ const ChainElem &Prev = CurChain.back();
+ unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out "
+                              "by collectEquivalenceClasses");
+ APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
+
+ // Add this instruction to the end of the current chain, or start a new one.
+ bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
+ LLVM_DEBUG(dbgs() << "LSV: Instructions are "
+ << (AreContiguous ? "" : "not ") << "contiguous: "
+ << *Prev.Inst << " (ends at offset " << PrevReadEnd
+ << ") -> " << *It->Inst << " (starts at offset "
+ << It->OffsetFromLeader << ")\n");
+ if (AreContiguous)
+ CurChain.push_back(*It);
+ else
+ Ret.push_back({*It});
+ }
+
+  // Filter out length-1 chains; these are uninteresting.
+ llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
+ return Ret;
+}
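The contiguity rule restated as a standalone function over sorted offsets, simplified to one fixed element size (the real pass reads each element's size from its IR type; std::erase_if requires C++20):

    #include <cstddef>
    #include <vector>

    std::vector<std::vector<long>>
    splitByContiguity(const std::vector<long> &Offsets, long SzBytes) {
      std::vector<std::vector<long>> Ret{{Offsets.front()}};
      for (std::size_t I = 1; I != Offsets.size(); ++I) {
        long PrevReadEnd = Ret.back().back() + SzBytes;
        if (Offsets[I] == PrevReadEnd)      // starts exactly where prev ends
          Ret.back().push_back(Offsets[I]);
        else
          Ret.push_back({Offsets[I]});
      }
      std::erase_if(Ret, [](const auto &C) { return C.size() <= 1; });
      return Ret; // e.g. {0, 4, 8, 16} with SzBytes=4 -> {{0, 4, 8}}
    }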
+
+Type *Vectorizer::getChainElemTy(const Chain &C) {
+ assert(!C.empty());
+ // The rules are:
+ // - If there are any pointer types in the chain, use an integer type.
+ // - Prefer an integer type if it appears in the chain.
+ // - Otherwise, use the first type in the chain.
+ //
+ // The rule about pointer types is a simplification when we merge e.g. a load
+ // of a ptr and a double. There's no direct conversion from a ptr to a
+ // double; it requires a ptrtoint followed by a bitcast.
+ //
+ // It's unclear to me if the other rules have any practical effect, but we do
+ // it to match this pass's previous behavior.
+ if (any_of(C, [](const ChainElem &E) {
+ return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
+ })) {
+ return Type::getIntNTy(
+ F.getContext(),
+ DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
+ }
+
+ for (const ChainElem &E : C)
+ if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
+ return T;
+ return getLoadStoreType(C[0].Inst)->getScalarType();
+}
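The three rules above, restated over a simplified type tag (hypothetical sketch; the real code works with llvm::Type and preserves the bit width):

    #include <vector>

    enum class Kind { Int, Float, Ptr };

    Kind chainElemKind(const std::vector<Kind> &Scalars) { // non-empty
      for (Kind K : Scalars)
        if (K == Kind::Ptr)
          return Kind::Int;   // any pointer forces an integer elem type
      for (Kind K : Scalars)
        if (K == Kind::Int)
          return Kind::Int;   // prefer an integer type if one appears
      return Scalars.front(); // otherwise the first type wins
    }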
+
+std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
+ // We use a simple greedy algorithm.
+ // - Given a chain of length N, find all prefixes that
+ // (a) are not longer than the max register length, and
+ // (b) are a power of 2.
+ // - Starting from the longest prefix, try to create a vector of that length.
+ // - If one of them works, great. Repeat the algorithm on any remaining
+ // elements in the chain.
+ // - If none of them work, discard the first element and repeat on a chain
+ // of length N-1.
+ if (C.empty())
+ return {};
+
+ sortChainInOffsetOrder(C);
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: splitChainByAlignment considering chain:\n";
+ dumpChain(C);
+ });
+
+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+ auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
+ unsigned ChainSizeBytes, VectorType *VecTy) {
+ return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
+ ChainSizeBytes, VecTy)
+ : TTI.getStoreVectorFactor(VF, LoadStoreSize,
+ ChainSizeBytes, VecTy);
+ };
+
+#ifndef NDEBUG
+ for (const auto &E : C) {
+ Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+ assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
+ "Should have filtered out non-power-of-two elements in "
+ "collectEquivalenceClasses.");
+ }
+#endif
+
+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+ unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+
+ std::vector<Chain> Ret;
+ for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
+ // Find candidate chains of size not greater than the largest vector reg.
+ // These chains are over the closed interval [CBegin, CEnd].
+ SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
+ CandidateChains;
+ for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
+ APInt Sz = C[CEnd].OffsetFromLeader +
+ DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
+ C[CBegin].OffsetFromLeader;
+ if (Sz.sgt(VecRegBytes))
+ break;
+ CandidateChains.push_back(
+ {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
+ }
+
+ // Consider the longest chain first.
+ for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
+ It != End; ++It) {
+ auto [CEnd, SizeBytes] = *It;
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
+ << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
+
+ Type *VecElemTy = getChainElemTy(C);
+      // Note, the size of VecElemTy is a power of 2, but it might be less
+      // than one byte. For example, we can vectorize 2 x <2 x i4> to
+      // <4 x i4>, and in this case VecElemTy would be i4.
+ unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
+
+      // SizeBytes and VecElemBits are powers of 2, so VecElemBits divides
+      // 8 * SizeBytes evenly.
+ assert((8 * SizeBytes) % VecElemBits == 0);
+ unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
+ FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
+ unsigned VF = 8 * VecRegBytes / VecElemBits;
+
+ // Check that TTI is happy with this vectorization factor.
+ unsigned TargetVF = getVectorFactor(VF, VecElemBits,
+ VecElemBits * NumVecElems / 8, VecTy);
+ if (TargetVF != VF && TargetVF < NumVecElems) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+ "because TargetVF="
+ << TargetVF << " != VF=" << VF
+ << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
+ continue;
+ }
+
+ // Is a load/store with this alignment allowed by TTI and at least as fast
+ // as an unvectorized load/store?
+ //
+      // TTI and F are passed as explicit captures to work around an MSVC
+      // misparse (??).
+ auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
+ &F = F](Align Alignment) {
+ if (Alignment.value() % SizeBytes == 0)
+ return true;
+ unsigned VectorizedSpeed = 0;
+ bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
+ F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
+ if (!AllowsMisaligned) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " is misaligned, and therefore can't be vectorized.\n");
+ return false;
+ }
+
+ unsigned ElementwiseSpeed = 0;
+ (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
+ Alignment, &ElementwiseSpeed);
+ if (VectorizedSpeed < ElementwiseSpeed) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " has relative speed " << VectorizedSpeed
+ << ", which is lower than the elementwise speed of "
+ << ElementwiseSpeed
+ << ". Therefore this access won't be vectorized.\n");
+ return false;
+ }
+ return true;
+ };
+
+ // If we're loading/storing from an alloca, align it if possible.
+ //
+ // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
+ // tells us this is beneficial. This feels a bit odd, but it matches
+ // existing tests. This isn't *so* bad, because at most we align to 4
+ // bytes (current value of StackAdjustedAlignment).
+ //
+ // FIXME: We will upgrade the alignment of the alloca even if it turns out
+ // we can't vectorize for some other reason.
+ Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
+ bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
+ isa<AllocaInst>(PtrOperand->stripPointerCasts());
+ Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
+ Align PrefAlign = Align(StackAdjustedAlignment);
+ if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
+ IsAllowedAndFast(PrefAlign)) {
+ Align NewAlign = getOrEnforceKnownAlignment(
+ PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
+ if (NewAlign >= Alignment) {
+        LLVM_DEBUG(dbgs()
+                   << "LSV: splitChainByAlignment upgrading alloca alignment "
+                   << "from " << Alignment.value() << " to "
+                   << NewAlign.value() << "\n");
+ Alignment = NewAlign;
+ }
+ }
+
+ if (!IsAllowedAndFast(Alignment)) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+ "because its alignment is not AllowedAndFast: "
+ << Alignment.value() << "\n");
+ continue;
+ }
+
+ if ((IsLoadChain &&
+ !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
+ (!IsLoadChain &&
+ !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
+        LLVM_DEBUG(
+            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+                      "because !isLegalToVectorizeLoad/StoreChain.\n");
+ continue;
+ }
+
+ // Hooray, we can vectorize this chain!
+ Chain &NewChain = Ret.emplace_back();
+ for (unsigned I = CBegin; I <= CEnd; ++I)
+ NewChain.push_back(C[I]);
+ CBegin = CEnd; // Skip over the instructions we've added to the chain.
+ break;
+ }
+ }
+ return Ret;
+}
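The greedy prefix search above, modeled standalone (TryVectorize is an assumed oracle bundling all the TTI legality and alignment checks; Acc carries each element's byte offset and store size; none of these names are the pass's):

    #include <cstddef>
    #include <functional>
    #include <vector>

    struct Acc { long Offset; long Size; };
    using TryFn = std::function<bool(std::size_t Begin, std::size_t End)>;

    void splitByAlignment(const std::vector<Acc> &C, long VecRegBytes,
                          const TryFn &TryVectorize) { // closed ranges
      for (std::size_t CBegin = 0; CBegin < C.size(); ++CBegin) {
        std::vector<std::size_t> Cands; // CEnds whose span fits in a reg
        for (std::size_t CEnd = CBegin + 1; CEnd < C.size(); ++CEnd) {
          long Sz = C[CEnd].Offset + C[CEnd].Size - C[CBegin].Offset;
          if (Sz > VecRegBytes)
            break;
          Cands.push_back(CEnd);
        }
        // Longest candidate first; on success skip past it. On failure the
        // outer ++CBegin drops the first element and we retry.
        for (auto It = Cands.rbegin(); It != Cands.rend(); ++It)
          if (TryVectorize(CBegin, *It)) {
            CBegin = *It;
            break;
          }
      }
    }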
+
+bool Vectorizer::vectorizeChain(Chain &C) {
+ if (C.size() < 2)
return false;
- // In case if we have to shrink the pointer
- // stripAndAccumulateInBoundsConstantOffsets should properly handle a
- // possible overflow and the value should fit into a smallest data type
- // used in the cast/gep chain.
- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
- OffsetB.getMinSignedBits() <= NewPtrBitWidth);
+ sortChainInOffsetOrder(C);
- OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
- OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
- PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+ LLVM_DEBUG({
+ dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
+ dumpChain(C);
+ });
- APInt OffsetDelta = OffsetB - OffsetA;
+ Type *VecElemTy = getChainElemTy(C);
+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+ unsigned ChainBytes = std::accumulate(
+ C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
+ return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+ });
+ assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
+  // The size of VecTy is a power of 2 and at least 1 byte, but VecElemTy may
+  // be smaller than 1 byte (e.g. VecTy == <32 x i1>, whose VecElemTy is i1).
+ Type *VecTy = FixedVectorType::get(
+ VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
+
+ Align Alignment = getLoadStoreAlignment(C[0].Inst);
+ // If this is a load/store of an alloca, we might have upgraded the alloca's
+ // alignment earlier. Get the new alignment.
+ if (AS == DL.getAllocaAddrSpace()) {
+ Alignment = std::max(
+ Alignment,
+ getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
+ MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
+ }
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == PtrDelta;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the pointer delta requested.
- APInt BaseDelta = PtrDelta - OffsetDelta;
-
- // Compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
- const SCEV *C = SE.getConstant(BaseDelta);
- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
- if (X == PtrSCEVB)
+ // All elements of the chain must have the same scalar-type size.
+#ifndef NDEBUG
+ for (const ChainElem &E : C)
+ assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
+ DL.getTypeStoreSize(VecElemTy));
+#endif
+
+ Instruction *VecInst;
+ if (IsLoadChain) {
+ // Loads get hoisted to the location of the first load in the chain. We may
+ // also need to hoist the (transitive) operands of the loads.
+ Builder.SetInsertPoint(
+ std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
+ return A.Inst->comesBefore(B.Inst);
+ })->Inst);
+
+ // Chain is in offset order, so C[0] is the instr with the lowest offset,
+ // i.e. the root of the vector.
+ Value *Bitcast = Builder.CreateBitCast(
+ getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));
+ VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+
+ unsigned VecIdx = 0;
+ for (const ChainElem &E : C) {
+ Instruction *I = E.Inst;
+ Value *V;
+ Type *T = getLoadStoreType(I);
+ if (auto *VT = dyn_cast<FixedVectorType>(T)) {
+ auto Mask = llvm::to_vector<8>(
+ llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
+ V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
+ VecIdx += VT->getNumElements();
+ } else {
+ V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
+ I->getName());
+ ++VecIdx;
+ }
+ if (V->getType() != I->getType())
+ V = Builder.CreateBitOrPointerCast(V, I->getType());
+ I->replaceAllUsesWith(V);
+ }
+
+ // Finally, we need to reorder the instrs in the BB so that the (transitive)
+ // operands of VecInst appear before it. To see why, suppose we have
+ // vectorized the following code:
+ //
+ // ptr1 = gep a, 1
+ // load1 = load i32 ptr1
+ // ptr0 = gep a, 0
+ // load0 = load i32 ptr0
+ //
+ // We will put the vectorized load at the location of the earliest load in
+ // the BB, i.e. load1. We get:
+ //
+ // ptr1 = gep a, 1
+ // loadv = load <2 x i32> ptr0
+ // load0 = extractelement loadv, 0
+ // load1 = extractelement loadv, 1
+ // ptr0 = gep a, 0
+ //
+ // Notice that loadv uses ptr0, which is defined *after* it!
+ reorder(VecInst);
+ } else {
+ // Stores get sunk to the location of the last store in the chain.
+ Builder.SetInsertPoint(
+ std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
+ return A.Inst->comesBefore(B.Inst);
+ })->Inst);
+
+ // Build the vector to store.
+ Value *Vec = PoisonValue::get(VecTy);
+ unsigned VecIdx = 0;
+ auto InsertElem = [&](Value *V) {
+ if (V->getType() != VecElemTy)
+ V = Builder.CreateBitOrPointerCast(V, VecElemTy);
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+ };
+ for (const ChainElem &E : C) {
+ auto I = cast<StoreInst>(E.Inst);
+ if (FixedVectorType *VT =
+ dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
+ for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
+ InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
+ Builder.getInt32(J)));
+ }
+ } else {
+ InsertElem(I->getValueOperand());
+ }
+ }
+
+ // Chain is in offset order, so C[0] is the instr with the lowest offset,
+ // i.e. the root of the vector.
+ VecInst = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),
+ VecTy->getPointerTo(AS)),
+ Alignment);
+ }
+
+ propagateMetadata(VecInst, C);
+
+ for (const ChainElem &E : C)
+ ToErase.push_back(E.Inst);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += C.size();
+ return true;
+}
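Both branches above share one piece of lane bookkeeping: each chain element consumes as many lanes of the wide vector as its own type has elements. As a standalone sketch (hypothetical names):

    #include <vector>

    // Lanes[i]: lanes occupied by chain element i (1 for a scalar,
    // N for a <N x T> element).
    std::vector<unsigned> laneStarts(const std::vector<unsigned> &Lanes) {
      std::vector<unsigned> Starts;
      unsigned VecIdx = 0;
      for (unsigned N : Lanes) {
        Starts.push_back(VecIdx); // extract/insert begins at this lane
        VecIdx += N;
      }
      return Starts; // e.g. {1, 2, 1} -> {0, 1, 3}
    }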
+
+template <bool IsLoadChain>
+bool Vectorizer::isSafeToMove(
+ Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
+ LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
+ << *ChainBegin << ")\n");
+
+ assert(isa<LoadInst>(ChainElem) == IsLoadChain);
+ if (ChainElem == ChainBegin)
return true;
- // The above check will not catch the cases where one of the pointers is
- // factorized but the other one is not, such as (C + (S * (A + B))) vs
- // (AS + BS). Get the minus scev. That will allow re-combining the expresions
- // and getting the simplified difference.
- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
- if (C == Dist)
+ // Invariant loads can always be reordered; by definition they are not
+ // clobbered by stores.
+ if (isInvariantLoad(ChainElem))
return true;
- // Sometimes even this doesn't work, because SCEV can't always see through
- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
- // things the hard way.
- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+ auto BBIt = std::next([&] {
+ if constexpr (IsLoadChain)
+ return BasicBlock::reverse_iterator(ChainElem);
+ else
+ return BasicBlock::iterator(ChainElem);
+ }());
+ auto BBItEnd = std::next([&] {
+ if constexpr (IsLoadChain)
+ return BasicBlock::reverse_iterator(ChainBegin);
+ else
+ return BasicBlock::iterator(ChainBegin);
+ }());
+
+ const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
+ const unsigned ChainElemSize =
+ DL.getTypeStoreSize(getLoadStoreType(ChainElem));
+
+ for (; BBIt != BBItEnd; ++BBIt) {
+ Instruction *I = &*BBIt;
+
+ if (!I->mayReadOrWriteMemory())
+ continue;
+
+ // Loads can be reordered with other loads.
+ if (IsLoadChain && isa<LoadInst>(I))
+ continue;
+
+ // Stores can be sunk below invariant loads.
+ if (!IsLoadChain && isInvariantLoad(I))
+ continue;
+
+    // If I is in the chain, we can tell whether it aliases ChainElem by
+    // checking what offset I accesses. This may be better than AA is able to
+    // do.
+ //
+ // We should really only have duplicate offsets for stores (the duplicate
+ // loads should be CSE'ed), but in case we have a duplicate load, we'll
+ // split the chain so we don't have to handle this case specially.
+ if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
+ // I and ChainElem overlap if:
+ // - I and ChainElem have the same offset, OR
+ // - I's offset is less than ChainElem's, but I touches past the
+ // beginning of ChainElem, OR
+ // - ChainElem's offset is less than I's, but ChainElem touches past the
+ // beginning of I.
+ const APInt &IOffset = OffsetIt->second;
+ unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
+ if (IOffset == ChainElemOffset ||
+ (IOffset.sle(ChainElemOffset) &&
+ (IOffset + IElemSize).sgt(ChainElemOffset)) ||
+ (ChainElemOffset.sle(IOffset) &&
+          (ChainElemOffset + ChainElemSize).sgt(IOffset))) {
+ LLVM_DEBUG({
+ // Double check that AA also sees this alias. If not, we probably
+ // have a bug.
+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+ assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
+ dbgs() << "LSV: Found alias in chain: " << *I << "\n";
+ });
+ return false; // We found an aliasing instruction; bail.
+ }
+
+ continue; // We're confident there's no alias.
+ }
+
+ LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+ if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
+ LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
+ << " Aliasing instruction:\n"
+ << " " << *I << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainElem << '\n'
+ << " " << *getLoadStorePointerOperand(ChainElem)
+ << '\n');
+
+ return false;
+ }
+ }
+ return true;
}
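The three overlap cases in the hashtable check reduce to a closed-open interval intersection; as a standalone predicate (plain signed types in place of APInt):

    // True iff [OffA, OffA+SizeA) and [OffB, OffB+SizeB) intersect.
    bool accessesOverlap(long OffA, long SizeA, long OffB, long SizeB) {
      return OffA == OffB ||
             (OffA < OffB && OffA + SizeA > OffB) || // A runs past B's start
             (OffB < OffA && OffB + SizeB > OffA);   // B runs past A's start
    }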
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
@@ -395,10 +1097,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) {
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
unsigned MatchingOpIdxA, Instruction *AddOpB,
unsigned MatchingOpIdxB, bool Signed) {
- // If both OpA and OpB is an add with NSW/NUW and with
- // one of the operands being the same, we can guarantee that the
- // transformation is safe if we can prove that OpA won't overflow when
- // IdxDiff added to the other operand of OpA.
+ LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
+ << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
+ << MatchingOpIdxA << ", AddOpB=" << *AddOpB
+ << ", MatchingOpIdxB=" << MatchingOpIdxB
+ << ", Signed=" << Signed << "\n");
+  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
+  // being the same, we can guarantee that the transformation is safe if we
+  // can prove that OpA won't overflow when IdxDiff is added to the other
+  // operand of OpA.
// For example:
// %tmp7 = add nsw i32 %tmp2, %v0
// %tmp8 = sext i32 %tmp7 to i64
@@ -407,10 +1113,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
// %tmp12 = add nsw i32 %tmp2, %tmp11
// %tmp13 = sext i32 %tmp12 to i64
//
- // Both %tmp7 and %tmp2 has the nsw flag and the first operand
- // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
- // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
- // nsw flag.
+ // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
+ // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
+ // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
assert(AddOpA->getOpcode() == Instruction::Add &&
AddOpB->getOpcode() == Instruction::Add &&
checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
@@ -461,24 +1166,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
return false;
}
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
- APInt PtrDelta,
- unsigned Depth) const {
+std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
+ << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
+ << " Depth=" << Depth << "\n");
auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
if (!GEPA || !GEPB)
- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+ return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
// Look through GEPs after checking they're the same except for the last
// index.
if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
GEPA->getPointerOperand() != GEPB->getPointerOperand())
- return false;
+ return std::nullopt;
gep_type_iterator GTIA = gep_type_begin(GEPA);
gep_type_iterator GTIB = gep_type_begin(GEPB);
for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
if (GTIA.getOperand() != GTIB.getOperand())
- return false;
+ return std::nullopt;
++GTIA;
++GTIB;
}
@@ -487,23 +1194,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
- return false;
+ return std::nullopt;
- if (PtrDelta.isNegative()) {
- if (PtrDelta.isMinSignedValue())
- return false;
- PtrDelta.negate();
- std::swap(OpA, OpB);
- }
uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
- if (PtrDelta.urem(Stride) != 0)
- return false;
- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
- APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
- return false;
+ return std::nullopt;
bool Signed = isa<SExtInst>(OpA);
@@ -511,7 +1208,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
if (!OpB || ValA->getType() != OpB->getType())
- return false;
+ return std::nullopt;
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+ if (IdxDiffSCEV == SE.getCouldNotCompute())
+ return std::nullopt;
+
+ ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
+ if (!IdxDiffRange.isSingleElement())
+ return std::nullopt;
+ APInt IdxDiff = *IdxDiffRange.getSingleElement();
+
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
+ << "\n");
// Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
@@ -530,10 +1241,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
checkNoWrapFlags(OpB, Signed)) {
- // In the checks below a matching operand in OpA and OpB is
- // an operand which is the same in those two instructions.
- // Below we account for possible orders of the operands of
- // these add instructions.
+ // In the checks below a matching operand in OpA and OpB is an operand which
+ // is the same in those two instructions. Below we account for possible
+ // orders of the operands of these add instructions.
for (unsigned MatchingOpIdxA : {0, 1})
for (unsigned MatchingOpIdxB : {0, 1})
if (!Safe)
@@ -544,802 +1254,267 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Third attempt:
- // If all set bits of IdxDiff or any higher order bit other than the sign bit
- // are known to be zero in ValA, we can add Diff to it while guaranteeing no
- // overflow of any sort.
+ //
+  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
+  // order bit other than the sign bit are known to be zero in ValA, we can
+  // add IdxDiff to it while guaranteeing no overflow of any sort.
+  //
+  // If IdxDiff is negative, do the same, but swap ValA and OpB.
if (!Safe) {
+    // When computing known bits, use ContextInst as the context instruction,
+    // since it's likely in the same BB as the loads/stores being merged.
KnownBits Known(BitWidth);
- computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
+ computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
+ ContextInst, &DT);
APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
if (Signed)
BitsAllowedToBeSet.clearBit(BitWidth - 1);
- if (BitsAllowedToBeSet.ult(IdxDiff))
- return false;
+ if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
+ return std::nullopt;
+ Safe = true;
}
- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
- return X == OffsetSCEVB;
+ if (Safe)
+ return IdxDiff * Stride;
+ return std::nullopt;
}
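A worked instance of the known-bits argument in the third attempt, in plain C++ (the mask values are illustrative): if the upper 24 bits of ValA are known zero, then any IdxDiff whose set bits all fall inside that known-zero mask can be added without wrapping:

    #include <cassert>
    #include <cstdint>

    uint32_t addCannotWrap(uint32_t X, uint32_t IdxDiff) {
      uint32_t ValA = X & 0xFF;       // upper 24 bits known zero
      assert(IdxDiff <= 0xFFFFFF00u); // set bits fit in the known-zero mask
      // Max sum is 0xFF + 0xFFFFFF00 = 0xFFFFFFFF, so no unsigned wrap.
      return ValA + IdxDiff;
    }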
-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
- const APInt &PtrDelta,
- unsigned Depth) const {
+std::optional<APInt> Vectorizer::getConstantOffsetSelects(
+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
if (Depth++ == MaxDepth)
- return false;
+ return std::nullopt;
if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
- return SelectA->getCondition() == SelectB->getCondition() &&
- areConsecutivePointers(SelectA->getTrueValue(),
- SelectB->getTrueValue(), PtrDelta, Depth) &&
- areConsecutivePointers(SelectA->getFalseValue(),
- SelectB->getFalseValue(), PtrDelta, Depth);
+ if (SelectA->getCondition() != SelectB->getCondition())
+ return std::nullopt;
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
+ << ", PtrB=" << *PtrB << ", ContextInst="
+ << *ContextInst << ", Depth=" << Depth << "\n");
+ std::optional<APInt> TrueDiff = getConstantOffset(
+ SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
+ if (!TrueDiff.has_value())
+ return std::nullopt;
+ std::optional<APInt> FalseDiff =
+ getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
+ ContextInst, Depth);
+ if (TrueDiff == FalseDiff)
+ return TrueDiff;
}
}
- return false;
+ return std::nullopt;
}
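The select rule above as a standalone sketch: two selects are at a constant distance only if they share a condition and both arms agree on the difference (Sel and its fields are hypothetical stand-ins):

    #include <optional>

    struct Sel { int Cond; long TrueOff; long FalseOff; };

    std::optional<long> selectOffset(const Sel &A, const Sel &B) {
      if (A.Cond != B.Cond)
        return std::nullopt;
      long TrueDiff = B.TrueOff - A.TrueOff;    // diff between true arms
      long FalseDiff = B.FalseOff - A.FalseOff; // diff between false arms
      if (TrueDiff == FalseDiff)
        return TrueDiff; // same answer whichever way the selects resolve
      return std::nullopt;
    }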
-void Vectorizer::reorder(Instruction *I) {
- SmallPtrSet<Instruction *, 16> InstructionsToMove;
- SmallVector<Instruction *, 16> Worklist;
-
- Worklist.push_back(I);
- while (!Worklist.empty()) {
- Instruction *IW = Worklist.pop_back_val();
- int NumOperands = IW->getNumOperands();
- for (int i = 0; i < NumOperands; i++) {
- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
- if (!IM || IM->getOpcode() == Instruction::PHI)
- continue;
-
- // If IM is in another BB, no need to move it, because this pass only
- // vectorizes instructions within one BB.
- if (IM->getParent() != I->getParent())
- continue;
-
- if (!IM->comesBefore(I)) {
- InstructionsToMove.insert(IM);
- Worklist.push_back(IM);
- }
+EquivalenceClassMap
+Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
+ BasicBlock::iterator End) {
+ EquivalenceClassMap Ret;
+
+ auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
+ const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+      // The selects themselves are distinct instructions even if they share
+      // the same condition and evaluate to consecutive pointers for the true
+      // and false values of the condition. Therefore, using the selects
+      // themselves for grouping instructions would put consecutive accesses
+      // into different lists; they would never even be checked for being
+      // consecutive, and so would not be vectorized.
+ return Sel->getCondition();
}
- }
+ return ObjPtr;
+ };
- // All instructions to move should follow I. Start from I, not from begin().
- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
- ++BBI) {
- if (!InstructionsToMove.count(&*BBI))
+ for (Instruction &I : make_range(Begin, End)) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!LI && !SI)
continue;
- Instruction *IM = &*BBI;
- --BBI;
- IM->removeFromParent();
- IM->insertBefore(I);
- }
-}
-
-std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
- Instruction *C0 = Chain[0];
- BasicBlock::iterator FirstInstr = C0->getIterator();
- BasicBlock::iterator LastInstr = C0->getIterator();
- BasicBlock *BB = C0->getParent();
- unsigned NumFound = 0;
- for (Instruction &I : *BB) {
- if (!is_contained(Chain, &I))
+ if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
continue;
- ++NumFound;
- if (NumFound == 1) {
- FirstInstr = I.getIterator();
- }
- if (NumFound == Chain.size()) {
- LastInstr = I.getIterator();
- break;
- }
- }
-
- // Range is [first, last).
- return std::make_pair(FirstInstr, ++LastInstr);
-}
-
-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
- SmallVector<Instruction *, 16> Instrs;
- for (Instruction *I : Chain) {
- Value *PtrOperand = getLoadStorePointerOperand(I);
- assert(PtrOperand && "Instruction must have a pointer operand.");
- Instrs.push_back(I);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
- Instrs.push_back(GEP);
- }
-
- // Erase instructions.
- for (Instruction *I : Instrs)
- if (I->use_empty())
- I->eraseFromParent();
-}
-
-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
- unsigned ElementSizeBits) {
- unsigned ElementSizeBytes = ElementSizeBits / 8;
- unsigned SizeBytes = ElementSizeBytes * Chain.size();
- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size()) {
- if ((NumLeft & 1) == 0)
- NumLeft /= 2; // Split even in half
- else
- --NumLeft; // Split off last element
- } else if (NumLeft == 0)
- NumLeft = 1;
- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
-}
-
-ArrayRef<Instruction *>
-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
- // These are in BB order, unlike Chain, which is in address order.
- SmallVector<Instruction *, 16> MemoryInstrs;
- SmallVector<Instruction *, 16> ChainInstrs;
-
- bool IsLoadChain = isa<LoadInst>(Chain[0]);
- LLVM_DEBUG({
- for (Instruction *I : Chain) {
- if (IsLoadChain)
- assert(isa<LoadInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- else
- assert(isa<StoreInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- }
- });
-
- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
- if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
- ChainInstrs.push_back(&I);
+ if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
+ (SI && !TTI.isLegalToVectorizeStore(SI)))
continue;
- }
- if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
- LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
- << I << '\n');
- break;
- }
- if (I.mayReadOrWriteMemory())
- MemoryInstrs.push_back(&I);
- }
-
- // Loop until we find an instruction in ChainInstrs that we can't vectorize.
- unsigned ChainInstrIdx = 0;
- Instruction *BarrierMemoryInstr = nullptr;
-
- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-
- // If a barrier memory instruction was found, chain instructions that follow
- // will not be added to the valid prefix.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
- break;
- // Check (in BB order) if any instruction prevents ChainInstr from being
- // vectorized. Find and store the first such "conflicting" instruction.
- for (Instruction *MemInstr : MemoryInstrs) {
- // If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
- break;
-
- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
- if (MemLoad && ChainLoad)
- continue;
-
- // We can ignore the alias if we have a load/store pair and the load
- // is known to be invariant: the load cannot be clobbered by the store.
- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- };
-
- if (IsLoadChain) {
- // We can ignore the alias as long as the load comes before the store,
- // because that means we won't be moving the load past the store to
- // vectorize it (the vectorized load is inserted at the location of the
- // first load in the chain).
- if (ChainInstr->comesBefore(MemInstr) ||
- (ChainLoad && IsInvariantLoad(ChainLoad)))
- continue;
- } else {
- // Same case, but in reverse.
- if (MemInstr->comesBefore(ChainInstr) ||
- (MemLoad && IsInvariantLoad(MemLoad)))
- continue;
- }
-
- ModRefInfo MR =
- AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
- if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
- LLVM_DEBUG({
- dbgs() << "LSV: Found alias:\n"
- " Aliasing instruction:\n"
- << " " << *MemInstr << '\n'
- << " Aliased instruction and pointer:\n"
- << " " << *ChainInstr << '\n'
- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
- });
- // Save this aliasing memory instruction as a barrier, but allow other
- // instructions that precede the barrier to be vectorized with this one.
- BarrierMemoryInstr = MemInstr;
- break;
- }
- }
- // Continue the search only for store chains, since vectorizing stores that
- // precede an aliasing load is valid. Conversely, vectorizing loads is valid
- // up to an aliasing store, but should not pull loads from further down in
- // the basic block.
- if (IsLoadChain && BarrierMemoryInstr) {
- // The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(BarrierMemoryInstr->comesBefore(ChainInstr));
- break;
- }
- }
-
- // Find the largest prefix of Chain whose elements are all in
- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
- // order.)
- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
- unsigned ChainIdx = 0;
- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
- break;
- }
- return Chain.slice(0, ChainIdx);
-}
-
-static ChainID getChainID(const Value *Ptr) {
- const Value *ObjPtr = getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The selects themselves are distinct instructions even if they share the
- // same condition and evaluate to consecutive pointers for true and false
- // values of the condition. Therefore using the selects themselves for
- // grouping instructions would put consecutive accesses into different lists
- // and they won't even be checked for being consecutive, and won't be
- // vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
-}
-
-std::pair<InstrListMap, InstrListMap>
-Vectorizer::collectInstructions(BasicBlock *BB) {
- InstrListMap LoadRefs;
- InstrListMap StoreRefs;
-
- for (Instruction &I : *BB) {
- if (!I.mayReadOrWriteMemory())
+ Type *Ty = getLoadStoreType(&I);
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeLoad(LI))
- continue;
-
- Type *Ty = LI->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions currently use an integer type for the vectorized load/store,
- // and do not support casting between the integer type and a vector of
- // pointers (e.g. i64 to <2 x i16*>).
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- Value *Ptr = LI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Save the load locations.
- const ChainID ID = getChainID(Ptr);
- LoadRefs[ID].push_back(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeStore(SI))
- continue;
-
- Type *Ty = SI->getValueOperand()->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions currently use an integer type for the vectorized load/store,
- // and do not support casting between the integer type and a vector of
- // pointers (e.g. i64 to <2 x i16*>).
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- Value *Ptr = SI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Save store location.
- const ChainID ID = getChainID(Ptr);
- StoreRefs[ID].push_back(SI);
- }
- }
-
- return {LoadRefs, StoreRefs};
-}
-
-bool Vectorizer::vectorizeChains(InstrListMap &Map) {
- bool Changed = false;
-
- for (const std::pair<ChainID, InstrList> &Chain : Map) {
- unsigned Size = Chain.second.size();
- if (Size < 2)
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
continue;
- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
-
- // Process the instructions in each chain in chunks of 64.
- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
- unsigned Len = std::min<unsigned>(CE - CI, 64);
- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
- Changed |= vectorizeInstructions(Chunk);
- }
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
- << " instructions.\n");
- SmallVector<int, 16> Heads, Tails;
- int ConsecutiveChain[64];
-
- // Do a quadratic search on all of the given loads/stores and find all of the
- // pairs of loads/stores that follow each other.
- for (int i = 0, e = Instrs.size(); i < e; ++i) {
- ConsecutiveChain[i] = -1;
- for (int j = e - 1; j >= 0; --j) {
- if (i == j)
- continue;
-
- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
- if (ConsecutiveChain[i] != -1) {
- int CurDistance = std::abs(ConsecutiveChain[i] - i);
- int NewDistance = std::abs(ConsecutiveChain[i] - j);
- if (j < i || NewDistance > CurDistance)
- continue; // Should not insert.
- }
+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+ // functions currently use an integer type for the vectorized load/store,
+ // and do not support casting between the integer type and a vector of
+ // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
- Tails.push_back(j);
- Heads.push_back(i);
- ConsecutiveChain[i] = j;
- }
- }
- }
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- bool Changed = false;
- SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
- for (int Head : Heads) {
- if (InstructionsProcessed.count(Instrs[Head]))
+ // Only handle power-of-two sized elements.
+ if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
+ (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
continue;
- bool LongerChainExists = false;
- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
- if (Head == Tails[TIt] &&
- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
- LongerChainExists = true;
- break;
- }
- if (LongerChainExists)
- continue;
-
- // We found an instr that starts a chain. Now follow the chain and try to
- // vectorize it.
- SmallVector<Instruction *, 16> Operands;
- int I = Head;
- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
- if (InstructionsProcessed.count(Instrs[I]))
- break;
-
- Operands.push_back(Instrs[I]);
- I = ConsecutiveChain[I];
- }
- bool Vectorized = false;
- if (isa<LoadInst>(*Operands.begin()))
- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
- else
- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
- Changed |= Vectorized;
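+ // E.g. two simple i32 loads whose pointers trace back to the same alloca in
+ // addrspace(0) share the key {%alloca, /*AS=*/0, /*ElementBits=*/32,
+ // /*IsLoad=*/true} and so become candidates for the same chain.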
+ Ret[{getUnderlyingObject(Ptr), AS,
+ DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
+ /*IsLoad=*/LI != nullptr}]
+ .push_back(&I);
}
- return Changed;
+ return Ret;
}
-bool Vectorizer::vectorizeStoreChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- StoreInst *S0 = cast<StoreInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole store.
- Type *StoreTy = nullptr;
- for (Instruction *I : Chain) {
- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
- if (StoreTy->isIntOrIntVectorTy())
- break;
-
- if (StoreTy->isPtrOrPtrVectorTy()) {
- StoreTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(StoreTy));
- break;
- }
- }
- assert(StoreTy && "Failed to find store type");
+std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
+ if (Instrs.empty())
+ return {};
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned AS = S0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = S0->getAlign();
+ unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
+ unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
+#ifndef NDEBUG
+ // Check that Instrs is in BB order and all have the same addr space.
+ for (size_t I = 1; I < Instrs.size(); ++I) {
+ assert(Instrs[I - 1]->comesBefore(Instrs[I]));
+ assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
}
+#endif
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
-
- FixedVectorType *VecTy;
- auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
- if (VecStoreTy)
- VecTy = FixedVectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
- else
- VecTy = FixedVectorType::get(StoreTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- bool Vectorized = false;
- Vectorized |=
- vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
- Vectorized |=
- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
- return Vectorized;
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Stores to vectorize:\n";
- for (Instruction *I : Chain)
- dbgs() << " " << *I << "\n";
- });
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the store is going to be misaligned, don't vectorize it.
- unsigned RelativeSpeed;
- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- unsigned SpeedBefore;
- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
- if (SpeedBefore > RelativeSpeed)
- return false;
-
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
- return Vectorized;
+ // Machinery to build an MRU-hashtable of Chains.
+ //
+ // (Ideally this could be done with MapVector, but as currently implemented,
+ // moving an element to the front of a MapVector is O(n).)
+ struct InstrListElem : ilist_node<InstrListElem>,
+ std::pair<Instruction *, Chain> {
+ explicit InstrListElem(Instruction *I)
+ : std::pair<Instruction *, Chain>(I, {}) {}
+ };
+ struct InstrListElemDenseMapInfo {
+ using PtrInfo = DenseMapInfo<InstrListElem *>;
+ using IInfo = DenseMapInfo<Instruction *>;
+ static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
+ static InstrListElem *getTombstoneKey() {
+ return PtrInfo::getTombstoneKey();
}
-
- Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, S0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
- return Vectorized;
- }
-
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*Last);
-
- Value *Vec = PoisonValue::get(VecTy);
-
- if (VecStoreTy) {
- unsigned VecWidth = VecStoreTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
- unsigned NewIdx = J + I * VecWidth;
- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
- Builder.getInt32(J));
- if (Extract->getType() != StoreTy->getScalarType())
- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
- Vec = Insert;
- }
+ static unsigned getHashValue(const InstrListElem *E) {
+ return IInfo::getHashValue(E->first);
}
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- Value *Extract = Store->getValueOperand();
- if (Extract->getType() != StoreTy->getScalarType())
- Extract =
- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
- Vec = Insert;
+ static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
+ if (A == getEmptyKey() || B == getEmptyKey())
+ return A == getEmptyKey() && B == getEmptyKey();
+ if (A == getTombstoneKey() || B == getTombstoneKey())
+ return A == getTombstoneKey() && B == getTombstoneKey();
+ return IInfo::isEqual(A->first, B->first);
}
- }
-
- StoreInst *SI = Builder.CreateAlignedStore(
- Vec,
- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
- Alignment);
- propagateMetadata(SI, Chain);
-
- eraseInstructions(Chain);
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::vectorizeLoadChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- LoadInst *L0 = cast<LoadInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole load.
- Type *LoadTy = nullptr;
- for (const auto &V : Chain) {
- LoadTy = cast<LoadInst>(V)->getType();
- if (LoadTy->isIntOrIntVectorTy())
- break;
-
- if (LoadTy->isPtrOrPtrVectorTy()) {
- LoadTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(LoadTy));
- break;
+ };
+ SpecificBumpPtrAllocator<InstrListElem> Allocator;
+ simple_ilist<InstrListElem> MRU;
+ DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
+
+ // Compare each instruction in `Instrs` to the leaders of the N most
+ // recently-used chains. This limits the O(n^2) behavior of this pass while
+ // also allowing us to build arbitrarily long chains.
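+ // Sketch of the scheme built below: each InstrListElem lives in both
+ // containers at once. MRU (an intrusive list) keeps the chains in recency
+ // order so that move-to-front is O(1), while Chains (a DenseSet keyed on the
+ // leader instruction) tracks the set of distinct chains.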
+ for (Instruction *I : Instrs) {
+ constexpr int MaxChainsToTry = 64;
+
+ bool MatchFound = false;
+ auto ChainIter = MRU.begin();
+ for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
+ ++J, ++ChainIter) {
+ std::optional<APInt> Offset = getConstantOffset(
+ getLoadStorePointerOperand(ChainIter->first),
+ getLoadStorePointerOperand(I),
+ /*ContextInst=*/
+ (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
+ if (Offset.has_value()) {
+ // `Offset` might not have the expected number of bits, e.g. if the index
+ // size of AS differs from the pointer size.
+ ChainIter->second.push_back(ChainElem{I, Offset.value()});
+ // Move ChainIter to the front of the MRU list.
+ MRU.remove(*ChainIter);
+ MRU.push_front(*ChainIter);
+ MatchFound = true;
+ break;
+ }
}
- }
- assert(LoadTy && "Can't determine LoadInst type from chain");
-
- unsigned Sz = DL.getTypeSizeInBits(LoadTy);
- unsigned AS = L0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = L0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
- VectorType *VecTy;
- auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
- if (VecLoadTy)
- VecTy = FixedVectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
- else
- VecTy = FixedVectorType::get(LoadTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- bool Vectorized = false;
- Vectorized |=
- vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
- Vectorized |=
- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
- return Vectorized;
- }
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the load is going to be misaligned, don't vectorize it.
- unsigned RelativeSpeed;
- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- unsigned SpeedBefore;
- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
- if (SpeedBefore > RelativeSpeed)
- return false;
-
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
- return Vectorized;
+ if (!MatchFound) {
+ APInt ZeroOffset(ASPtrBits, 0);
+ InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
+ E->second.push_back(ChainElem{I, ZeroOffset});
+ MRU.push_front(*E);
+ Chains.insert(E);
}
-
- Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, L0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
}
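+ // At this point every instruction belongs to exactly one chain, recorded
+ // with its constant byte offset from that chain's leader (the leader itself
+ // at offset 0); e.g. a chain over four consecutive i32 accesses carries
+ // offsets {0, 4, 8, 12}.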
- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- bool Vectorized = false;
- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
- return Vectorized;
- }
+ std::vector<Chain> Ret;
+ Ret.reserve(Chains.size());
+ // Iterate over MRU rather than Chains so the order is deterministic.
+ for (auto &E : MRU)
+ if (E.second.size() > 1)
+ Ret.push_back(std::move(E.second));
+ return Ret;
+}
- LLVM_DEBUG({
- dbgs() << "LSV: Loads to vectorize:\n";
- for (Instruction *I : Chain)
- I->dump();
- });
+std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
+ Instruction *ContextInst,
+ unsigned Depth) {
+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
+ << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
+ << ", Depth=" << Depth << "\n");
+ // We'll ultimately return a value of this bit width, even if computations
+ // happen in a different width.
+ unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(OrigBitWidth, 0);
+ APInt OffsetB(OrigBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
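+ // For instance (hypothetical IR), if PtrA is `gep inbounds (i8, ptr %p, i64
+ // 4)` and PtrB is `gep inbounds (i8, ptr %p, i64 12)`, both strip to %p with
+ // OffsetA = 4 and OffsetB = 12, and the PtrA == PtrB fast path below
+ // returns 12 - 4 = 8.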
+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ return std::nullopt;
- // getVectorizablePrefix already computed getBoundaryInstrs. The value of
- // Last may have changed since then, but the value of First won't have. If it
- // matters, we could compute getBoundaryInstrs only once and reuse it here.
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*First);
-
- Value *Bitcast =
- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI =
- Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
- propagateMetadata(LI, Chain);
-
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V;
- if (VecLoadTy) {
- // Extract a subvector using shufflevector.
- unsigned VecWidth = VecLoadTy->getNumElements();
- auto Mask =
- llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
- V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
- } else {
- V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- }
+ // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
+ // should properly handle a possible overflow and the value should fit into
+ // the smallest data type used in the cast/gep chain.
+ assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
+ OffsetB.getSignificantBits() <= NewPtrBitWidth);
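+ // E.g. if the cast/gep chain (hypothetically) passes through a cast from a
+ // 64-bit pointer into a 32-bit address space, NewPtrBitWidth is 32 and the
+ // offsets accumulated along the way must fit in 32 significant bits.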
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+ if (PtrA == PtrB)
+ return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
+
+ // Try to compute B - A.
+ const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
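+ // SCEV can prove a constant distance even when neither pointer strips to a
+ // common base, e.g. (hypothetical IR):
+ //   %pA = getelementptr i32, ptr %base, i64 %i
+ //   %j = add i64 %i, 2
+ //   %pB = getelementptr i32, ptr %base, i64 %j
+ // Here PtrB - PtrA is the singleton SCEV constant 8 (two i32 elements).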
+ if (DistScev != SE.getCouldNotCompute()) {
+ LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
+ ConstantRange DistRange = SE.getSignedRange(DistScev);
+ if (DistRange.isSingleElement()) {
+ // Handle index width (the width of Dist) != pointer width (the width of
+ // the Offset*s at this point).
+ APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
+ return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
}
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
}
-
- // Since we might have opaque pointers we might end up using the pointer
- // operand of the first load (wrt. memory loaded) for the vector load. Since
- // this first load might not be the first in the block we potentially need to
- // reorder the pointer operand (and its operands). If we have a bitcast though
- // it might be before the load and should be the reorder start instruction.
- // "Might" because for opaque pointers the "bitcast" is just the first loads
- // pointer operand, as oppposed to something we inserted at the right position
- // ourselves.
- Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
- reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
-
- eraseInstructions(Chain);
-
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- Align Alignment, unsigned &RelativeSpeed) {
- RelativeSpeed = 0;
- if (Alignment.value() % SzInBytes == 0)
- return false;
-
- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
- SzInBytes * 8, AddressSpace,
- Alignment, &RelativeSpeed);
- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " with relative speed = " << RelativeSpeed << '\n';);
- return !Allows || !RelativeSpeed;
+ std::optional<APInt> Diff =
+ getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
+ if (Diff.has_value())
+ return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
+ .sextOrTrunc(OrigBitWidth);
+ return std::nullopt;
}