src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2023-07-26 19:03:47 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2023-07-26 19:04:23 +0000
commit	7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree	27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
parent	e3b557809604d036af6e00c60f012c2025b59a5e (diff)
download	src-7fa27ce4a07f19b07799a767fc29416f3b625afb.tar.gz src-7fa27ce4a07f19b07799a767fc29416f3b625afb.zip

vendor/llvm-project/llvmorg-17-init-19304-gd0b54bb50e51

Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp')

-rw-r--r--

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

2049

1 files changed, 1112 insertions, 937 deletions

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 0b7fc853dc1b..260d7889906b 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

@@ -37,13 +37,34 @@

// multiple scalar registers, similar to a GPU vectorized load. In theory ARM

// could use this pass (with some modifications), but currently it implements

// its own pass to do something similar to what we do here.

+//

+// Overview of the algorithm and terminology in this pass:

+//

+// - Break up each basic block into pseudo-BBs, composed of instructions which

+// are guaranteed to transfer control to their successors.

+// - Within a single pseudo-BB, find all loads, and group them into

+// "equivalence classes" according to getUnderlyingObject() and loaded

+// element size. Do the same for stores.

+// - For each equivalence class, greedily build "chains". Each chain has a

+// leader instruction, and every other member of the chain has a known

+// constant offset from the first instr in the chain.

+// - Break up chains so that they contain only contiguous accesses of legal

+// size with no intervening may-alias instrs.

+// - Convert each chain to vector instructions.

+//

+// The O(n^2) behavior of this pass comes from initially building the chains.

+// In the worst case we have to compare each new instruction to all of those

+// that came before. To limit this, we only calculate the offset to the leaders

+// of the N most recently-used chains.

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"

#include "llvm/ADT/APInt.h"

#include "llvm/ADT/ArrayRef.h"

+#include "llvm/ADT/DenseMap.h"

#include "llvm/ADT/MapVector.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/STLExtras.h"

+#include "llvm/ADT/Sequence.h"

#include "llvm/ADT/SmallPtrSet.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/Statistic.h"

@@ -57,6 +78,7 @@

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/Attributes.h"

#include "llvm/IR/BasicBlock.h"

+#include "llvm/IR/ConstantRange.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DerivedTypes.h"

@@ -67,23 +89,33 @@

#include "llvm/IR/InstrTypes.h"

#include "llvm/IR/Instruction.h"

#include "llvm/IR/Instructions.h"

+#include "llvm/IR/LLVMContext.h"

#include "llvm/IR/Module.h"

#include "llvm/IR/Type.h"

#include "llvm/IR/Value.h"

#include "llvm/InitializePasses.h"

#include "llvm/Pass.h"

+#include "llvm/Support/Alignment.h"

#include "llvm/Support/Casting.h"

#include "llvm/Support/Debug.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Support/MathExtras.h"

+#include "llvm/Support/ModRef.h"

#include "llvm/Support/raw_ostream.h"

#include "llvm/Transforms/Utils/Local.h"

#include "llvm/Transforms/Vectorize.h"

#include <algorithm>

#include <cassert>

+#include <cstdint>

#include <cstdlib>

+#include <iterator>

+#include <limits>

+#include <numeric>

+#include <optional>

#include <tuple>

+#include <type_traits>

#include <utility>

+#include <vector>

using namespace llvm;

@@ -92,21 +124,115 @@ using namespace llvm;

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");

STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

+namespace {

+// Equivalence class key, the initial tuple by which we group loads/stores.

+// Loads/stores with different EqClassKeys are never merged.

+//

+// (We could in theory remove element-size from the this tuple. We'd just need

+// to fix up the vector packing/unpacking code.)

+using EqClassKey =

+ std::tuple<const Value * /* result of getUnderlyingObject() */,

+ unsigned /* AddrSpace */,

+ unsigned /* Load/Store element size bits */,

+ char /* IsLoad; char b/c bool can't be a DenseMap key */

+ >;

+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,

+ const EqClassKey &K) {

+ const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;

+ return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject

+ << " of element size " << ElementSize << " bits in addrspace "

+ << AddrSpace;

+// A Chain is a set of instructions such that:

+// - All instructions have the same equivalence class, so in particular all are

+// loads, or all are stores.

+// - We know the address accessed by the i'th chain elem relative to the

+// chain's leader instruction, which is the first instr of the chain in BB

+// order.

+//

+// Chains have two canonical orderings:

+// - BB order, sorted by Instr->comesBefore.

+// - Offset order, sorted by OffsetFromLeader.

+// This pass switches back and forth between these orders.

+struct ChainElem {

+ Instruction *Inst;

+ APInt OffsetFromLeader;

+};

+using Chain = SmallVector<ChainElem, 1>;

+void sortChainInBBOrder(Chain &C) {

+ sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });

+void sortChainInOffsetOrder(Chain &C) {

+ sort(C, [](const auto &A, const auto &B) {

+ if (A.OffsetFromLeader != B.OffsetFromLeader)

+ return A.OffsetFromLeader.slt(B.OffsetFromLeader);

+ return A.Inst->comesBefore(B.Inst); // stable tiebreaker

+ });

+[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {

+ for (const auto &E : C) {

+ dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";

+ }

+using EquivalenceClassMap =

+ MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough

-static const unsigned StackAdjustedAlignment = 4;

+constexpr unsigned StackAdjustedAlignment = 4;

-namespace {

+Instruction *propagateMetadata(Instruction *I, const Chain &C) {

+ SmallVector<Value *, 8> Values;

+ for (const ChainElem &E : C)

+ Values.push_back(E.Inst);

+ return propagateMetadata(I, Values);

-/// ChainID is an arbitrary token that is allowed to be different only for the

-/// accesses that are guaranteed to be considered non-consecutive by

-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions

-/// together and reducing the number of instructions the main search operates on

-/// at a time, i.e. this is to reduce compile time and nothing else as the main

-/// search has O(n^2) time complexity. The underlying type of ChainID should not

-/// be relied upon.

-using ChainID = const Value *;

-using InstrList = SmallVector<Instruction *, 8>;

-using InstrListMap = MapVector<ChainID, InstrList>;

+bool isInvariantLoad(const Instruction *I) {

+ const LoadInst *LI = dyn_cast<LoadInst>(I);

+ return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);

+/// Reorders the instructions that I depends on (the instructions defining its

+/// operands), to ensure they dominate I.

+void reorder(Instruction *I) {

+ SmallPtrSet<Instruction *, 16> InstructionsToMove;

+ SmallVector<Instruction *, 16> Worklist;

+ Worklist.push_back(I);

+ while (!Worklist.empty()) {

+ Instruction *IW = Worklist.pop_back_val();

+ int NumOperands = IW->getNumOperands();

+ for (int i = 0; i < NumOperands; i++) {

+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));

+ if (!IM || IM->getOpcode() == Instruction::PHI)

+ continue;

+ // If IM is in another BB, no need to move it, because this pass only

+ // vectorizes instructions within one BB.

+ if (IM->getParent() != I->getParent())

+ continue;

+ if (!IM->comesBefore(I)) {

+ InstructionsToMove.insert(IM);

+ Worklist.push_back(IM);

+ }

+ // All instructions to move should follow I. Start from I, not from begin().

+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {

+ Instruction *IM = &*(BBI++);

+ if (!InstructionsToMove.count(IM))

+ continue;

+ IM->moveBefore(I);

+ }

class Vectorizer {

Function &F;

@@ -118,6 +244,12 @@ class Vectorizer {

const DataLayout &DL;

IRBuilder<> Builder;

+ // We could erase instrs right after vectorizing them, but that can mess up

+ // our BB iterators, and also can make the equivalence class keys point to

+ // freed memory. This is fixable, but it's simpler just to wait until we're

+ // done with the BB and erase all at once.

+ SmallVector<Instruction *, 128> ToErase;

public:

Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,

DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)

@@ -127,70 +259,83 @@ public:

bool run();

private:

- unsigned getPointerAddressSpace(Value *I);

static const unsigned MaxDepth = 3;

- bool isConsecutiveAccess(Value *A, Value *B);

- bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,

- unsigned Depth = 0) const;

- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,

- unsigned Depth) const;

- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,

- unsigned Depth) const;

- /// After vectorization, reorder the instructions that I depends on

- /// (the instructions defining its operands), to ensure they dominate I.

- void reorder(Instruction *I);

- /// Returns the first and the last instructions in Chain.

- std::pair<BasicBlock::iterator, BasicBlock::iterator>

- getBoundaryInstrs(ArrayRef<Instruction *> Chain);

- /// Erases the original instructions after vectorizing.

- void eraseInstructions(ArrayRef<Instruction *> Chain);

- /// "Legalize" the vector type that would be produced by combining \p

- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the

- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is

- /// expected to have more than 4 elements.

- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>

- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

- /// Finds the largest prefix of Chain that's vectorizable, checking for

- /// intervening instructions which may affect the memory accessed by the

- /// instructions within Chain.

+ /// Runs the vectorizer on a "pseudo basic block", which is a range of

+ /// instructions [Begin, End) within one BB all of which have

+ /// isGuaranteedToTransferExecutionToSuccessor(I) == true.

+ bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

+ /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores

+ /// in the same BB with the same value for getUnderlyingObject() etc.

+ bool runOnEquivalenceClass(const EqClassKey &EqClassKey,

+ ArrayRef<Instruction *> EqClass);

+ /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class

+ /// where all instructions access a known, constant offset from the first

+ /// instruction.

+ bool runOnChain(Chain &C);

+ /// Splits the chain into subchains of instructions which read/write a

+ /// contiguous block of memory. Discards any length-1 subchains (because

+ /// there's nothing to vectorize in there).

+ std::vector<Chain> splitChainByContiguity(Chain &C);

+ /// Splits the chain into subchains where it's safe to hoist loads up to the

+ /// beginning of the sub-chain and it's safe to sink loads up to the end of

+ /// the sub-chain. Discards any length-1 subchains.

+ std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

+ /// Splits the chain into subchains that make legal, aligned accesses.

+ /// Discards any length-1 subchains.

+ std::vector<Chain> splitChainByAlignment(Chain &C);

+ /// Converts the instrs in the chain into a single vectorized load or store.

+ /// Adds the old scalar loads/stores to ToErase.

+ bool vectorizeChain(Chain &C);

+ /// Tries to compute the offset in bytes PtrB - PtrA.

+ std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,

+ Instruction *ContextInst,

+ unsigned Depth = 0);

+ std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,

+ Instruction *ContextInst,

+ unsigned Depth);

+ std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,

+ Instruction *ContextInst,

+ unsigned Depth);

+ /// Gets the element type of the vector that the chain will load or store.

+ /// This is nontrivial because the chain may contain elements of different

+ /// types; e.g. it's legal to have a chain that contains both i32 and float.

+ Type *getChainElemTy(const Chain &C);

+ /// Determines whether ChainElem can be moved up (if IsLoad) or down (if

+ /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias

+ /// instructions.

+ ///

+ /// The map ChainElemOffsets must contain all of the elements in

+ /// [ChainBegin, ChainElem] and their offsets from some arbitrary base

+ /// address. It's ok if it contains additional entries.

+ template <bool IsLoadChain>

+ bool isSafeToMove(

+ Instruction *ChainElem, Instruction *ChainBegin,

+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

+ /// Collects loads and stores grouped by "equivalence class", where:

+ /// - all elements in an eq class are a load or all are a store,

+ /// - they all load/store the same element size (it's OK to have e.g. i8 and

+ /// <4 x i8> in the same class, but not i32 and <4 x i8>), and

+ /// - they all have the same value for getUnderlyingObject().

+ EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,

+ BasicBlock::iterator End);

+ /// Partitions Instrs into "chains" where every instruction has a known

+ /// constant offset from the first instr in the chain.

///

- /// The elements of \p Chain must be all loads or all stores and must be in

- /// address order.

- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

- /// Collects load and store instructions to vectorize.

- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

- /// Processes the collected instructions, the \p Map. The values of \p Map

- /// should be all loads or all stores.

- bool vectorizeChains(InstrListMap &Map);

- /// Finds the load/stores to consecutive memory addresses and vectorizes them.

- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

- /// Vectorizes the load instructions in Chain.

- bool

- vectorizeLoadChain(ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

- /// Vectorizes the store instructions in Chain.

- bool

- vectorizeStoreChain(ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

- /// Check if this load/store access is misaligned accesses.

- /// Returns a \p RelativeSpeed of an operation if allowed suitable to

- /// compare to another result for the same \p AddressSpace and potentially

- /// different \p Alignment and \p SzInBytes.

- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,

- Align Alignment, unsigned &RelativeSpeed);

+ /// Postcondition: For all i, ret[i][0].second == 0, because the first instr

+ /// in the chain is the leader, and an instr touches distance 0 from itself.

+ std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);

};

class LoadStoreVectorizerLegacyPass : public FunctionPass {

@@ -198,7 +343,8 @@ public:

static char ID;

LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {

- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());

+ initializeLoadStoreVectorizerLegacyPassPass(

+ *PassRegistry::getPassRegistry());

}

bool runOnFunction(Function &F) override;

@@ -250,11 +396,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {

AssumptionCache &AC =

getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

- Vectorizer V(F, AA, AC, DT, SE, TTI);

- return V.run();

+ return Vectorizer(F, AA, AC, DT, SE, TTI).run();

}

-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {

+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,

+ FunctionAnalysisManager &AM) {

// Don't vectorize when the attribute NoImplicitFloat is used.

if (F.hasFnAttribute(Attribute::NoImplicitFloat))

return PreservedAnalyses::all();

@@ -265,125 +411,681 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana

TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

- Vectorizer V(F, AA, AC, DT, SE, TTI);

- bool Changed = V.run();

+ bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();

PreservedAnalyses PA;

PA.preserveSet<CFGAnalyses>();

return Changed ? PA : PreservedAnalyses::all();

}

-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in

-// vectors of Instructions.

-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {

- SmallVector<Value *, 8> VL(IL.begin(), IL.end());

- propagateMetadata(I, VL);

-// Vectorizer Implementation

bool Vectorizer::run() {

bool Changed = false;

- // Scan the blocks in the function in post order.

+ // Break up the BB if there are any instrs which aren't guaranteed to transfer

+ // execution to their successor.

+ //

+ // Consider, for example:

+ //

+ // def assert_arr_len(int n) { if (n < 2) exit(); }

+ //

+ // load arr[0]

+ // call assert_array_len(arr.length)

+ // load arr[1]

+ //

+ // Even though assert_arr_len does not read or write any memory, we can't

+ // speculate the second load before the call. More info at

+ // https://github.com/llvm/llvm-project/issues/52950.

for (BasicBlock *BB : post_order(&F)) {

- InstrListMap LoadRefs, StoreRefs;

- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);

- Changed |= vectorizeChains(LoadRefs);

- Changed |= vectorizeChains(StoreRefs);

+ // BB must at least have a terminator.

+ assert(!BB->empty());

+ SmallVector<BasicBlock::iterator, 8> Barriers;

+ Barriers.push_back(BB->begin());

+ for (Instruction &I : *BB)

+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))

+ Barriers.push_back(I.getIterator());

+ Barriers.push_back(BB->end());

+ for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;

+ ++It)

+ Changed |= runOnPseudoBB(*It, *std::next(It));

+ for (Instruction *I : ToErase) {

+ auto *PtrOperand = getLoadStorePointerOperand(I);

+ if (I->use_empty())

+ I->eraseFromParent();

+ RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);

+ }

+ ToErase.clear();

}

return Changed;

}

-unsigned Vectorizer::getPointerAddressSpace(Value *I) {

- if (LoadInst *L = dyn_cast<LoadInst>(I))

- return L->getPointerAddressSpace();

- if (StoreInst *S = dyn_cast<StoreInst>(I))

- return S->getPointerAddressSpace();

- return -1;

+bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,

+ BasicBlock::iterator End) {

+ LLVM_DEBUG({

+ dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";

+ if (End != Begin->getParent()->end())

+ dbgs() << *End;

+ else

+ dbgs() << "<BB end>";

+ dbgs() << ")\n";

+ });

+ bool Changed = false;

+ for (const auto &[EqClassKey, EqClass] :

+ collectEquivalenceClasses(Begin, End))

+ Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

+ return Changed;

}

-// FIXME: Merge with llvm::isConsecutiveAccess

-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {

- Value *PtrA = getLoadStorePointerOperand(A);

- Value *PtrB = getLoadStorePointerOperand(B);

- unsigned ASA = getPointerAddressSpace(A);

- unsigned ASB = getPointerAddressSpace(B);

+bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,

+ ArrayRef<Instruction *> EqClass) {

+ bool Changed = false;

- // Check that the address spaces match and that the pointers are valid.

- if (!PtrA || !PtrB || (ASA != ASB))

- return false;

+ LLVM_DEBUG({

+ dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()

+ << " keyed on " << EqClassKey << ":\n";

+ for (Instruction *I : EqClass)

+ dbgs() << " " << *I << "\n";

+ });

- // Make sure that A and B are different pointers of the same size type.

- Type *PtrATy = getLoadStoreType(A);

- Type *PtrBTy = getLoadStoreType(B);

- if (PtrA == PtrB ||

- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||

- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||

- DL.getTypeStoreSize(PtrATy->getScalarType()) !=

- DL.getTypeStoreSize(PtrBTy->getScalarType()))

- return false;

+ std::vector<Chain> Chains = gatherChains(EqClass);

+ LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()

+ << " nontrivial chains.\n";);

+ for (Chain &C : Chains)

+ Changed |= runOnChain(C);

+ return Changed;

- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);

- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

+bool Vectorizer::runOnChain(Chain &C) {

+ LLVM_DEBUG({

+ dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";

+ dumpChain(C);

+ });

- return areConsecutivePointers(PtrA, PtrB, Size);

+ // Split up the chain into increasingly smaller chains, until we can finally

+ // vectorize the chains.

+ //

+ // (Don't be scared by the depth of the loop nest here. These operations are

+ // all at worst O(n lg n) in the number of instructions, and splitting chains

+ // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)

+ bool Changed = false;

+ for (auto &C : splitChainByMayAliasInstrs(C))

+ for (auto &C : splitChainByContiguity(C))

+ for (auto &C : splitChainByAlignment(C))

+ Changed |= vectorizeChain(C);

+ return Changed;

}

-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,

- APInt PtrDelta, unsigned Depth) const {

- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());

- APInt OffsetA(PtrBitWidth, 0);

- APInt OffsetB(PtrBitWidth, 0);

- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);

- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

+std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {

+ if (C.empty())

+ return {};

- unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

+ sortChainInBBOrder(C);

- if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))

+ LLVM_DEBUG({

+ dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";

+ dumpChain(C);

+ });

+ // We know that elements in the chain with nonverlapping offsets can't

+ // alias, but AA may not be smart enough to figure this out. Use a

+ // hashtable.

+ DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;

+ for (const auto &E : C)

+ ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

+ // Loads get hoisted up to the first load in the chain. Stores get sunk

+ // down to the last store in the chain. Our algorithm for loads is:

+ //

+ // - Take the first element of the chain. This is the start of a new chain.

+ // - Take the next element of `Chain` and check for may-alias instructions

+ // up to the start of NewChain. If no may-alias instrs, add it to

+ // NewChain. Otherwise, start a new NewChain.

+ //

+ // For stores it's the same except in the reverse direction.

+ //

+ // We expect IsLoad to be an std::bool_constant.

+ auto Impl = [&](auto IsLoad) {

+ // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.

+ auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {

+ if constexpr (IsLoad())

+ return std::make_pair(C.begin(), C.end());

+ else

+ return std::make_pair(C.rbegin(), C.rend());

+ }(IsLoad);

+ assert(ChainBegin != ChainEnd);

+ std::vector<Chain> Chains;

+ SmallVector<ChainElem, 1> NewChain;

+ NewChain.push_back(*ChainBegin);

+ for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {

+ if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,

+ ChainOffsets)) {

+ LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "

+ << *ChainIt->Inst << " into " << *ChainBegin->Inst

+ << "\n");

+ NewChain.push_back(*ChainIt);

+ } else {

+ LLVM_DEBUG(

+ dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "

+ << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");

+ if (NewChain.size() > 1) {

+ LLVM_DEBUG({

+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";

+ dumpChain(NewChain);

+ });

+ Chains.push_back(std::move(NewChain));

+ }

+ // Start a new chain.

+ NewChain = SmallVector<ChainElem, 1>({*ChainIt});

+ }

+ if (NewChain.size() > 1) {

+ LLVM_DEBUG({

+ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";

+ dumpChain(NewChain);

+ });

+ Chains.push_back(std::move(NewChain));

+ }

+ return Chains;

+ };

+ if (isa<LoadInst>(C[0].Inst))

+ return Impl(/*IsLoad=*/std::bool_constant<true>());

+ assert(isa<StoreInst>(C[0].Inst));

+ return Impl(/*IsLoad=*/std::bool_constant<false>());

+std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {

+ if (C.empty())

+ return {};

+ sortChainInOffsetOrder(C);

+ LLVM_DEBUG({

+ dbgs() << "LSV: splitChainByContiguity considering chain:\n";

+ dumpChain(C);

+ });

+ std::vector<Chain> Ret;

+ Ret.push_back({C.front()});

+ for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {

+ // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).

+ auto &CurChain = Ret.back();

+ const ChainElem &Prev = CurChain.back();

+ unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));

+ assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "

+ "collectEquivalenceClass");

+ APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

+ // Add this instruction to the end of the current chain, or start a new one.

+ bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;

+ LLVM_DEBUG(dbgs() << "LSV: Instructions are "

+ << (AreContiguous ? "" : "not ") << "contiguous: "

+ << *Prev.Inst << " (ends at offset " << PrevReadEnd

+ << ") -> " << *It->Inst << " (starts at offset "

+ << It->OffsetFromLeader << ")\n");

+ if (AreContiguous)

+ CurChain.push_back(*It);

+ else

+ Ret.push_back({*It});

+ }

+ // Filter out length-1 chains, these are uninteresting.

+ llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });

+ return Ret;

+Type *Vectorizer::getChainElemTy(const Chain &C) {

+ assert(!C.empty());

+ // The rules are:

+ // - If there are any pointer types in the chain, use an integer type.

+ // - Prefer an integer type if it appears in the chain.

+ // - Otherwise, use the first type in the chain.

+ //

+ // The rule about pointer types is a simplification when we merge e.g. a load

+ // of a ptr and a double. There's no direct conversion from a ptr to a

+ // double; it requires a ptrtoint followed by a bitcast.

+ //

+ // It's unclear to me if the other rules have any practical effect, but we do

+ // it to match this pass's previous behavior.

+ if (any_of(C, [](const ChainElem &E) {

+ return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();

+ })) {

+ return Type::getIntNTy(

+ F.getContext(),

+ DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));

+ }

+ for (const ChainElem &E : C)

+ if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())

+ return T;

+ return getLoadStoreType(C[0].Inst)->getScalarType();

+std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {

+ // We use a simple greedy algorithm.

+ // - Given a chain of length N, find all prefixes that

+ // (a) are not longer than the max register length, and

+ // (b) are a power of 2.

+ // - Starting from the longest prefix, try to create a vector of that length.

+ // - If one of them works, great. Repeat the algorithm on any remaining

+ // elements in the chain.

+ // - If none of them work, discard the first element and repeat on a chain

+ // of length N-1.

+ if (C.empty())

+ return {};

+ sortChainInOffsetOrder(C);

+ LLVM_DEBUG({

+ dbgs() << "LSV: splitChainByAlignment considering chain:\n";

+ dumpChain(C);

+ });

+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);

+ auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,

+ unsigned ChainSizeBytes, VectorType *VecTy) {

+ return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,

+ ChainSizeBytes, VecTy)

+ : TTI.getStoreVectorFactor(VF, LoadStoreSize,

+ ChainSizeBytes, VecTy);

+ };

+#ifndef NDEBUG

+ for (const auto &E : C) {

+ Type *Ty = getLoadStoreType(E.Inst)->getScalarType();

+ assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&

+ "Should have filtered out non-power-of-two elements in "

+ "collectEquivalenceClasses.");

+ }

+#endif

+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);

+ unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

+ std::vector<Chain> Ret;

+ for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {

+ // Find candidate chains of size not greater than the largest vector reg.

+ // These chains are over the closed interval [CBegin, CEnd].

+ SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>

+ CandidateChains;

+ for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {

+ APInt Sz = C[CEnd].OffsetFromLeader +

+ DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -

+ C[CBegin].OffsetFromLeader;

+ if (Sz.sgt(VecRegBytes))

+ break;

+ CandidateChains.push_back(

+ {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});

+ }

+ // Consider the longest chain first.

+ for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();

+ It != End; ++It) {

+ auto [CEnd, SizeBytes] = *It;

+ LLVM_DEBUG(

+ dbgs() << "LSV: splitChainByAlignment considering candidate chain ["

+ << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

+ Type *VecElemTy = getChainElemTy(C);

+ // Note, VecElemTy is a power of 2, but might be less than one byte. For

+ // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case

+ // VecElemTy would be i4.

+ unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

+ // SizeBytes and VecElemBits are powers of 2, so they divide evenly.

+ assert((8 * SizeBytes) % VecElemBits == 0);

+ unsigned NumVecElems = 8 * SizeBytes / VecElemBits;

+ FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);

+ unsigned VF = 8 * VecRegBytes / VecElemBits;

+ // Check that TTI is happy with this vectorization factor.

+ unsigned TargetVF = getVectorFactor(VF, VecElemBits,

+ VecElemBits * NumVecElems / 8, VecTy);

+ if (TargetVF != VF && TargetVF < NumVecElems) {

+ LLVM_DEBUG(

+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "

+ "because TargetVF="

+ << TargetVF << " != VF=" << VF

+ << " and TargetVF < NumVecElems=" << NumVecElems << "\n");

+ continue;

+ }

+ // Is a load/store with this alignment allowed by TTI and at least as fast

+ // as an unvectorized load/store?

+ //

+ // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).

+ auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,

+ &F = F](Align Alignment) {

+ if (Alignment.value() % SizeBytes == 0)

+ return true;

+ unsigned VectorizedSpeed = 0;

+ bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(

+ F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);

+ if (!AllowsMisaligned) {

+ LLVM_DEBUG(dbgs()

+ << "LSV: Access of " << SizeBytes << "B in addrspace "

+ << AS << " with alignment " << Alignment.value()

+ << " is misaligned, and therefore can't be vectorized.\n");

+ return false;

+ }

+ unsigned ElementwiseSpeed = 0;

+ (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,

+ Alignment, &ElementwiseSpeed);

+ if (VectorizedSpeed < ElementwiseSpeed) {

+ LLVM_DEBUG(dbgs()

+ << "LSV: Access of " << SizeBytes << "B in addrspace "

+ << AS << " with alignment " << Alignment.value()

+ << " has relative speed " << VectorizedSpeed

+ << ", which is lower than the elementwise speed of "

+ << ElementwiseSpeed

+ << ". Therefore this access won't be vectorized.\n");

+ return false;

+ }

+ return true;

+ };

+ // If we're loading/storing from an alloca, align it if possible.

+ //

+ // FIXME: We eagerly upgrade the alignment, regardless of whether TTI

+ // tells us this is beneficial. This feels a bit odd, but it matches

+ // existing tests. This isn't *so* bad, because at most we align to 4

+ // bytes (current value of StackAdjustedAlignment).

+ //

+ // FIXME: We will upgrade the alignment of the alloca even if it turns out

+ // we can't vectorize for some other reason.

+ Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);

+ bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&

+ isa<AllocaInst>(PtrOperand->stripPointerCasts());

+ Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);

+ Align PrefAlign = Align(StackAdjustedAlignment);

+ if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&

+ IsAllowedAndFast(PrefAlign)) {

+ Align NewAlign = getOrEnforceKnownAlignment(

+ PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);

+ if (NewAlign >= Alignment) {

+ LLVM_DEBUG(dbgs()

+ << "LSV: splitByChain upgrading alloca alignment from "

+ << Alignment.value() << " to " << NewAlign.value()

+ << "\n");

+ Alignment = NewAlign;

+ }

+ if (!IsAllowedAndFast(Alignment)) {

+ LLVM_DEBUG(

+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "

+ "because its alignment is not AllowedAndFast: "

+ << Alignment.value() << "\n");

+ continue;

+ }

+ if ((IsLoadChain &&

+ !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||

+ (!IsLoadChain &&

+ !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {

+ LLVM_DEBUG(

+ dbgs() << "LSV: splitChainByAlignment discarding candidate chain "

+ "because !isLegalToVectorizeLoad/StoreChain.");

+ continue;

+ }

+ // Hooray, we can vectorize this chain!

+ Chain &NewChain = Ret.emplace_back();

+ for (unsigned I = CBegin; I <= CEnd; ++I)

+ NewChain.push_back(C[I]);

+ CBegin = CEnd; // Skip over the instructions we've added to the chain.

+ break;

+ }

+ return Ret;

+bool Vectorizer::vectorizeChain(Chain &C) {

+ if (C.size() < 2)

return false;

- // In case if we have to shrink the pointer

- // stripAndAccumulateInBoundsConstantOffsets should properly handle a

- // possible overflow and the value should fit into a smallest data type

- // used in the cast/gep chain.

- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&

- OffsetB.getMinSignedBits() <= NewPtrBitWidth);

+ sortChainInOffsetOrder(C);

- OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);

- OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);

- PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

+ LLVM_DEBUG({

+ dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";

+ dumpChain(C);

+ });

- APInt OffsetDelta = OffsetB - OffsetA;

+ Type *VecElemTy = getChainElemTy(C);

+ bool IsLoadChain = isa<LoadInst>(C[0].Inst);

+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);

+ unsigned ChainBytes = std::accumulate(

+ C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {

+ return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));

+ });

+ assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);

+ // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller

+ // than 1 byte (e.g. VecTy == <32 x i1>).

+ Type *VecTy = FixedVectorType::get(

+ VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

+ Align Alignment = getLoadStoreAlignment(C[0].Inst);

+ // If this is a load/store of an alloca, we might have upgraded the alloca's

+ // alignment earlier. Get the new alignment.

+ if (AS == DL.getAllocaAddrSpace()) {

+ Alignment = std::max(

+ Alignment,

+ getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),

+ MaybeAlign(), DL, C[0].Inst, nullptr, &DT));

+ }

- // Check if they are based on the same pointer. That makes the offsets

- // sufficient.

- if (PtrA == PtrB)

- return OffsetDelta == PtrDelta;

- // Compute the necessary base pointer delta to have the necessary final delta

- // equal to the pointer delta requested.

- APInt BaseDelta = PtrDelta - OffsetDelta;

- // Compute the distance with SCEV between the base pointers.

- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);

- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);

- const SCEV *C = SE.getConstant(BaseDelta);

- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);

- if (X == PtrSCEVB)

+ // All elements of the chain must have the same scalar-type size.

+#ifndef NDEBUG

+ for (const ChainElem &E : C)

+ assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==

+ DL.getTypeStoreSize(VecElemTy));

+#endif

+ Instruction *VecInst;

+ if (IsLoadChain) {

+ // Loads get hoisted to the location of the first load in the chain. We may

+ // also need to hoist the (transitive) operands of the loads.

+ Builder.SetInsertPoint(

+ std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {

+ return A.Inst->comesBefore(B.Inst);

+ })->Inst);

+ // Chain is in offset order, so C[0] is the instr with the lowest offset,

+ // i.e. the root of the vector.

+ Value *Bitcast = Builder.CreateBitCast(

+ getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));

+ VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);

+ unsigned VecIdx = 0;

+ for (const ChainElem &E : C) {

+ Instruction *I = E.Inst;

+ Value *V;

+ Type *T = getLoadStoreType(I);

+ if (auto *VT = dyn_cast<FixedVectorType>(T)) {

+ auto Mask = llvm::to_vector<8>(

+ llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));

+ V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());

+ VecIdx += VT->getNumElements();

+ } else {

+ V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),

+ I->getName());

+ ++VecIdx;

+ }

+ if (V->getType() != I->getType())

+ V = Builder.CreateBitOrPointerCast(V, I->getType());

+ I->replaceAllUsesWith(V);

+ }

+ // Finally, we need to reorder the instrs in the BB so that the (transitive)

+ // operands of VecInst appear before it. To see why, suppose we have

+ // vectorized the following code:

+ //

+ // ptr1 = gep a, 1

+ // load1 = load i32 ptr1

+ // ptr0 = gep a, 0

+ // load0 = load i32 ptr0

+ //

+ // We will put the vectorized load at the location of the earliest load in

+ // the BB, i.e. load1. We get:

+ //

+ // ptr1 = gep a, 1

+ // loadv = load <2 x i32> ptr0

+ // load0 = extractelement loadv, 0

+ // load1 = extractelement loadv, 1

+ // ptr0 = gep a, 0

+ //

+ // Notice that loadv uses ptr0, which is defined *after* it!

+ reorder(VecInst);

+ } else {

+ // Stores get sunk to the location of the last store in the chain.

+ Builder.SetInsertPoint(

+ std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {

+ return A.Inst->comesBefore(B.Inst);

+ })->Inst);

+ // Build the vector to store.

+ Value *Vec = PoisonValue::get(VecTy);

+ unsigned VecIdx = 0;

+ auto InsertElem = [&](Value *V) {

+ if (V->getType() != VecElemTy)

+ V = Builder.CreateBitOrPointerCast(V, VecElemTy);

+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));

+ };

+ for (const ChainElem &E : C) {

+ auto I = cast<StoreInst>(E.Inst);

+ if (FixedVectorType *VT =

+ dyn_cast<FixedVectorType>(getLoadStoreType(I))) {

+ for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {

+ InsertElem(Builder.CreateExtractElement(I->getValueOperand(),

+ Builder.getInt32(J)));

+ }

+ } else {

+ InsertElem(I->getValueOperand());

+ }

+ // Chain is in offset order, so C[0] is the instr with the lowest offset,

+ // i.e. the root of the vector.

+ VecInst = Builder.CreateAlignedStore(

+ Vec,

+ Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),

+ VecTy->getPointerTo(AS)),

+ Alignment);

+ }

+ propagateMetadata(VecInst, C);

+ for (const ChainElem &E : C)

+ ToErase.push_back(E.Inst);

+ ++NumVectorInstructions;

+ NumScalarsVectorized += C.size();

+ return true;

+template <bool IsLoadChain>

+bool Vectorizer::isSafeToMove(

+ Instruction *ChainElem, Instruction *ChainBegin,

+ const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {

+ LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "

+ << *ChainBegin << ")\n");

+ assert(isa<LoadInst>(ChainElem) == IsLoadChain);

+ if (ChainElem == ChainBegin)

return true;

- // The above check will not catch the cases where one of the pointers is

- // factorized but the other one is not, such as (C + (S * (A + B))) vs

- // (AS + BS). Get the minus scev. That will allow re-combining the expresions

- // and getting the simplified difference.

- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);

- if (C == Dist)

+ // Invariant loads can always be reordered; by definition they are not

+ // clobbered by stores.

+ if (isInvariantLoad(ChainElem))

return true;

- // Sometimes even this doesn't work, because SCEV can't always see through

- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking

- // things the hard way.

- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);

+ auto BBIt = std::next([&] {

+ if constexpr (IsLoadChain)

+ return BasicBlock::reverse_iterator(ChainElem);

+ else

+ return BasicBlock::iterator(ChainElem);

+ }());

+ auto BBItEnd = std::next([&] {

+ if constexpr (IsLoadChain)

+ return BasicBlock::reverse_iterator(ChainBegin);

+ else

+ return BasicBlock::iterator(ChainBegin);

+ }());

+ const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);

+ const unsigned ChainElemSize =

+ DL.getTypeStoreSize(getLoadStoreType(ChainElem));

+ for (; BBIt != BBItEnd; ++BBIt) {

+ Instruction *I = &*BBIt;

+ if (!I->mayReadOrWriteMemory())

+ continue;

+ // Loads can be reordered with other loads.

+ if (IsLoadChain && isa<LoadInst>(I))

+ continue;

+ // Stores can be sunk below invariant loads.

+ if (!IsLoadChain && isInvariantLoad(I))

+ continue;

+ // If I is in the chain, we can tell whether it aliases ChainIt by checking

+ // what offset ChainIt accesses. This may be better than AA is able to do.

+ //

+ // We should really only have duplicate offsets for stores (the duplicate

+ // loads should be CSE'ed), but in case we have a duplicate load, we'll

+ // split the chain so we don't have to handle this case specially.

+ if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {

+ // I and ChainElem overlap if:

+ // - I and ChainElem have the same offset, OR

+ // - I's offset is less than ChainElem's, but I touches past the

+ // beginning of ChainElem, OR

+ // - ChainElem's offset is less than I's, but ChainElem touches past the

+ // beginning of I.

+ const APInt &IOffset = OffsetIt->second;

+ unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));

+ if (IOffset == ChainElemOffset ||

+ (IOffset.sle(ChainElemOffset) &&

+ (IOffset + IElemSize).sgt(ChainElemOffset)) ||

+ (ChainElemOffset.sle(IOffset) &&

+ (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {

+ LLVM_DEBUG({

+ // Double check that AA also sees this alias. If not, we probably

+ // have a bug.

+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));

+ assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));

+ dbgs() << "LSV: Found alias in chain: " << *I << "\n";

+ });

+ return false; // We found an aliasing instruction; bail.

+ }

+ continue; // We're confident there's no alias.

+ }

+ LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");

+ ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));

+ if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {

+ LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"

+ << " Aliasing instruction:\n"

+ << " " << *I << '\n'

+ << " Aliased instruction and pointer:\n"

+ << " " << *ChainElem << '\n'

+ << " " << *getLoadStorePointerOperand(ChainElem)

+ << '\n');

+ return false;

+ }

+ return true;

}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {

@@ -395,10 +1097,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) {

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,

unsigned MatchingOpIdxA, Instruction *AddOpB,

unsigned MatchingOpIdxB, bool Signed) {

- // If both OpA and OpB is an add with NSW/NUW and with

- // one of the operands being the same, we can guarantee that the

- // transformation is safe if we can prove that OpA won't overflow when

- // IdxDiff added to the other operand of OpA.

+ LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff

+ << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="

+ << MatchingOpIdxA << ", AddOpB=" << *AddOpB

+ << ", MatchingOpIdxB=" << MatchingOpIdxB

+ << ", Signed=" << Signed << "\n");

+ // If both OpA and OpB are adds with NSW/NUW and with one of the operands

+ // being the same, we can guarantee that the transformation is safe if we can

+ // prove that OpA won't overflow when Ret added to the other operand of OpA.

// For example:

// %tmp7 = add nsw i32 %tmp2, %v0

// %tmp8 = sext i32 %tmp7 to i64

@@ -407,10 +1113,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,

// %tmp12 = add nsw i32 %tmp2, %tmp11

// %tmp13 = sext i32 %tmp12 to i64

- // Both %tmp7 and %tmp2 has the nsw flag and the first operand

- // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow

- // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the

- // nsw flag.

+ // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.

+ // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds

+ // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.

assert(AddOpA->getOpcode() == Instruction::Add &&

AddOpB->getOpcode() == Instruction::Add &&

checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));

@@ -461,24 +1166,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,

return false;

}

-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

- APInt PtrDelta,

- unsigned Depth) const {

+std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(

+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {

+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA

+ << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst

+ << " Depth=" << Depth << "\n");

auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);

auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);

if (!GEPA || !GEPB)

- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

+ return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

// Look through GEPs after checking they're the same except for the last

// index.

if (GEPA->getNumOperands() != GEPB->getNumOperands() ||

GEPA->getPointerOperand() != GEPB->getPointerOperand())

- return false;

+ return std::nullopt;

gep_type_iterator GTIA = gep_type_begin(GEPA);

gep_type_iterator GTIB = gep_type_begin(GEPB);

for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {

if (GTIA.getOperand() != GTIB.getOperand())

- return false;

+ return std::nullopt;

++GTIA;

++GTIB;

}

@@ -487,23 +1194,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());

if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||

OpA->getType() != OpB->getType())

- return false;

+ return std::nullopt;

- if (PtrDelta.isNegative()) {

- if (PtrDelta.isMinSignedValue())

- return false;

- PtrDelta.negate();

- std::swap(OpA, OpB);

- }

uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());

- if (PtrDelta.urem(Stride) != 0)

- return false;

- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();

- APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);

// Only look through a ZExt/SExt.

if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))

- return false;

+ return std::nullopt;

bool Signed = isa<SExtInst>(OpA);

@@ -511,7 +1208,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

Value *ValA = OpA->getOperand(0);

OpB = dyn_cast<Instruction>(OpB->getOperand(0));

if (!OpB || ValA->getType() != OpB->getType())

- return false;

+ return std::nullopt;

+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);

+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);

+ const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);

+ if (IdxDiffSCEV == SE.getCouldNotCompute())

+ return std::nullopt;

+ ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);

+ if (!IdxDiffRange.isSingleElement())

+ return std::nullopt;

+ APInt IdxDiff = *IdxDiffRange.getSingleElement();

+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff

+ << "\n");

// Now we need to prove that adding IdxDiff to ValA won't overflow.

bool Safe = false;

@@ -530,10 +1241,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&

OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&

checkNoWrapFlags(OpB, Signed)) {

- // In the checks below a matching operand in OpA and OpB is

- // an operand which is the same in those two instructions.

- // Below we account for possible orders of the operands of

- // these add instructions.

+ // In the checks below a matching operand in OpA and OpB is an operand which

+ // is the same in those two instructions. Below we account for possible

+ // orders of the operands of these add instructions.

for (unsigned MatchingOpIdxA : {0, 1})

for (unsigned MatchingOpIdxB : {0, 1})

if (!Safe)

@@ -544,802 +1254,267 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

// Third attempt:

- // If all set bits of IdxDiff or any higher order bit other than the sign bit

- // are known to be zero in ValA, we can add Diff to it while guaranteeing no

- // overflow of any sort.

+ //

+ // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher

+ // order bit other than the sign bit are known to be zero in ValA, we can add

+ // Diff to it while guaranteeing no overflow of any sort.

+ //

+ // If IdxDiff is negative, do the same, but swap ValA and ValB.

if (!Safe) {

+ // When computing known bits, use the GEPs as context instructions, since

+ // they likely are in the same BB as the load/store.

KnownBits Known(BitWidth);

- computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);

+ computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,

+ ContextInst, &DT);

APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());

if (Signed)

BitsAllowedToBeSet.clearBit(BitWidth - 1);

- if (BitsAllowedToBeSet.ult(IdxDiff))

- return false;

+ if (BitsAllowedToBeSet.ult(IdxDiff.abs()))

+ return std::nullopt;

+ Safe = true;

}

- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);

- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);

- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));

- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);

- return X == OffsetSCEVB;

+ if (Safe)

+ return IdxDiff * Stride;

+ return std::nullopt;

}

-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,

- const APInt &PtrDelta,

- unsigned Depth) const {

+std::optional<APInt> Vectorizer::getConstantOffsetSelects(

+ Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {

if (Depth++ == MaxDepth)

- return false;

+ return std::nullopt;

if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {

if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {

- return SelectA->getCondition() == SelectB->getCondition() &&

- areConsecutivePointers(SelectA->getTrueValue(),

- SelectB->getTrueValue(), PtrDelta, Depth) &&

- areConsecutivePointers(SelectA->getFalseValue(),

- SelectB->getFalseValue(), PtrDelta, Depth);

+ if (SelectA->getCondition() != SelectB->getCondition())

+ return std::nullopt;

+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA

+ << ", PtrB=" << *PtrB << ", ContextInst="

+ << *ContextInst << ", Depth=" << Depth << "\n");

+ std::optional<APInt> TrueDiff = getConstantOffset(

+ SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);

+ if (!TrueDiff.has_value())

+ return std::nullopt;

+ std::optional<APInt> FalseDiff =

+ getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),

+ ContextInst, Depth);

+ if (TrueDiff == FalseDiff)

+ return TrueDiff;

}

- return false;

+ return std::nullopt;

}

-void Vectorizer::reorder(Instruction *I) {

- SmallPtrSet<Instruction *, 16> InstructionsToMove;

- SmallVector<Instruction *, 16> Worklist;

- Worklist.push_back(I);

- while (!Worklist.empty()) {

- Instruction *IW = Worklist.pop_back_val();

- int NumOperands = IW->getNumOperands();

- for (int i = 0; i < NumOperands; i++) {

- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));

- if (!IM || IM->getOpcode() == Instruction::PHI)

- continue;

- // If IM is in another BB, no need to move it, because this pass only

- // vectorizes instructions within one BB.

- if (IM->getParent() != I->getParent())

- continue;

- if (!IM->comesBefore(I)) {

- InstructionsToMove.insert(IM);

- Worklist.push_back(IM);

- }

+EquivalenceClassMap

+Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,

+ BasicBlock::iterator End) {

+ EquivalenceClassMap Ret;

+ auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {

+ const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);

+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {

+ // The select's themselves are distinct instructions even if they share

+ // the same condition and evaluate to consecutive pointers for true and

+ // false values of the condition. Therefore using the select's themselves

+ // for grouping instructions would put consecutive accesses into different

+ // lists and they won't be even checked for being consecutive, and won't

+ // be vectorized.

+ return Sel->getCondition();

}

- }

+ return ObjPtr;

+ };

- // All instructions to move should follow I. Start from I, not from begin().

- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;

- ++BBI) {

- if (!InstructionsToMove.count(&*BBI))

+ for (Instruction &I : make_range(Begin, End)) {

+ auto *LI = dyn_cast<LoadInst>(&I);

+ auto *SI = dyn_cast<StoreInst>(&I);

+ if (!LI && !SI)

continue;

- Instruction *IM = &*BBI;

- --BBI;

- IM->removeFromParent();

- IM->insertBefore(I);

- }

-std::pair<BasicBlock::iterator, BasicBlock::iterator>

-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {

- Instruction *C0 = Chain[0];

- BasicBlock::iterator FirstInstr = C0->getIterator();

- BasicBlock::iterator LastInstr = C0->getIterator();

- BasicBlock *BB = C0->getParent();

- unsigned NumFound = 0;

- for (Instruction &I : *BB) {

- if (!is_contained(Chain, &I))

+ if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))

continue;

- ++NumFound;

- if (NumFound == 1) {

- FirstInstr = I.getIterator();

- }

- if (NumFound == Chain.size()) {

- LastInstr = I.getIterator();

- break;

- }

- // Range is [first, last).

- return std::make_pair(FirstInstr, ++LastInstr);

-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {

- SmallVector<Instruction *, 16> Instrs;

- for (Instruction *I : Chain) {

- Value *PtrOperand = getLoadStorePointerOperand(I);

- assert(PtrOperand && "Instruction must have a pointer operand.");

- Instrs.push_back(I);

- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))

- Instrs.push_back(GEP);

- }

- // Erase instructions.

- for (Instruction *I : Instrs)

- if (I->use_empty())

- I->eraseFromParent();

-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>

-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,

- unsigned ElementSizeBits) {

- unsigned ElementSizeBytes = ElementSizeBits / 8;

- unsigned SizeBytes = ElementSizeBytes * Chain.size();

- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;

- if (NumLeft == Chain.size()) {

- if ((NumLeft & 1) == 0)

- NumLeft /= 2; // Split even in half

- else

- --NumLeft; // Split off last element

- } else if (NumLeft == 0)

- NumLeft = 1;

- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));

-ArrayRef<Instruction *>

-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {

- // These are in BB order, unlike Chain, which is in address order.

- SmallVector<Instruction *, 16> MemoryInstrs;

- SmallVector<Instruction *, 16> ChainInstrs;

- bool IsLoadChain = isa<LoadInst>(Chain[0]);

- LLVM_DEBUG({

- for (Instruction *I : Chain) {

- if (IsLoadChain)

- assert(isa<LoadInst>(I) &&

- "All elements of Chain must be loads, or all must be stores.");

- else

- assert(isa<StoreInst>(I) &&

- "All elements of Chain must be loads, or all must be stores.");

- }

- });

- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {

- if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {

- ChainInstrs.push_back(&I);

+ if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||

+ (SI && !TTI.isLegalToVectorizeStore(SI)))

continue;

- }

- if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {

- LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "

- << I << '\n');

- break;

- }

- if (I.mayReadOrWriteMemory())

- MemoryInstrs.push_back(&I);

- }

- // Loop until we find an instruction in ChainInstrs that we can't vectorize.

- unsigned ChainInstrIdx = 0;

- Instruction *BarrierMemoryInstr = nullptr;

- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {

- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

- // If a barrier memory instruction was found, chain instructions that follow

- // will not be added to the valid prefix.

- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))

- break;

- // Check (in BB order) if any instruction prevents ChainInstr from being

- // vectorized. Find and store the first such "conflicting" instruction.

- for (Instruction *MemInstr : MemoryInstrs) {

- // If a barrier memory instruction was found, do not check past it.

- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))

- break;

- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);

- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);

- if (MemLoad && ChainLoad)

- continue;

- // We can ignore the alias if the we have a load store pair and the load

- // is known to be invariant. The load cannot be clobbered by the store.

- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {

- return LI->hasMetadata(LLVMContext::MD_invariant_load);

- };

- if (IsLoadChain) {

- // We can ignore the alias as long as the load comes before the store,

- // because that means we won't be moving the load past the store to

- // vectorize it (the vectorized load is inserted at the location of the

- // first load in the chain).

- if (ChainInstr->comesBefore(MemInstr) ||

- (ChainLoad && IsInvariantLoad(ChainLoad)))

- continue;

- } else {

- // Same case, but in reverse.

- if (MemInstr->comesBefore(ChainInstr) ||

- (MemLoad && IsInvariantLoad(MemLoad)))

- continue;

- }

- ModRefInfo MR =

- AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));

- if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {

- LLVM_DEBUG({

- dbgs() << "LSV: Found alias:\n"

- " Aliasing instruction:\n"

- << " " << *MemInstr << '\n'

- << " Aliased instruction and pointer:\n"

- << " " << *ChainInstr << '\n'

- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';

- });

- // Save this aliasing memory instruction as a barrier, but allow other

- // instructions that precede the barrier to be vectorized with this one.

- BarrierMemoryInstr = MemInstr;

- break;

- }

- // Continue the search only for store chains, since vectorizing stores that

- // precede an aliasing load is valid. Conversely, vectorizing loads is valid

- // up to an aliasing store, but should not pull loads from further down in

- // the basic block.

- if (IsLoadChain && BarrierMemoryInstr) {

- // The BarrierMemoryInstr is a store that precedes ChainInstr.

- assert(BarrierMemoryInstr->comesBefore(ChainInstr));

- break;

- }

- // Find the largest prefix of Chain whose elements are all in

- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of

- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB

- // order.)

- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(

- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);

- unsigned ChainIdx = 0;

- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {

- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))

- break;

- }

- return Chain.slice(0, ChainIdx);

-static ChainID getChainID(const Value *Ptr) {

- const Value *ObjPtr = getUnderlyingObject(Ptr);

- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {

- // The select's themselves are distinct instructions even if they share the

- // same condition and evaluate to consecutive pointers for true and false

- // values of the condition. Therefore using the select's themselves for

- // grouping instructions would put consecutive accesses into different lists

- // and they won't be even checked for being consecutive, and won't be

- // vectorized.

- return Sel->getCondition();

- }

- return ObjPtr;

-std::pair<InstrListMap, InstrListMap>

-Vectorizer::collectInstructions(BasicBlock *BB) {

- InstrListMap LoadRefs;

- InstrListMap StoreRefs;

- for (Instruction &I : *BB) {

- if (!I.mayReadOrWriteMemory())

+ Type *Ty = getLoadStoreType(&I);

+ if (!VectorType::isValidElementType(Ty->getScalarType()))

continue;

- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {

- if (!LI->isSimple())

- continue;

- // Skip if it's not legal.

- if (!TTI.isLegalToVectorizeLoad(LI))

- continue;

- Type *Ty = LI->getType();

- if (!VectorType::isValidElementType(Ty->getScalarType()))

- continue;

- // Skip weird non-byte sizes. They probably aren't worth the effort of

- // handling correctly.

- unsigned TySize = DL.getTypeSizeInBits(Ty);

- if ((TySize % 8) != 0)

- continue;

- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain

- // functions are currently using an integer type for the vectorized

- // load/store, and does not support casting between the integer type and a

- // vector of pointers (e.g. i64 to <2 x i16*>)

- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())

- continue;

- Value *Ptr = LI->getPointerOperand();

- unsigned AS = Ptr->getType()->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / TySize;

- VectorType *VecTy = dyn_cast<VectorType>(Ty);

- // No point in looking at these if they're too big to vectorize.

- if (TySize > VecRegSize / 2 ||

- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))

- continue;

- // Save the load locations.

- const ChainID ID = getChainID(Ptr);

- LoadRefs[ID].push_back(LI);

- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {

- if (!SI->isSimple())

- continue;

- // Skip if it's not legal.

- if (!TTI.isLegalToVectorizeStore(SI))

- continue;

- Type *Ty = SI->getValueOperand()->getType();

- if (!VectorType::isValidElementType(Ty->getScalarType()))

- continue;

- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain

- // functions are currently using an integer type for the vectorized

- // load/store, and does not support casting between the integer type and a

- // vector of pointers (e.g. i64 to <2 x i16*>)

- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())

- continue;

- // Skip weird non-byte sizes. They probably aren't worth the effort of

- // handling correctly.

- unsigned TySize = DL.getTypeSizeInBits(Ty);

- if ((TySize % 8) != 0)

- continue;

- Value *Ptr = SI->getPointerOperand();

- unsigned AS = Ptr->getType()->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / TySize;

- VectorType *VecTy = dyn_cast<VectorType>(Ty);

- // No point in looking at these if they're too big to vectorize.

- if (TySize > VecRegSize / 2 ||

- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))

- continue;

- // Save store location.

- const ChainID ID = getChainID(Ptr);

- StoreRefs[ID].push_back(SI);

- }

- return {LoadRefs, StoreRefs};

-bool Vectorizer::vectorizeChains(InstrListMap &Map) {

- bool Changed = false;

- for (const std::pair<ChainID, InstrList> &Chain : Map) {

- unsigned Size = Chain.second.size();

- if (Size < 2)

+ // Skip weird non-byte sizes. They probably aren't worth the effort of

+ // handling correctly.

+ unsigned TySize = DL.getTypeSizeInBits(Ty);

+ if ((TySize % 8) != 0)

continue;

- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

- // Process the stores in chunks of 64.

- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {

- unsigned Len = std::min<unsigned>(CE - CI, 64);

- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);

- Changed |= vectorizeInstructions(Chunk);

- }

- return Changed;

-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {

- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()

- << " instructions.\n");

- SmallVector<int, 16> Heads, Tails;

- int ConsecutiveChain[64];

- // Do a quadratic search on all of the given loads/stores and find all of the

- // pairs of loads/stores that follow each other.

- for (int i = 0, e = Instrs.size(); i < e; ++i) {

- ConsecutiveChain[i] = -1;

- for (int j = e - 1; j >= 0; --j) {

- if (i == j)

- continue;

- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {

- if (ConsecutiveChain[i] != -1) {

- int CurDistance = std::abs(ConsecutiveChain[i] - i);

- int NewDistance = std::abs(ConsecutiveChain[i] - j);

- if (j < i || NewDistance > CurDistance)

- continue; // Should not insert.

- }

+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain

+ // functions are currently using an integer type for the vectorized

+ // load/store, and does not support casting between the integer type and a

+ // vector of pointers (e.g. i64 to <2 x i16*>)

+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())

+ continue;

- Tails.push_back(j);

- Heads.push_back(i);

- ConsecutiveChain[i] = j;

- }

+ Value *Ptr = getLoadStorePointerOperand(&I);

+ unsigned AS = Ptr->getType()->getPointerAddressSpace();

+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- bool Changed = false;

- SmallPtrSet<Instruction *, 16> InstructionsProcessed;

+ unsigned VF = VecRegSize / TySize;

+ VectorType *VecTy = dyn_cast<VectorType>(Ty);

- for (int Head : Heads) {

- if (InstructionsProcessed.count(Instrs[Head]))

+ // Only handle power-of-two sized elements.

+ if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||

+ (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))

continue;

- bool LongerChainExists = false;

- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)

- if (Head == Tails[TIt] &&

- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {

- LongerChainExists = true;

- break;

- }

- if (LongerChainExists)

- continue;

- // We found an instr that starts a chain. Now follow the chain and try to

- // vectorize it.

- SmallVector<Instruction *, 16> Operands;

- int I = Head;

- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {

- if (InstructionsProcessed.count(Instrs[I]))

- break;

- Operands.push_back(Instrs[I]);

- I = ConsecutiveChain[I];

- }

- bool Vectorized = false;

- if (isa<LoadInst>(*Operands.begin()))

- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);

- else

- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

+ // No point in looking at these if they're too big to vectorize.

+ if (TySize > VecRegSize / 2 ||

+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))

+ continue;

- Changed |= Vectorized;

+ Ret[{getUnderlyingObject(Ptr), AS,

+ DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),

+ /*IsLoad=*/LI != nullptr}]

+ .push_back(&I);

}

- return Changed;

+ return Ret;

}

-bool Vectorizer::vectorizeStoreChain(

- ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {

- StoreInst *S0 = cast<StoreInst>(Chain[0]);

- // If the vector has an int element, default to int for the whole store.

- Type *StoreTy = nullptr;

- for (Instruction *I : Chain) {

- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();

- if (StoreTy->isIntOrIntVectorTy())

- break;

- if (StoreTy->isPtrOrPtrVectorTy()) {

- StoreTy = Type::getIntNTy(F.getParent()->getContext(),

- DL.getTypeSizeInBits(StoreTy));

- break;

- }

- assert(StoreTy && "Failed to find store type");

+std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {

+ if (Instrs.empty())

+ return {};

- unsigned Sz = DL.getTypeSizeInBits(StoreTy);

- unsigned AS = S0->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / Sz;

- unsigned ChainSize = Chain.size();

- Align Alignment = S0->getAlign();

+ unsigned AS = getLoadStoreAddressSpace(Instrs[0]);

+ unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

+#ifndef NDEBUG

+ // Check that Instrs is in BB order and all have the same addr space.

+ for (size_t I = 1; I < Instrs.size(); ++I) {

+ assert(Instrs[I - 1]->comesBefore(Instrs[I]));

+ assert(getLoadStoreAddressSpace(Instrs[I]) == AS);

}

+#endif

- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);

- if (NewChain.empty()) {

- // No vectorization possible.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- if (NewChain.size() == 1) {

- // Failed after the first instruction. Discard it and try the smaller chain.

- InstructionsProcessed->insert(NewChain.front());

- return false;

- }

- // Update Chain to the valid vectorizable subchain.

- Chain = NewChain;

- ChainSize = Chain.size();

- // Check if it's legal to vectorize this chain. If not, split the chain and

- // try again.

- unsigned EltSzInBytes = Sz / 8;

- unsigned SzInBytes = EltSzInBytes * ChainSize;

- FixedVectorType *VecTy;

- auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);

- if (VecStoreTy)

- VecTy = FixedVectorType::get(StoreTy->getScalarType(),

- Chain.size() * VecStoreTy->getNumElements());

- else

- VecTy = FixedVectorType::get(StoreTy, Chain.size());

- // If it's more than the max vector size or the target has a better

- // vector factor, break it into two pieces.

- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);

- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {

- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."

- " Creating two separate arrays.\n");

- bool Vectorized = false;

- Vectorized |=

- vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);

- Vectorized |=

- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);

- return Vectorized;

- }

- LLVM_DEBUG({

- dbgs() << "LSV: Stores to vectorize:\n";

- for (Instruction *I : Chain)

- dbgs() << " " << *I << "\n";

- });

- // We won't try again to vectorize the elements of the chain, regardless of

- // whether we succeed below.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- // If the store is going to be misaligned, don't vectorize it.

- unsigned RelativeSpeed;

- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {

- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {

- unsigned SpeedBefore;

- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);

- if (SpeedBefore > RelativeSpeed)

- return false;

- auto Chains = splitOddVectorElts(Chain, Sz);

- bool Vectorized = false;

- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);

- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);

- return Vectorized;

+ // Machinery to build an MRU-hashtable of Chains.

+ //

+ // (Ideally this could be done with MapVector, but as currently implemented,

+ // moving an element to the front of a MapVector is O(n).)

+ struct InstrListElem : ilist_node<InstrListElem>,

+ std::pair<Instruction *, Chain> {

+ explicit InstrListElem(Instruction *I)

+ : std::pair<Instruction *, Chain>(I, {}) {}

+ };

+ struct InstrListElemDenseMapInfo {

+ using PtrInfo = DenseMapInfo<InstrListElem *>;

+ using IInfo = DenseMapInfo<Instruction *>;

+ static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }

+ static InstrListElem *getTombstoneKey() {

+ return PtrInfo::getTombstoneKey();

}

- Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),

- Align(StackAdjustedAlignment),

- DL, S0, nullptr, &DT);

- if (NewAlign >= Alignment)

- Alignment = NewAlign;

- else

- return false;

- }

- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- bool Vectorized = false;

- Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);

- Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);

- return Vectorized;

- }

- BasicBlock::iterator First, Last;

- std::tie(First, Last) = getBoundaryInstrs(Chain);

- Builder.SetInsertPoint(&*Last);

- Value *Vec = PoisonValue::get(VecTy);

- if (VecStoreTy) {

- unsigned VecWidth = VecStoreTy->getNumElements();

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- StoreInst *Store = cast<StoreInst>(Chain[I]);

- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {

- unsigned NewIdx = J + I * VecWidth;

- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),

- Builder.getInt32(J));

- if (Extract->getType() != StoreTy->getScalarType())

- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

- Value *Insert =

- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));

- Vec = Insert;

- }

+ static unsigned getHashValue(const InstrListElem *E) {

+ return IInfo::getHashValue(E->first);

}

- } else {

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- StoreInst *Store = cast<StoreInst>(Chain[I]);

- Value *Extract = Store->getValueOperand();

- if (Extract->getType() != StoreTy->getScalarType())

- Extract =

- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

- Value *Insert =

- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));

- Vec = Insert;

+ static bool isEqual(const InstrListElem *A, const InstrListElem *B) {

+ if (A == getEmptyKey() || B == getEmptyKey())

+ return A == getEmptyKey() && B == getEmptyKey();

+ if (A == getTombstoneKey() || B == getTombstoneKey())

+ return A == getTombstoneKey() && B == getTombstoneKey();

+ return IInfo::isEqual(A->first, B->first);

}

- }

- StoreInst *SI = Builder.CreateAlignedStore(

- Vec,

- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),

- Alignment);

- propagateMetadata(SI, Chain);

- eraseInstructions(Chain);

- ++NumVectorInstructions;

- NumScalarsVectorized += Chain.size();

- return true;

-bool Vectorizer::vectorizeLoadChain(

- ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {

- LoadInst *L0 = cast<LoadInst>(Chain[0]);

- // If the vector has an int element, default to int for the whole load.

- Type *LoadTy = nullptr;

- for (const auto &V : Chain) {

- LoadTy = cast<LoadInst>(V)->getType();

- if (LoadTy->isIntOrIntVectorTy())

- break;

- if (LoadTy->isPtrOrPtrVectorTy()) {

- LoadTy = Type::getIntNTy(F.getParent()->getContext(),

- DL.getTypeSizeInBits(LoadTy));

- break;

+ };

+ SpecificBumpPtrAllocator<InstrListElem> Allocator;

+ simple_ilist<InstrListElem> MRU;

+ DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

+ // Compare each instruction in `instrs` to leader of the N most recently-used

+ // chains. This limits the O(n^2) behavior of this pass while also allowing

+ // us to build arbitrarily long chains.

+ for (Instruction *I : Instrs) {

+ constexpr int MaxChainsToTry = 64;

+ bool MatchFound = false;

+ auto ChainIter = MRU.begin();

+ for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();

+ ++J, ++ChainIter) {

+ std::optional<APInt> Offset = getConstantOffset(

+ getLoadStorePointerOperand(ChainIter->first),

+ getLoadStorePointerOperand(I),

+ /*ContextInst=*/

+ (ChainIter->first->comesBefore(I) ? I : ChainIter->first));

+ if (Offset.has_value()) {

+ // `Offset` might not have the expected number of bits, if e.g. AS has a

+ // different number of bits than opaque pointers.

+ ChainIter->second.push_back(ChainElem{I, Offset.value()});

+ // Move ChainIter to the front of the MRU list.

+ MRU.remove(*ChainIter);

+ MRU.push_front(*ChainIter);

+ MatchFound = true;

+ break;

+ }

}

- }

- assert(LoadTy && "Can't determine LoadInst type from chain");

- unsigned Sz = DL.getTypeSizeInBits(LoadTy);

- unsigned AS = L0->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / Sz;

- unsigned ChainSize = Chain.size();

- Align Alignment = L0->getAlign();

- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);

- if (NewChain.empty()) {

- // No vectorization possible.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- if (NewChain.size() == 1) {

- // Failed after the first instruction. Discard it and try the smaller chain.

- InstructionsProcessed->insert(NewChain.front());

- return false;

- }

- // Update Chain to the valid vectorizable subchain.

- Chain = NewChain;

- ChainSize = Chain.size();

- // Check if it's legal to vectorize this chain. If not, split the chain and

- // try again.

- unsigned EltSzInBytes = Sz / 8;

- unsigned SzInBytes = EltSzInBytes * ChainSize;

- VectorType *VecTy;

- auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);

- if (VecLoadTy)

- VecTy = FixedVectorType::get(LoadTy->getScalarType(),

- Chain.size() * VecLoadTy->getNumElements());

- else

- VecTy = FixedVectorType::get(LoadTy, Chain.size());

- // If it's more than the max vector size or the target has a better

- // vector factor, break it into two pieces.

- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);

- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {

- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."

- " Creating two separate arrays.\n");

- bool Vectorized = false;

- Vectorized |=

- vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);

- Vectorized |=

- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);

- return Vectorized;

- }

- // We won't try again to vectorize the elements of the chain, regardless of

- // whether we succeed below.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- // If the load is going to be misaligned, don't vectorize it.

- unsigned RelativeSpeed;

- if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {

- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {

- unsigned SpeedBefore;

- accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);

- if (SpeedBefore > RelativeSpeed)

- return false;

- auto Chains = splitOddVectorElts(Chain, Sz);

- bool Vectorized = false;

- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);

- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);

- return Vectorized;

+ if (!MatchFound) {

+ APInt ZeroOffset(ASPtrBits, 0);

+ InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);

+ E->second.push_back(ChainElem{I, ZeroOffset});

+ MRU.push_front(*E);

+ Chains.insert(E);

}

- Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),

- Align(StackAdjustedAlignment),

- DL, L0, nullptr, &DT);

- if (NewAlign >= Alignment)

- Alignment = NewAlign;

- else

- return false;

}

- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- bool Vectorized = false;

- Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);

- Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);

- return Vectorized;

- }

+ std::vector<Chain> Ret;

+ Ret.reserve(Chains.size());

+ // Iterate over MRU rather than Chains so the order is deterministic.

+ for (auto &E : MRU)

+ if (E.second.size() > 1)

+ Ret.push_back(std::move(E.second));

+ return Ret;

- LLVM_DEBUG({

- dbgs() << "LSV: Loads to vectorize:\n";

- for (Instruction *I : Chain)

- I->dump();

- });

+std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,

+ Instruction *ContextInst,

+ unsigned Depth) {

+ LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA

+ << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst

+ << ", Depth=" << Depth << "\n");

+ // We'll ultimately return a value of this bit width, even if computations

+ // happen in a different width.

+ unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());

+ APInt OffsetA(OrigBitWidth, 0);

+ APInt OffsetB(OrigBitWidth, 0);

+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);

+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))

+ return std::nullopt;

- // getVectorizablePrefix already computed getBoundaryInstrs. The value of

- // Last may have changed since then, but the value of First won't have. If it

- // matters, we could compute getBoundaryInstrs only once and reuse it here.

- BasicBlock::iterator First, Last;

- std::tie(First, Last) = getBoundaryInstrs(Chain);

- Builder.SetInsertPoint(&*First);

- Value *Bitcast =

- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));

- LoadInst *LI =

- Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));

- propagateMetadata(LI, Chain);

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- Value *CV = Chain[I];

- Value *V;

- if (VecLoadTy) {

- // Extract a subvector using shufflevector.

- unsigned VecWidth = VecLoadTy->getNumElements();

- auto Mask =

- llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));

- V = Builder.CreateShuffleVector(LI, Mask, CV->getName());

- } else {

- V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());

- }

+ // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets

+ // should properly handle a possible overflow and the value should fit into

+ // the smallest data type used in the cast/gep chain.

+ assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&

+ OffsetB.getSignificantBits() <= NewPtrBitWidth);

- if (V->getType() != CV->getType()) {

- V = Builder.CreateBitOrPointerCast(V, CV->getType());

+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);

+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);

+ if (PtrA == PtrB)

+ return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

+ // Try to compute B - A.

+ const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));

+ if (DistScev != SE.getCouldNotCompute()) {

+ LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");

+ ConstantRange DistRange = SE.getSignedRange(DistScev);

+ if (DistRange.isSingleElement()) {

+ // Handle index width (the width of Dist) != pointer width (the width of

+ // the Offset*s at this point).

+ APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);

+ return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);

}

- // Replace the old instruction.

- CV->replaceAllUsesWith(V);

}

- // Since we might have opaque pointers we might end up using the pointer

- // operand of the first load (wrt. memory loaded) for the vector load. Since

- // this first load might not be the first in the block we potentially need to

- // reorder the pointer operand (and its operands). If we have a bitcast though

- // it might be before the load and should be the reorder start instruction.

- // "Might" because for opaque pointers the "bitcast" is just the first loads

- // pointer operand, as oppposed to something we inserted at the right position

- // ourselves.

- Instruction *BCInst = dyn_cast<Instruction>(Bitcast);

- reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);

- eraseInstructions(Chain);

- ++NumVectorInstructions;

- NumScalarsVectorized += Chain.size();

- return true;

-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,

- Align Alignment, unsigned &RelativeSpeed) {

- RelativeSpeed = 0;

- if (Alignment.value() % SzInBytes == 0)

- return false;

- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),

- SzInBytes * 8, AddressSpace,

- Alignment, &RelativeSpeed);

- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows

- << " with relative speed = " << RelativeSpeed << '\n';);

- return !Allows || !RelativeSpeed;

+ std::optional<APInt> Diff =

+ getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);

+ if (Diff.has_value())

+ return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))

+ .sextOrTrunc(OrigBitWidth);

+ return std::nullopt;

}