src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2019-12-20 19:53:05 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2019-12-20 19:53:05 +0000
commit	0b57cec536236d46e3dba9bd041533462f33dbb7 (patch)
tree	56229dbdbbf76d18580f72f789003db17246c8d9 /contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
parent	718ef55ec7785aae63f98f8ca05dc07ed399c16d (diff)

Notes

Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp')

-rw-r--r--

contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

1248

1 files changed, 0 insertions, 1248 deletions

diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
deleted file mode 100644
index 4273080ddd91..000000000000
--- a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ /dev/null

@@ -1,1248 +0,0 @@

-//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//

-//

-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

-// See https://llvm.org/LICENSE.txt for license information.

-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

-//

-//===----------------------------------------------------------------------===//

-//

-// This pass merges loads/stores to/from sequential memory addresses into vector

-// loads/stores. Although there's nothing GPU-specific in here, this pass is

-// motivated by the microarchitectural quirks of nVidia and AMD GPUs.

-//

-// (For simplicity below we talk about loads only, but everything also applies

-// to stores.)

-//

-// This pass is intended to be run late in the pipeline, after other

-// vectorization opportunities have been exploited. So the assumption here is

-// that immediately following our new vector load we'll need to extract out the

-// individual elements of the load, so we can operate on them individually.

-//

-// On CPUs this transformation is usually not beneficial, because extracting the

-// elements of a vector register is expensive on most architectures. It's

-// usually better just to load each element individually into its own scalar

-// register.

-//

-// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a

-// "vector load" loads directly into a series of scalar registers. In effect,

-// extracting the elements of the vector is free. It's therefore always

-// beneficial to vectorize a sequence of loads on these architectures.

-//

-// Vectorizing (perhaps a better name might be "coalescing") loads can have

-// large performance impacts on GPU kernels, and opportunities for vectorizing

-// are common in GPU code. This pass tries very hard to find such

-// opportunities; its runtime is quadratic in the number of loads in a BB.

-//

-// Some CPU architectures, such as ARM, have instructions that load into

-// multiple scalar registers, similar to a GPU vectorized load. In theory ARM

-// could use this pass (with some modifications), but currently it implements

-// its own pass to do something similar to what we do here.

-#include "llvm/ADT/APInt.h"

-#include "llvm/ADT/ArrayRef.h"

-#include "llvm/ADT/MapVector.h"

-#include "llvm/ADT/PostOrderIterator.h"

-#include "llvm/ADT/STLExtras.h"

-#include "llvm/ADT/SmallPtrSet.h"

-#include "llvm/ADT/SmallVector.h"

-#include "llvm/ADT/Statistic.h"

-#include "llvm/ADT/iterator_range.h"

-#include "llvm/Analysis/AliasAnalysis.h"

-#include "llvm/Analysis/MemoryLocation.h"

-#include "llvm/Analysis/OrderedBasicBlock.h"

-#include "llvm/Analysis/ScalarEvolution.h"

-#include "llvm/Analysis/TargetTransformInfo.h"

-#include "llvm/Transforms/Utils/Local.h"

-#include "llvm/Analysis/ValueTracking.h"

-#include "llvm/Analysis/VectorUtils.h"

-#include "llvm/IR/Attributes.h"

-#include "llvm/IR/BasicBlock.h"

-#include "llvm/IR/Constants.h"

-#include "llvm/IR/DataLayout.h"

-#include "llvm/IR/DerivedTypes.h"

-#include "llvm/IR/Dominators.h"

-#include "llvm/IR/Function.h"

-#include "llvm/IR/IRBuilder.h"

-#include "llvm/IR/InstrTypes.h"

-#include "llvm/IR/Instruction.h"

-#include "llvm/IR/Instructions.h"

-#include "llvm/IR/IntrinsicInst.h"

-#include "llvm/IR/Module.h"

-#include "llvm/IR/Type.h"

-#include "llvm/IR/User.h"

-#include "llvm/IR/Value.h"

-#include "llvm/Pass.h"

-#include "llvm/Support/Casting.h"

-#include "llvm/Support/Debug.h"

-#include "llvm/Support/KnownBits.h"

-#include "llvm/Support/MathExtras.h"

-#include "llvm/Support/raw_ostream.h"

-#include "llvm/Transforms/Vectorize.h"

-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"

-#include <algorithm>

-#include <cassert>

-#include <cstdlib>

-#include <tuple>

-#include <utility>

-using namespace llvm;

-#define DEBUG_TYPE "load-store-vectorizer"

-STATISTIC(NumVectorInstructions, "Number of vector accesses generated");

-STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

-// FIXME: Assuming stack alignment of 4 is always good enough

-static const unsigned StackAdjustedAlignment = 4;

-namespace {

-/// ChainID is an arbitrary token that is allowed to be different only for the

-/// accesses that are guaranteed to be considered non-consecutive by

-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions

-/// together and reducing the number of instructions the main search operates on

-/// at a time, i.e. this is to reduce compile time and nothing else as the main

-/// search has O(n^2) time complexity. The underlying type of ChainID should not

-/// be relied upon.

-using ChainID = const Value *;

-using InstrList = SmallVector<Instruction *, 8>;

-using InstrListMap = MapVector<ChainID, InstrList>;

-class Vectorizer {

- Function &F;

- AliasAnalysis &AA;

- DominatorTree &DT;

- ScalarEvolution &SE;

- TargetTransformInfo &TTI;

- const DataLayout &DL;

- IRBuilder<> Builder;

-public:

- Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,

- ScalarEvolution &SE, TargetTransformInfo &TTI)

- : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),

- DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

- bool run();

-private:

- unsigned getPointerAddressSpace(Value *I);

- unsigned getAlignment(LoadInst *LI) const {

- unsigned Align = LI->getAlignment();

- if (Align != 0)

- return Align;

- return DL.getABITypeAlignment(LI->getType());

- }

- unsigned getAlignment(StoreInst *SI) const {

- unsigned Align = SI->getAlignment();

- if (Align != 0)

- return Align;

- return DL.getABITypeAlignment(SI->getValueOperand()->getType());

- }

- static const unsigned MaxDepth = 3;

- bool isConsecutiveAccess(Value *A, Value *B);

- bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,

- unsigned Depth = 0) const;

- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,

- unsigned Depth) const;

- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,

- unsigned Depth) const;

- /// After vectorization, reorder the instructions that I depends on

- /// (the instructions defining its operands), to ensure they dominate I.

- void reorder(Instruction *I);

- /// Returns the first and the last instructions in Chain.

- std::pair<BasicBlock::iterator, BasicBlock::iterator>

- getBoundaryInstrs(ArrayRef<Instruction *> Chain);

- /// Erases the original instructions after vectorizing.

- void eraseInstructions(ArrayRef<Instruction *> Chain);

- /// "Legalize" the vector type that would be produced by combining \p

- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the

- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is

- /// expected to have more than 4 elements.

- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>

- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

- /// Finds the largest prefix of Chain that's vectorizable, checking for

- /// intervening instructions which may affect the memory accessed by the

- /// instructions within Chain.

- ///

- /// The elements of \p Chain must be all loads or all stores and must be in

- /// address order.

- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

- /// Collects load and store instructions to vectorize.

- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

- /// Processes the collected instructions, the \p Map. The values of \p Map

- /// should be all loads or all stores.

- bool vectorizeChains(InstrListMap &Map);

- /// Finds the load/stores to consecutive memory addresses and vectorizes them.

- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

- /// Vectorizes the load instructions in Chain.

- bool

- vectorizeLoadChain(ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

- /// Vectorizes the store instructions in Chain.

- bool

- vectorizeStoreChain(ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

- /// Check if this load/store access is misaligned accesses.

- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,

- unsigned Alignment);

-};

-class LoadStoreVectorizerLegacyPass : public FunctionPass {

-public:

- static char ID;

- LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {

- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());

- }

- bool runOnFunction(Function &F) override;

- StringRef getPassName() const override {

- return "GPU Load and Store Vectorizer";

- }

- void getAnalysisUsage(AnalysisUsage &AU) const override {

- AU.addRequired<AAResultsWrapperPass>();

- AU.addRequired<ScalarEvolutionWrapperPass>();

- AU.addRequired<DominatorTreeWrapperPass>();

- AU.addRequired<TargetTransformInfoWrapperPass>();

- AU.setPreservesCFG();

- }

-};

-} // end anonymous namespace

-char LoadStoreVectorizerLegacyPass::ID = 0;

-INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,

- "Vectorize load and Store instructions", false, false)

-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)

-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)

-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)

-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)

-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)

-INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,

- "Vectorize load and store instructions", false, false)

-Pass *llvm::createLoadStoreVectorizerPass() {

- return new LoadStoreVectorizerLegacyPass();

-bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {

- // Don't vectorize when the attribute NoImplicitFloat is used.

- if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))

- return false;

- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();

- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();

- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

- TargetTransformInfo &TTI =

- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

- Vectorizer V(F, AA, DT, SE, TTI);

- return V.run();

-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {

- // Don't vectorize when the attribute NoImplicitFloat is used.

- if (F.hasFnAttribute(Attribute::NoImplicitFloat))

- return PreservedAnalyses::all();

- AliasAnalysis &AA = AM.getResult<AAManager>(F);

- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);

- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);

- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

- Vectorizer V(F, AA, DT, SE, TTI);

- bool Changed = V.run();

- PreservedAnalyses PA;

- PA.preserveSet<CFGAnalyses>();

- return Changed ? PA : PreservedAnalyses::all();

-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in

-// vectors of Instructions.

-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {

- SmallVector<Value *, 8> VL(IL.begin(), IL.end());

- propagateMetadata(I, VL);

-// Vectorizer Implementation

-bool Vectorizer::run() {

- bool Changed = false;

- // Scan the blocks in the function in post order.

- for (BasicBlock *BB : post_order(&F)) {

- InstrListMap LoadRefs, StoreRefs;

- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);

- Changed |= vectorizeChains(LoadRefs);

- Changed |= vectorizeChains(StoreRefs);

- }

- return Changed;

-unsigned Vectorizer::getPointerAddressSpace(Value *I) {

- if (LoadInst *L = dyn_cast<LoadInst>(I))

- return L->getPointerAddressSpace();

- if (StoreInst *S = dyn_cast<StoreInst>(I))

- return S->getPointerAddressSpace();

- return -1;

-// FIXME: Merge with llvm::isConsecutiveAccess

-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {

- Value *PtrA = getLoadStorePointerOperand(A);

- Value *PtrB = getLoadStorePointerOperand(B);

- unsigned ASA = getPointerAddressSpace(A);

- unsigned ASB = getPointerAddressSpace(B);

- // Check that the address spaces match and that the pointers are valid.

- if (!PtrA || !PtrB || (ASA != ASB))

- return false;

- // Make sure that A and B are different pointers of the same size type.

- Type *PtrATy = PtrA->getType()->getPointerElementType();

- Type *PtrBTy = PtrB->getType()->getPointerElementType();

- if (PtrA == PtrB ||

- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||

- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||

- DL.getTypeStoreSize(PtrATy->getScalarType()) !=

- DL.getTypeStoreSize(PtrBTy->getScalarType()))

- return false;

- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);

- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

- return areConsecutivePointers(PtrA, PtrB, Size);

-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,

- const APInt &PtrDelta,

- unsigned Depth) const {

- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());

- APInt OffsetA(PtrBitWidth, 0);

- APInt OffsetB(PtrBitWidth, 0);

- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);

- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

- APInt OffsetDelta = OffsetB - OffsetA;

- // Check if they are based on the same pointer. That makes the offsets

- // sufficient.

- if (PtrA == PtrB)

- return OffsetDelta == PtrDelta;

- // Compute the necessary base pointer delta to have the necessary final delta

- // equal to the pointer delta requested.

- APInt BaseDelta = PtrDelta - OffsetDelta;

- // Compute the distance with SCEV between the base pointers.

- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);

- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);

- const SCEV *C = SE.getConstant(BaseDelta);

- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);

- if (X == PtrSCEVB)

- return true;

- // The above check will not catch the cases where one of the pointers is

- // factorized but the other one is not, such as (C + (S * (A + B))) vs

- // (AS + BS). Get the minus scev. That will allow re-combining the expresions

- // and getting the simplified difference.

- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);

- if (C == Dist)

- return true;

- // Sometimes even this doesn't work, because SCEV can't always see through

- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking

- // things the hard way.

- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);

-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,

- APInt PtrDelta,

- unsigned Depth) const {

- auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);

- auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);

- if (!GEPA || !GEPB)

- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

- // Look through GEPs after checking they're the same except for the last

- // index.

- if (GEPA->getNumOperands() != GEPB->getNumOperands() ||

- GEPA->getPointerOperand() != GEPB->getPointerOperand())

- return false;

- gep_type_iterator GTIA = gep_type_begin(GEPA);

- gep_type_iterator GTIB = gep_type_begin(GEPB);

- for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {

- if (GTIA.getOperand() != GTIB.getOperand())

- return false;

- ++GTIA;

- ++GTIB;

- }

- Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());

- Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());

- if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||

- OpA->getType() != OpB->getType())

- return false;

- if (PtrDelta.isNegative()) {

- if (PtrDelta.isMinSignedValue())

- return false;

- PtrDelta.negate();

- std::swap(OpA, OpB);

- }

- uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());

- if (PtrDelta.urem(Stride) != 0)

- return false;

- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();

- APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);

- // Only look through a ZExt/SExt.

- if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))

- return false;

- bool Signed = isa<SExtInst>(OpA);

- // At this point A could be a function parameter, i.e. not an instruction

- Value *ValA = OpA->getOperand(0);

- OpB = dyn_cast<Instruction>(OpB->getOperand(0));

- if (!OpB || ValA->getType() != OpB->getType())

- return false;

- // Now we need to prove that adding IdxDiff to ValA won't overflow.

- bool Safe = false;

- // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to

- // ValA, we're okay.

- if (OpB->getOpcode() == Instruction::Add &&

- isa<ConstantInt>(OpB->getOperand(1)) &&

- IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {

- if (Signed)

- Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();

- else

- Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();

- }

- unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

- // Second attempt:

- // If all set bits of IdxDiff or any higher order bit other than the sign bit

- // are known to be zero in ValA, we can add Diff to it while guaranteeing no

- // overflow of any sort.

- if (!Safe) {

- OpA = dyn_cast<Instruction>(ValA);

- if (!OpA)

- return false;

- KnownBits Known(BitWidth);

- computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);

- APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());

- if (Signed)

- BitsAllowedToBeSet.clearBit(BitWidth - 1);

- if (BitsAllowedToBeSet.ult(IdxDiff))

- return false;

- }

- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);

- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);

- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));

- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);

- return X == OffsetSCEVB;

-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,

- const APInt &PtrDelta,

- unsigned Depth) const {

- if (Depth++ == MaxDepth)

- return false;

- if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {

- if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {

- return SelectA->getCondition() == SelectB->getCondition() &&

- areConsecutivePointers(SelectA->getTrueValue(),

- SelectB->getTrueValue(), PtrDelta, Depth) &&

- areConsecutivePointers(SelectA->getFalseValue(),

- SelectB->getFalseValue(), PtrDelta, Depth);

- }

- return false;

-void Vectorizer::reorder(Instruction *I) {

- OrderedBasicBlock OBB(I->getParent());

- SmallPtrSet<Instruction *, 16> InstructionsToMove;

- SmallVector<Instruction *, 16> Worklist;

- Worklist.push_back(I);

- while (!Worklist.empty()) {

- Instruction *IW = Worklist.pop_back_val();

- int NumOperands = IW->getNumOperands();

- for (int i = 0; i < NumOperands; i++) {

- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));

- if (!IM || IM->getOpcode() == Instruction::PHI)

- continue;

- // If IM is in another BB, no need to move it, because this pass only

- // vectorizes instructions within one BB.

- if (IM->getParent() != I->getParent())

- continue;

- if (!OBB.dominates(IM, I)) {

- InstructionsToMove.insert(IM);

- Worklist.push_back(IM);

- }

- // All instructions to move should follow I. Start from I, not from begin().

- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;

- ++BBI) {

- if (!InstructionsToMove.count(&*BBI))

- continue;

- Instruction *IM = &*BBI;

- --BBI;

- IM->removeFromParent();

- IM->insertBefore(I);

- }

-std::pair<BasicBlock::iterator, BasicBlock::iterator>

-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {

- Instruction *C0 = Chain[0];

- BasicBlock::iterator FirstInstr = C0->getIterator();

- BasicBlock::iterator LastInstr = C0->getIterator();

- BasicBlock *BB = C0->getParent();

- unsigned NumFound = 0;

- for (Instruction &I : *BB) {

- if (!is_contained(Chain, &I))

- continue;

- ++NumFound;

- if (NumFound == 1) {

- FirstInstr = I.getIterator();

- }

- if (NumFound == Chain.size()) {

- LastInstr = I.getIterator();

- break;

- }

- // Range is [first, last).

- return std::make_pair(FirstInstr, ++LastInstr);

-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {

- SmallVector<Instruction *, 16> Instrs;

- for (Instruction *I : Chain) {

- Value *PtrOperand = getLoadStorePointerOperand(I);

- assert(PtrOperand && "Instruction must have a pointer operand.");

- Instrs.push_back(I);

- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))

- Instrs.push_back(GEP);

- }

- // Erase instructions.

- for (Instruction *I : Instrs)

- if (I->use_empty())

- I->eraseFromParent();

-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>

-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,

- unsigned ElementSizeBits) {

- unsigned ElementSizeBytes = ElementSizeBits / 8;

- unsigned SizeBytes = ElementSizeBytes * Chain.size();

- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;

- if (NumLeft == Chain.size()) {

- if ((NumLeft & 1) == 0)

- NumLeft /= 2; // Split even in half

- else

- --NumLeft; // Split off last element

- } else if (NumLeft == 0)

- NumLeft = 1;

- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));

-ArrayRef<Instruction *>

-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {

- // These are in BB order, unlike Chain, which is in address order.

- SmallVector<Instruction *, 16> MemoryInstrs;

- SmallVector<Instruction *, 16> ChainInstrs;

- bool IsLoadChain = isa<LoadInst>(Chain[0]);

- LLVM_DEBUG({

- for (Instruction *I : Chain) {

- if (IsLoadChain)

- assert(isa<LoadInst>(I) &&

- "All elements of Chain must be loads, or all must be stores.");

- else

- assert(isa<StoreInst>(I) &&

- "All elements of Chain must be loads, or all must be stores.");

- }

- });

- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {

- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {

- if (!is_contained(Chain, &I))

- MemoryInstrs.push_back(&I);

- else

- ChainInstrs.push_back(&I);

- } else if (isa<IntrinsicInst>(&I) &&

- cast<IntrinsicInst>(&I)->getIntrinsicID() ==

- Intrinsic::sideeffect) {

- // Ignore llvm.sideeffect calls.

- } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {

- LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I

- << '\n');

- break;

- } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {

- LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I

- << '\n');

- break;

- }

- OrderedBasicBlock OBB(Chain[0]->getParent());

- // Loop until we find an instruction in ChainInstrs that we can't vectorize.

- unsigned ChainInstrIdx = 0;

- Instruction *BarrierMemoryInstr = nullptr;

- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {

- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

- // If a barrier memory instruction was found, chain instructions that follow

- // will not be added to the valid prefix.

- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))

- break;

- // Check (in BB order) if any instruction prevents ChainInstr from being

- // vectorized. Find and store the first such "conflicting" instruction.

- for (Instruction *MemInstr : MemoryInstrs) {

- // If a barrier memory instruction was found, do not check past it.

- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))

- break;

- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);

- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);

- if (MemLoad && ChainLoad)

- continue;

- // We can ignore the alias if the we have a load store pair and the load

- // is known to be invariant. The load cannot be clobbered by the store.

- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {

- return LI->getMetadata(LLVMContext::MD_invariant_load);

- };

- // We can ignore the alias as long as the load comes before the store,

- // because that means we won't be moving the load past the store to

- // vectorize it (the vectorized load is inserted at the location of the

- // first load in the chain).

- if (isa<StoreInst>(MemInstr) && ChainLoad &&

- (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))

- continue;

- // Same case, but in reverse.

- if (MemLoad && isa<StoreInst>(ChainInstr) &&

- (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))

- continue;

- if (!AA.isNoAlias(MemoryLocation::get(MemInstr),

- MemoryLocation::get(ChainInstr))) {

- LLVM_DEBUG({

- dbgs() << "LSV: Found alias:\n"

- " Aliasing instruction and pointer:\n"

- << " " << *MemInstr << '\n'

- << " " << *getLoadStorePointerOperand(MemInstr) << '\n'

- << " Aliased instruction and pointer:\n"

- << " " << *ChainInstr << '\n'

- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';

- });

- // Save this aliasing memory instruction as a barrier, but allow other

- // instructions that precede the barrier to be vectorized with this one.

- BarrierMemoryInstr = MemInstr;

- break;

- }

- // Continue the search only for store chains, since vectorizing stores that

- // precede an aliasing load is valid. Conversely, vectorizing loads is valid

- // up to an aliasing store, but should not pull loads from further down in

- // the basic block.

- if (IsLoadChain && BarrierMemoryInstr) {

- // The BarrierMemoryInstr is a store that precedes ChainInstr.

- assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));

- break;

- }

- // Find the largest prefix of Chain whose elements are all in

- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of

- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB

- // order.)

- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(

- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);

- unsigned ChainIdx = 0;

- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {

- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))

- break;

- }

- return Chain.slice(0, ChainIdx);

-static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {

- const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);

- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {

- // The select's themselves are distinct instructions even if they share the

- // same condition and evaluate to consecutive pointers for true and false

- // values of the condition. Therefore using the select's themselves for

- // grouping instructions would put consecutive accesses into different lists

- // and they won't be even checked for being consecutive, and won't be

- // vectorized.

- return Sel->getCondition();

- }

- return ObjPtr;

-std::pair<InstrListMap, InstrListMap>

-Vectorizer::collectInstructions(BasicBlock *BB) {

- InstrListMap LoadRefs;

- InstrListMap StoreRefs;

- for (Instruction &I : *BB) {

- if (!I.mayReadOrWriteMemory())

- continue;

- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {

- if (!LI->isSimple())

- continue;

- // Skip if it's not legal.

- if (!TTI.isLegalToVectorizeLoad(LI))

- continue;

- Type *Ty = LI->getType();

- if (!VectorType::isValidElementType(Ty->getScalarType()))

- continue;

- // Skip weird non-byte sizes. They probably aren't worth the effort of

- // handling correctly.

- unsigned TySize = DL.getTypeSizeInBits(Ty);

- if ((TySize % 8) != 0)

- continue;

- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain

- // functions are currently using an integer type for the vectorized

- // load/store, and does not support casting between the integer type and a

- // vector of pointers (e.g. i64 to <2 x i16*>)

- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())

- continue;

- Value *Ptr = LI->getPointerOperand();

- unsigned AS = Ptr->getType()->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / TySize;

- VectorType *VecTy = dyn_cast<VectorType>(Ty);

- // No point in looking at these if they're too big to vectorize.

- if (TySize > VecRegSize / 2 ||

- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))

- continue;

- // Make sure all the users of a vector are constant-index extracts.

- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {

- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);

- return EEI && isa<ConstantInt>(EEI->getOperand(1));

- }))

- continue;

- // Save the load locations.

- const ChainID ID = getChainID(Ptr, DL);

- LoadRefs[ID].push_back(LI);

- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {

- if (!SI->isSimple())

- continue;

- // Skip if it's not legal.

- if (!TTI.isLegalToVectorizeStore(SI))

- continue;

- Type *Ty = SI->getValueOperand()->getType();

- if (!VectorType::isValidElementType(Ty->getScalarType()))

- continue;

- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain

- // functions are currently using an integer type for the vectorized

- // load/store, and does not support casting between the integer type and a

- // vector of pointers (e.g. i64 to <2 x i16*>)

- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())

- continue;

- // Skip weird non-byte sizes. They probably aren't worth the effort of

- // handling correctly.

- unsigned TySize = DL.getTypeSizeInBits(Ty);

- if ((TySize % 8) != 0)

- continue;

- Value *Ptr = SI->getPointerOperand();

- unsigned AS = Ptr->getType()->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / TySize;

- VectorType *VecTy = dyn_cast<VectorType>(Ty);

- // No point in looking at these if they're too big to vectorize.

- if (TySize > VecRegSize / 2 ||

- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))

- continue;

- if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {

- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);

- return EEI && isa<ConstantInt>(EEI->getOperand(1));

- }))

- continue;

- // Save store location.

- const ChainID ID = getChainID(Ptr, DL);

- StoreRefs[ID].push_back(SI);

- }

- return {LoadRefs, StoreRefs};

-bool Vectorizer::vectorizeChains(InstrListMap &Map) {

- bool Changed = false;

- for (const std::pair<ChainID, InstrList> &Chain : Map) {

- unsigned Size = Chain.second.size();

- if (Size < 2)

- continue;

- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

- // Process the stores in chunks of 64.

- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {

- unsigned Len = std::min<unsigned>(CE - CI, 64);

- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);

- Changed |= vectorizeInstructions(Chunk);

- }

- return Changed;

-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {

- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()

- << " instructions.\n");

- SmallVector<int, 16> Heads, Tails;

- int ConsecutiveChain[64];

- // Do a quadratic search on all of the given loads/stores and find all of the

- // pairs of loads/stores that follow each other.

- for (int i = 0, e = Instrs.size(); i < e; ++i) {

- ConsecutiveChain[i] = -1;

- for (int j = e - 1; j >= 0; --j) {

- if (i == j)

- continue;

- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {

- if (ConsecutiveChain[i] != -1) {

- int CurDistance = std::abs(ConsecutiveChain[i] - i);

- int NewDistance = std::abs(ConsecutiveChain[i] - j);

- if (j < i || NewDistance > CurDistance)

- continue; // Should not insert.

- }

- Tails.push_back(j);

- Heads.push_back(i);

- ConsecutiveChain[i] = j;

- }

- bool Changed = false;

- SmallPtrSet<Instruction *, 16> InstructionsProcessed;

- for (int Head : Heads) {

- if (InstructionsProcessed.count(Instrs[Head]))

- continue;

- bool LongerChainExists = false;

- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)

- if (Head == Tails[TIt] &&

- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {

- LongerChainExists = true;

- break;

- }

- if (LongerChainExists)

- continue;

- // We found an instr that starts a chain. Now follow the chain and try to

- // vectorize it.

- SmallVector<Instruction *, 16> Operands;

- int I = Head;

- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {

- if (InstructionsProcessed.count(Instrs[I]))

- break;

- Operands.push_back(Instrs[I]);

- I = ConsecutiveChain[I];

- }

- bool Vectorized = false;

- if (isa<LoadInst>(*Operands.begin()))

- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);

- else

- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

- Changed |= Vectorized;

- }

- return Changed;

-bool Vectorizer::vectorizeStoreChain(

- ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {

- StoreInst *S0 = cast<StoreInst>(Chain[0]);

- // If the vector has an int element, default to int for the whole store.

- Type *StoreTy = nullptr;

- for (Instruction *I : Chain) {

- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();

- if (StoreTy->isIntOrIntVectorTy())

- break;

- if (StoreTy->isPtrOrPtrVectorTy()) {

- StoreTy = Type::getIntNTy(F.getParent()->getContext(),

- DL.getTypeSizeInBits(StoreTy));

- break;

- }

- assert(StoreTy && "Failed to find store type");

- unsigned Sz = DL.getTypeSizeInBits(StoreTy);

- unsigned AS = S0->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / Sz;

- unsigned ChainSize = Chain.size();

- unsigned Alignment = getAlignment(S0);

- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);

- if (NewChain.empty()) {

- // No vectorization possible.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- if (NewChain.size() == 1) {

- // Failed after the first instruction. Discard it and try the smaller chain.

- InstructionsProcessed->insert(NewChain.front());

- return false;

- }

- // Update Chain to the valid vectorizable subchain.

- Chain = NewChain;

- ChainSize = Chain.size();

- // Check if it's legal to vectorize this chain. If not, split the chain and

- // try again.

- unsigned EltSzInBytes = Sz / 8;

- unsigned SzInBytes = EltSzInBytes * ChainSize;

- VectorType *VecTy;

- VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);

- if (VecStoreTy)

- VecTy = VectorType::get(StoreTy->getScalarType(),

- Chain.size() * VecStoreTy->getNumElements());

- else

- VecTy = VectorType::get(StoreTy, Chain.size());

- // If it's more than the max vector size or the target has a better

- // vector factor, break it into two pieces.

- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);

- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {

- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."

- " Creating two separate arrays.\n");

- return vectorizeStoreChain(Chain.slice(0, TargetVF),

- InstructionsProcessed) |

- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);

- }

- LLVM_DEBUG({

- dbgs() << "LSV: Stores to vectorize:\n";

- for (Instruction *I : Chain)

- dbgs() << " " << *I << "\n";

- });

- // We won't try again to vectorize the elements of the chain, regardless of

- // whether we succeed below.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- // If the store is going to be misaligned, don't vectorize it.

- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {

- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |

- vectorizeStoreChain(Chains.second, InstructionsProcessed);

- }

- unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),

- StackAdjustedAlignment,

- DL, S0, nullptr, &DT);

- if (NewAlign != 0)

- Alignment = NewAlign;

- }

- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |

- vectorizeStoreChain(Chains.second, InstructionsProcessed);

- }

- BasicBlock::iterator First, Last;

- std::tie(First, Last) = getBoundaryInstrs(Chain);

- Builder.SetInsertPoint(&*Last);

- Value *Vec = UndefValue::get(VecTy);

- if (VecStoreTy) {

- unsigned VecWidth = VecStoreTy->getNumElements();

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- StoreInst *Store = cast<StoreInst>(Chain[I]);

- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {

- unsigned NewIdx = J + I * VecWidth;

- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),

- Builder.getInt32(J));

- if (Extract->getType() != StoreTy->getScalarType())

- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

- Value *Insert =

- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));

- Vec = Insert;

- }

- } else {

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- StoreInst *Store = cast<StoreInst>(Chain[I]);

- Value *Extract = Store->getValueOperand();

- if (Extract->getType() != StoreTy->getScalarType())

- Extract =

- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

- Value *Insert =

- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));

- Vec = Insert;

- }

- StoreInst *SI = Builder.CreateAlignedStore(

- Vec,

- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),

- Alignment);

- propagateMetadata(SI, Chain);

- eraseInstructions(Chain);

- ++NumVectorInstructions;

- NumScalarsVectorized += Chain.size();

- return true;

-bool Vectorizer::vectorizeLoadChain(

- ArrayRef<Instruction *> Chain,

- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {

- LoadInst *L0 = cast<LoadInst>(Chain[0]);

- // If the vector has an int element, default to int for the whole load.

- Type *LoadTy;

- for (const auto &V : Chain) {

- LoadTy = cast<LoadInst>(V)->getType();

- if (LoadTy->isIntOrIntVectorTy())

- break;

- if (LoadTy->isPtrOrPtrVectorTy()) {

- LoadTy = Type::getIntNTy(F.getParent()->getContext(),

- DL.getTypeSizeInBits(LoadTy));

- break;

- }

- unsigned Sz = DL.getTypeSizeInBits(LoadTy);

- unsigned AS = L0->getPointerAddressSpace();

- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

- unsigned VF = VecRegSize / Sz;

- unsigned ChainSize = Chain.size();

- unsigned Alignment = getAlignment(L0);

- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);

- if (NewChain.empty()) {

- // No vectorization possible.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- return false;

- }

- if (NewChain.size() == 1) {

- // Failed after the first instruction. Discard it and try the smaller chain.

- InstructionsProcessed->insert(NewChain.front());

- return false;

- }

- // Update Chain to the valid vectorizable subchain.

- Chain = NewChain;

- ChainSize = Chain.size();

- // Check if it's legal to vectorize this chain. If not, split the chain and

- // try again.

- unsigned EltSzInBytes = Sz / 8;

- unsigned SzInBytes = EltSzInBytes * ChainSize;

- VectorType *VecTy;

- VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);

- if (VecLoadTy)

- VecTy = VectorType::get(LoadTy->getScalarType(),

- Chain.size() * VecLoadTy->getNumElements());

- else

- VecTy = VectorType::get(LoadTy, Chain.size());

- // If it's more than the max vector size or the target has a better

- // vector factor, break it into two pieces.

- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);

- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {

- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."

- " Creating two separate arrays.\n");

- return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |

- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);

- }

- // We won't try again to vectorize the elements of the chain, regardless of

- // whether we succeed below.

- InstructionsProcessed->insert(Chain.begin(), Chain.end());

- // If the load is going to be misaligned, don't vectorize it.

- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {

- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |

- vectorizeLoadChain(Chains.second, InstructionsProcessed);

- }

- Alignment = getOrEnforceKnownAlignment(

- L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);

- }

- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {

- auto Chains = splitOddVectorElts(Chain, Sz);

- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |

- vectorizeLoadChain(Chains.second, InstructionsProcessed);

- }

- LLVM_DEBUG({

- dbgs() << "LSV: Loads to vectorize:\n";

- for (Instruction *I : Chain)

- I->dump();

- });

- // getVectorizablePrefix already computed getBoundaryInstrs. The value of

- // Last may have changed since then, but the value of First won't have. If it

- // matters, we could compute getBoundaryInstrs only once and reuse it here.

- BasicBlock::iterator First, Last;

- std::tie(First, Last) = getBoundaryInstrs(Chain);

- Builder.SetInsertPoint(&*First);

- Value *Bitcast =

- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));

- LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);

- propagateMetadata(LI, Chain);

- if (VecLoadTy) {

- SmallVector<Instruction *, 16> InstrsToErase;

- unsigned VecWidth = VecLoadTy->getNumElements();

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- for (auto Use : Chain[I]->users()) {

- // All users of vector loads are ExtractElement instructions with

- // constant indices, otherwise we would have bailed before now.

- Instruction *UI = cast<Instruction>(Use);

- unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();

- unsigned NewIdx = Idx + I * VecWidth;

- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),

- UI->getName());

- if (V->getType() != UI->getType())

- V = Builder.CreateBitCast(V, UI->getType());

- // Replace the old instruction.

- UI->replaceAllUsesWith(V);

- InstrsToErase.push_back(UI);

- }

- // Bitcast might not be an Instruction, if the value being loaded is a

- // constant. In that case, no need to reorder anything.

- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))

- reorder(BitcastInst);

- for (auto I : InstrsToErase)

- I->eraseFromParent();

- } else {

- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {

- Value *CV = Chain[I];

- Value *V =

- Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());

- if (V->getType() != CV->getType()) {

- V = Builder.CreateBitOrPointerCast(V, CV->getType());

- }

- // Replace the old instruction.

- CV->replaceAllUsesWith(V);

- }

- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))

- reorder(BitcastInst);

- }

- eraseInstructions(Chain);

- ++NumVectorInstructions;

- NumScalarsVectorized += Chain.size();

- return true;

-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,

- unsigned Alignment) {

- if (Alignment % SzInBytes == 0)

- return false;

- bool Fast = false;

- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),

- SzInBytes * 8, AddressSpace,

- Alignment, &Fast);

- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows

- << " and fast? " << Fast << "\n";);

- return !Allows || !Fast;