Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp         1264
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp   1241
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h       287
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp               7914
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp               7147
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h                126
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                        766
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                         1692
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h              40
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp             354
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h                71
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp           84
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h             35
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h                   44
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp              248
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.h                 74
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanSLP.cpp                     470
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h                     186
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp                132
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.h                   43
-rw-r--r--  llvm/lib/Transforms/Vectorize/Vectorize.cpp                     42
21 files changed, 22260 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
new file mode 100644
index 000000000000..f44976c723ec
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -0,0 +1,1264 @@
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
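+//
+// As an illustrative sketch (not a test case from the tree), given two
+// adjacent i32 loads such as:
+//
+//   %l0 = load i32, i32* %p, align 8
+//   %p1 = getelementptr i32, i32* %p, i64 1
+//   %l1 = load i32, i32* %p1, align 4
+//
+// the pass emits a single vector load followed by per-element extracts:
+//
+//   %vp = bitcast i32* %p to <2 x i32>*
+//   %v  = load <2 x i32>, <2 x i32>* %vp, align 8
+//   %l0 = extractelement <2 x i32> %v, i32 0
+//   %l1 = extractelement <2 x i32> %v, i32 1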
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-store-vectorizer"
+
+STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
+STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
+
+namespace {
+
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
+/// together and reducing the number of instructions the main search operates
+/// on at a time; i.e., it exists purely to reduce compile time, as the main
+/// search has O(n^2) time complexity. The underlying type of ChainID should
+/// not be relied upon.
+using ChainID = const Value *;
+using InstrList = SmallVector<Instruction *, 8>;
+using InstrListMap = MapVector<ChainID, InstrList>;
+
+class Vectorizer {
+ Function &F;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+
+public:
+ Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
+ ScalarEvolution &SE, TargetTransformInfo &TTI)
+ : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+ DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
+
+ bool run();
+
+private:
+ unsigned getPointerAddressSpace(Value *I);
+
+ unsigned getAlignment(LoadInst *LI) const {
+ unsigned Align = LI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(LI->getType());
+ }
+
+ unsigned getAlignment(StoreInst *SI) const {
+ unsigned Align = SI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
+
+ static const unsigned MaxDepth = 3;
+
+ bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
+
+ /// After vectorization, reorder the instructions that I depends on
+ /// (the instructions defining its operands), to ensure they dominate I.
+ void reorder(Instruction *I);
+
+ /// Returns the first and the last instructions in Chain.
+ std::pair<BasicBlock::iterator, BasicBlock::iterator>
+ getBoundaryInstrs(ArrayRef<Instruction *> Chain);
+
+ /// Erases the original instructions after vectorizing.
+ void eraseInstructions(ArrayRef<Instruction *> Chain);
+
+ /// "Legalize" the vector type that would be produced by combining \p
+ /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
+ /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
+ /// expected to have more than 4 elements.
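+ ///
+ /// For example (an illustrative worked case, not from a test): a chain of
+ /// seven i16 accesses is 14 bytes, so the first piece keeps
+ /// (14 - 14 % 4) / 2 = 6 elements (12 bytes) and the remainder piece keeps
+ /// the last element (2 bytes).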
+ std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+ splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
+
+ /// Finds the largest prefix of Chain that's vectorizable, checking for
+ /// intervening instructions which may affect the memory accessed by the
+ /// instructions within Chain.
+ ///
+ /// The elements of \p Chain must be all loads or all stores and must be in
+ /// address order.
+ ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
+
+ /// Collects load and store instructions to vectorize.
+ std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
+
+ /// Processes the collected instructions, the \p Map. The values of \p Map
+ /// should be all loads or all stores.
+ bool vectorizeChains(InstrListMap &Map);
+
+ /// Finds loads/stores to consecutive memory addresses and vectorizes them.
+ bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
+
+ /// Vectorizes the load instructions in Chain.
+ bool
+ vectorizeLoadChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Vectorizes the store instructions in Chain.
+ bool
+ vectorizeStoreChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Checks whether this load/store access is misaligned.
+ bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment);
+};
+
+class LoadStoreVectorizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
+ initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "GPU Load and Store Vectorizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char LoadStoreVectorizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and Store instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+
+Pass *llvm::createLoadStoreVectorizerPass() {
+ return new LoadStoreVectorizerLegacyPass();
+}
+
+bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ return V.run();
+}
+
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return PreservedAnalyses::all();
+
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ bool Changed = V.run();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return Changed ? PA : PreservedAnalyses::all();
+}
+
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+ SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+ propagateMetadata(I, VL);
+}
+
+// Vectorizer Implementation
+bool Vectorizer::run() {
+ bool Changed = false;
+
+ // Scan the blocks in the function in post order.
+ for (BasicBlock *BB : post_order(&F)) {
+ InstrListMap LoadRefs, StoreRefs;
+ std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
+ Changed |= vectorizeChains(LoadRefs);
+ Changed |= vectorizeChains(StoreRefs);
+ }
+
+ return Changed;
+}
+
+unsigned Vectorizer::getPointerAddressSpace(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+// FIXME: Merge with llvm::isConsecutiveAccess
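+//
+// As a minimal illustration (not from the tree): two i32 loads, one from %p
+// and one from getelementptr i32, i32* %p, i64 1, are consecutive, because
+// the second pointer is exactly one 4-byte element (the store size of i32)
+// past the first.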
+bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
+ unsigned ASA = getPointerAddressSpace(A);
+ unsigned ASB = getPointerAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same size type.
+ Type *PtrATy = PtrA->getType()->getPointerElementType();
+ Type *PtrBTy = PtrB->getType()->getPointerElementType();
+ if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
+ DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+ DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+ DL.getTypeStoreSize(PtrBTy->getScalarType()))
+ return false;
+
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ APInt PtrDelta, unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+
+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ return false;
+
+ // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
+ // should properly handle a possible overflow, and the value should fit into
+ // the smallest data type used in the cast/gep chain.
+ assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
+ OffsetB.getMinSignedBits() <= NewPtrBitWidth);
+
+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+ PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == PtrDelta;
+
+ // Compute the base pointer delta needed for the final delta to equal the
+ // requested pointer delta.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
+
+ // Compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *C = SE.getConstant(BaseDelta);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
+ if (X == PtrSCEVB)
+ return true;
+
+ // The above check will not catch the cases where one of the pointers is
+ // factorized but the other one is not, such as (C + (S * (A + B))) vs
+ // (AS + BS). Get the minus SCEV, which allows re-combining the expressions
+ // and getting the simplified difference.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
+ // Sometimes even this doesn't work, because SCEV can't always see through
+ // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+ // things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+
+ // Look through GEPs after checking they're the same except for the last
+ // index.
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
+ return false;
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
+ return false;
+ ++GTIA;
+ ++GTIB;
+ }
+
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+ if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+ OpA->getType() != OpB->getType())
+ return false;
+
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
+ // Only look through a ZExt/SExt.
+ if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+ return false;
+
+ bool Signed = isa<SExtInst>(OpA);
+
+ // At this point A could be a function parameter, i.e. not an instruction.
+ Value *ValA = OpA->getOperand(0);
+ OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+ if (!OpB || ValA->getType() != OpB->getType())
+ return false;
+
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
+ bool Safe = false;
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
+ if (OpB->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(OpB->getOperand(1)) &&
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
+ if (Signed)
+ Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
+ else
+ Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+ }
+
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+
+ // Second attempt:
+ // If all set bits of IdxDiff, and any higher-order bits other than the
+ // sign bit, are known to be zero in ValA, we can add IdxDiff to it while
+ // guaranteeing no overflow of any sort.
+ if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
+ KnownBits Known(BitWidth);
+ computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
+ }
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
+
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
+ return false;
+
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
+}
+
+void Vectorizer::reorder(Instruction *I) {
+ OrderedBasicBlock OBB(I->getParent());
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!OBB.dominates(IM, I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+ ++BBI) {
+ if (!InstructionsToMove.count(&*BBI))
+ continue;
+ Instruction *IM = &*BBI;
+ --BBI;
+ IM->removeFromParent();
+ IM->insertBefore(I);
+ }
+}
+
+std::pair<BasicBlock::iterator, BasicBlock::iterator>
+Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
+ Instruction *C0 = Chain[0];
+ BasicBlock::iterator FirstInstr = C0->getIterator();
+ BasicBlock::iterator LastInstr = C0->getIterator();
+
+ BasicBlock *BB = C0->getParent();
+ unsigned NumFound = 0;
+ for (Instruction &I : *BB) {
+ if (!is_contained(Chain, &I))
+ continue;
+
+ ++NumFound;
+ if (NumFound == 1) {
+ FirstInstr = I.getIterator();
+ }
+ if (NumFound == Chain.size()) {
+ LastInstr = I.getIterator();
+ break;
+ }
+ }
+
+ // Range is [first, last).
+ return std::make_pair(FirstInstr, ++LastInstr);
+}
+
+void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
+ SmallVector<Instruction *, 16> Instrs;
+ for (Instruction *I : Chain) {
+ Value *PtrOperand = getLoadStorePointerOperand(I);
+ assert(PtrOperand && "Instruction must have a pointer operand.");
+ Instrs.push_back(I);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
+ Instrs.push_back(GEP);
+ }
+
+ // Erase instructions.
+ for (Instruction *I : Instrs)
+ if (I->use_empty())
+ I->eraseFromParent();
+}
+
+std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
+ unsigned ElementSizeBits) {
+ unsigned ElementSizeBytes = ElementSizeBits / 8;
+ unsigned SizeBytes = ElementSizeBytes * Chain.size();
+ unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
+ NumLeft = 1;
+ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
+}
+
+ArrayRef<Instruction *>
+Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
+ // These are in BB order, unlike Chain, which is in address order.
+ SmallVector<Instruction *, 16> MemoryInstrs;
+ SmallVector<Instruction *, 16> ChainInstrs;
+
+ bool IsLoadChain = isa<LoadInst>(Chain[0]);
+ LLVM_DEBUG({
+ for (Instruction *I : Chain) {
+ if (IsLoadChain)
+ assert(isa<LoadInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ else
+ assert(isa<StoreInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ }
+ });
+
+ for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ if (!is_contained(Chain, &I))
+ MemoryInstrs.push_back(&I);
+ else
+ ChainInstrs.push_back(&I);
+ } else if (isa<IntrinsicInst>(&I) &&
+ cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+ Intrinsic::sideeffect) {
+ // Ignore llvm.sideeffect calls.
+ } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
+ break;
+ } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
+ break;
+ }
+ }
+
+ OrderedBasicBlock OBB(Chain[0]->getParent());
+
+ // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+ unsigned ChainInstrIdx = 0;
+ Instruction *BarrierMemoryInstr = nullptr;
+
+ for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
+ Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
+
+ // If a barrier memory instruction was found, chain instructions that follow
+ // will not be added to the valid prefix.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+ break;
+
+ // Check (in BB order) if any instruction prevents ChainInstr from being
+ // vectorized. Find and store the first such "conflicting" instruction.
+ for (Instruction *MemInstr : MemoryInstrs) {
+ // If a barrier memory instruction was found, do not check past it.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+ break;
+
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
+ continue;
+
+ // We can ignore the alias if we have a load/store pair and the load is
+ // known to be invariant: the load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
+ };
+
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
+ continue;
+
+ // Same case, but in reverse.
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
+ continue;
+
+ if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
+ MemoryLocation::get(ChainInstr))) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Found alias:\n"
+ " Aliasing instruction and pointer:\n"
+ << " " << *MemInstr << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainInstr << '\n'
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
+ });
+ // Save this aliasing memory instruction as a barrier, but allow other
+ // instructions that precede the barrier to be vectorized with this one.
+ BarrierMemoryInstr = MemInstr;
+ break;
+ }
+ }
+ // Continue the search only for store chains, since vectorizing stores that
+ // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+ // up to an aliasing store, but should not pull loads from further down in
+ // the basic block.
+ if (IsLoadChain && BarrierMemoryInstr) {
+ // The BarrierMemoryInstr is a store that precedes ChainInstr.
+ assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
+ break;
+ }
+ }
+
+ // Find the largest prefix of Chain whose elements are all in
+ // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+ // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+ // order.)
+ SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
+ ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
+ unsigned ChainIdx = 0;
+ for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+ if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
+ break;
+ }
+ return Chain.slice(0, ChainIdx);
+}
+
+static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
+ const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+ // The selects themselves are distinct instructions even if they share the
+ // same condition and evaluate to consecutive pointers for the true and
+ // false values of the condition. Therefore, using the selects themselves
+ // for grouping instructions would put consecutive accesses into different
+ // lists; such accesses would then never even be checked for being
+ // consecutive, and would not be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
+
+std::pair<InstrListMap, InstrListMap>
+Vectorizer::collectInstructions(BasicBlock *BB) {
+ InstrListMap LoadRefs;
+ InstrListMap StoreRefs;
+
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
+ continue;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeLoad(LI))
+ continue;
+
+ Type *Ty = LI->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+ // functions currently use an integer type for the vectorized load/store,
+ // and do not support casting between the integer type and a vector of
+ // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ Value *Ptr = LI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ // Make sure all the users of a vector are constant-index extracts.
+ if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save the load locations.
+ const ChainID ID = getChainID(Ptr, DL);
+ LoadRefs[ID].push_back(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeStore(SI))
+ continue;
+
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+ // functions currently use an integer type for the vectorized load/store,
+ // and do not support casting between the integer type and a vector of
+ // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+ Value *Ptr = SI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save store location.
+ const ChainID ID = getChainID(Ptr, DL);
+ StoreRefs[ID].push_back(SI);
+ }
+ }
+
+ return {LoadRefs, StoreRefs};
+}
+
+bool Vectorizer::vectorizeChains(InstrListMap &Map) {
+ bool Changed = false;
+
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
+ unsigned Size = Chain.second.size();
+ if (Size < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+
+ // Process the instructions in chunks of 64.
+ for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
+ unsigned Len = std::min<unsigned>(CE - CI, 64);
+ ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
+ Changed |= vectorizeInstructions(Chunk);
+ }
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
+ SmallVector<int, 16> Heads, Tails;
+ int ConsecutiveChain[64];
+
+ // Do a quadratic search on all of the given loads/stores and find all of the
+ // pairs of loads/stores that follow each other.
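+ //
+ // For instance (an illustrative trace, not from a test), with
+ // Instrs = [A, B, C] where B is one element past A and C is one element
+ // past B, the search records ConsecutiveChain[A] = B and
+ // ConsecutiveChain[B] = C, giving Heads = [A, B] and Tails = [B, C]. The
+ // loop below then vectorizes the chain starting at A (A -> B -> C) and
+ // skips head B, which already belongs to that longer chain.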
+ for (int i = 0, e = Instrs.size(); i < e; ++i) {
+ ConsecutiveChain[i] = -1;
+ for (int j = e - 1; j >= 0; --j) {
+ if (i == j)
+ continue;
+
+ if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+ if (ConsecutiveChain[i] != -1) {
+ int CurDistance = std::abs(ConsecutiveChain[i] - i);
+ int NewDistance = std::abs(ConsecutiveChain[i] - j);
+ if (j < i || NewDistance > CurDistance)
+ continue; // Should not insert.
+ }
+
+ Tails.push_back(j);
+ Heads.push_back(i);
+ ConsecutiveChain[i] = j;
+ }
+ }
+ }
+
+ bool Changed = false;
+ SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+
+ for (int Head : Heads) {
+ if (InstructionsProcessed.count(Instrs[Head]))
+ continue;
+ bool LongerChainExists = false;
+ for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
+ if (Head == Tails[TIt] &&
+ !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
+ LongerChainExists = true;
+ break;
+ }
+ if (LongerChainExists)
+ continue;
+
+ // We found an instr that starts a chain. Now follow the chain and try to
+ // vectorize it.
+ SmallVector<Instruction *, 16> Operands;
+ int I = Head;
+ while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
+ if (InstructionsProcessed.count(Instrs[I]))
+ break;
+
+ Operands.push_back(Instrs[I]);
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = false;
+ if (isa<LoadInst>(*Operands.begin()))
+ Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
+ else
+ Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeStoreChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ StoreInst *S0 = cast<StoreInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole store.
+ Type *StoreTy = nullptr;
+ for (Instruction *I : Chain) {
+ StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
+ if (StoreTy->isIntOrIntVectorTy())
+ break;
+
+ if (StoreTy->isPtrOrPtrVectorTy()) {
+ StoreTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(StoreTy));
+ break;
+ }
+ }
+ assert(StoreTy && "Failed to find store type");
+
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned AS = S0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(S0);
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+
+ VectorType *VecTy;
+ VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
+ if (VecStoreTy)
+ VecTy = VectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
+ else
+ VecTy = VectorType::get(StoreTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeStoreChain(Chain.slice(0, TargetVF),
+ InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Stores to vectorize:\n";
+ for (Instruction *I : Chain)
+ dbgs() << " " << *I << "\n";
+ });
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the store is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ StackAdjustedAlignment,
+ DL, S0, nullptr, &DT);
+ if (NewAlign != 0)
+ Alignment = NewAlign;
+ }
+
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*Last);
+
+ Value *Vec = UndefValue::get(VecTy);
+
+ if (VecStoreTy) {
+ unsigned VecWidth = VecStoreTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
+ unsigned NewIdx = J + I * VecWidth;
+ Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
+ Builder.getInt32(J));
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
+ Vec = Insert;
+ }
+ }
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ Value *Extract = Store->getValueOperand();
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract =
+ Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
+ Vec = Insert;
+ }
+ }
+
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+ Alignment);
+ propagateMetadata(SI, Chain);
+
+ eraseInstructions(Chain);
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::vectorizeLoadChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ LoadInst *L0 = cast<LoadInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole load.
+ Type *LoadTy = nullptr;
+ for (const auto &V : Chain) {
+ LoadTy = cast<LoadInst>(V)->getType();
+ if (LoadTy->isIntOrIntVectorTy())
+ break;
+
+ if (LoadTy->isPtrOrPtrVectorTy()) {
+ LoadTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(LoadTy));
+ break;
+ }
+ }
+ assert(LoadTy && "Can't determine LoadInst type from chain");
+
+ unsigned Sz = DL.getTypeSizeInBits(LoadTy);
+ unsigned AS = L0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(L0);
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ VectorType *VecTy;
+ VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
+ if (VecLoadTy)
+ VecTy = VectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
+ else
+ VecTy = VectorType::get(LoadTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the load is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ Alignment = getOrEnforceKnownAlignment(
+ L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
+ }
+
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Loads to vectorize:\n";
+ for (Instruction *I : Chain)
+ I->dump();
+ });
+
+ // getVectorizablePrefix already computed getBoundaryInstrs. The value of
+ // Last may have changed since then, but the value of First won't have. If it
+ // matters, we could compute getBoundaryInstrs only once and reuse it here.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*First);
+
+ Value *Bitcast =
+ Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
+ LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+ propagateMetadata(LI, Chain);
+
+ if (VecLoadTy) {
+ SmallVector<Instruction *, 16> InstrsToErase;
+
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ for (auto Use : Chain[I]->users()) {
+ // All users of vector loads are ExtractElement instructions with
+ // constant indices, otherwise we would have bailed before now.
+ Instruction *UI = cast<Instruction>(Use);
+ unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
+ unsigned NewIdx = Idx + I * VecWidth;
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
+ UI->getName());
+ if (V->getType() != UI->getType())
+ V = Builder.CreateBitCast(V, UI->getType());
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(V);
+ InstrsToErase.push_back(UI);
+ }
+ }
+
+ // Bitcast might not be an Instruction, if the pointer operand is a
+ // constant. In that case, there is no need to reorder anything.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V =
+ Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ }
+
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
+ }
+
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+ }
+
+ eraseInstructions(Chain);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment) {
+ if (Alignment % SzInBytes == 0)
+ return false;
+
+ bool Fast = false;
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+ SzInBytes * 8, AddressSpace,
+ Alignment, &Fast);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
+ return !Allows || !Fast;
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
new file mode 100644
index 000000000000..f43842be5357
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -0,0 +1,1241 @@
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+extern cl::opt<bool> EnableVPlanPredication;
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ case HK_PREDICATE:
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
+ bool InterleaveOnlyWhenForced,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
+ Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L),
+ ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+ LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
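+// After a loop is vectorized, it is tagged so that it is not vectorized
+// again. The resulting loop metadata looks like this (an illustrative
+// snippet; the node names !0 and !1 are arbitrary):
+//
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.isvectorized", i32 1}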
+void LoopVectorizeHints::setAlreadyVectorized() {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+ MDNode *IsVectorizedMD = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.isvectorized"),
+ ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+ MDNode *LoopID = TheLoop->getLoopID();
+ MDNode *NewLoopID =
+ makePostTransformationMetadata(Context, LoopID,
+ {Twine(Prefix(), "vectorize.").str(),
+ Twine(Prefix(), "interleave.").str()},
+ {IsVectorizedMD});
+ TheLoop->setLoopID(NewLoopID);
+
+ // Update internal cache.
+ IsVectorized.Value = 1;
+}
+
+bool LoopVectorizeHints::allowVectorization(
+ Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass, and will let
+ // errors differentiate between disabled vectorization and a width of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either an MDString or an MDNode whose first
+ // operand is an MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
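+//
+// For example (illustrative), in the nest below the inner latch condition
+// compares the update of the inner canonical IV against M, which is invariant
+// in the outer loop, so the inner loop is uniform with regard to the outer
+// loop:
+//
+//   for (int i = 0; i < N; ++i)   // OuterLp
+//     for (int j = 0; j < M; ++j) // Lp: canonical IV j (starts at 0, step 1)
+//       A[i][j] = 0;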
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
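+
+// For illustration, a source-level sketch (hypothetical code) of a loop nest
+// that isUniformLoop() accepts: the inner latch compares the canonical IV 'j'
+// against 'M', which is invariant in the outer loop, so every vector lane of
+// the outer loop runs the same M inner iterations.
+//
+//   for (int i = 0; i < N; ++i)   // OuterLp: the loop being vectorized
+//     for (int j = 0; j < M; ++j) // Lp: uniform with regard to OuterLp
+//       A[i * M + j] += 1;
+//
+// A latch condition such as 'j < B[i]' would instead make the inner trip count
+// vary across outer-loop lanes and fail condition 3 above.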
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
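+
+// For illustration (a hypothetical sketch): an incoming phi value such as the
+// constant expression 'sdiv (i32 1, i32 ptrtoint (i32* @g to i32))' may trap
+// at run time, so the check above rejects it; if-conversion would otherwise
+// evaluate both incoming values unconditionally.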
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by widening the type.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+  // Reductions, inductions and non-header phis are allowed to have exit
+  // users. All other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
+ bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
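+
+// For illustration (hypothetical accesses): in a loop over 'i', a pointer such
+// as '&A[i]' has stride 1 and '&A[N - i]' has stride -1, so both are treated
+// as consecutive; '&A[2 * i]' has stride 2 and falls through to the return
+// value 0 (not consecutive).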
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ reportVectorizationFailure("Unsupported basic block terminator",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ // FIXME: We skip these checks when VPlan predication is enabled as we
+ // want to allow divergent branches. This whole check will be removed
+ // once VPlan predication is on by default.
+ if (!EnableVPlanPredication && Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ reportVectorizationFailure("Unsupported conditional branch",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer loops scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ reportVectorizationFailure("Outer loop contains divergent loops",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether we are able to set up outer loop induction.
+ if (!setupOuterLoopInductions()) {
+ reportVectorizationFailure("Unsupported outer loop Phi(s)",
+ "Unsupported outer loop Phi(s)",
+ "UnsupportedPhi", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+  // here for ignoring, but it suffices to record only the first (as it is the
+  // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop (see PR33706 for more
+ // details).
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::setupOuterLoopInductions() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Returns true if a given Phi is a supported induction.
+ auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
+ ID.getKind() == InductionDescriptor::IK_IntInduction) {
+ addInductionPhi(&Phi, ID, AllowedExit);
+ return true;
+ } else {
+ // Bail out for any Phi in the outer loop header that is not a supported
+ // induction.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Found unsupported PHI for outer loop vectorization.\n");
+ return false;
+ }
+ };
+
+  return llvm::all_of(Header->phis(), isSupportedPhi);
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ reportVectorizationFailure("Found a non-int non-pointer PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Non-header phi nodes that have outside uses can be vectorized. Add
+ // them to the list of allowed exits.
+ // Unsafe cyclic dependencies with header phis are identified during
+ // legalization for reduction, induction and first order
+ // recurrences.
+ AllowedExit.insert(&I);
+ continue;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ reportVectorizationFailure("Found an invalid PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop, Phi);
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+        // TODO: Instead of recording the AllowedExit, it would be good to
+        // record the complementary set: NotAllowedExit. These include (but
+        // may not be limited to):
+ // 1. Reduction phis as they represent the one-before-last value, which
+ // is not available when vectorized
+ // 2. Induction phis and increment when SCEV predicates cannot be used
+ // outside the loop - see addInductionPhi
+ // 3. Non-Phis with outside uses when SCEV predicates cannot be used
+ // outside the loop - see call to hasOutsideLoopUser in the non-phi
+ // handling below
+ // 4. FirstOrderRecurrence phis that can possibly be handled by
+ // extraction.
+ // By recording these, we can then reason about ways to vectorize each
+ // of these NotAllowedExit.
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+        // As a last resort, coerce the PHI to an AddRec expression
+        // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ reportVectorizationFailure("Found an unidentified PHI",
+ "value that could not be identified as "
+ "reduction is used outside the loop",
+ "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+        // If the call is a recognized math library call, it is likely that
+ // we can vectorize it given loosened floating-point constraints.
+ LibFunc Func;
+ bool IsMathLibCall =
+ TLI && CI->getCalledFunction() &&
+ CI->getType()->isFloatingPointTy() &&
+ TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+ TLI->hasOptimizedCodeGen(Func);
+
+ if (IsMathLibCall) {
+ // TODO: Ideally, we should not use clang-specific language here,
+ // but it's hard to provide meaningful yet generic advice.
+ // Also, should this be guarded by allowExtraAnalysis() and/or be part
+ // of the returned info from isFunctionVectorizable()?
+ reportVectorizationFailure("Found a non-intrinsic callsite",
+ "library call cannot be vectorized. "
+ "Try compiling with -fno-math-errno, -ffast-math, "
+ "or similar flags",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ } else {
+ reportVectorizationFailure("Found a non-intrinsic callsite",
+ "call instruction cannot be vectorized",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ }
+ return false;
+ }
+
+      // Some intrinsics have scalar arguments which must remain the same
+      // (i.e., loop invariant) for the call to be vectorizable.
+ if (CI) {
+ auto *SE = PSE.getSE();
+ Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+ reportVectorizationFailure("Found unvectorizable intrinsic",
+ "intrinsic instruction cannot be vectorized",
+ "CantVectorizeIntrinsic", ORE, TheLoop, CI);
+ return false;
+ }
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ reportVectorizationFailure("Found unvectorizable type",
+ "instruction return type cannot be vectorized",
+ "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ reportVectorizationFailure("Store instruction cannot be vectorized",
+ "store instruction cannot be vectorized",
+ "CantVectorizeStore", ORE, TheLoop, ST);
+ return false;
+ }
+
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
+ Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of stored type");
+ const MaybeAlign Alignment = getLoadStoreAlignment(ST);
+ assert(Alignment && "Alignment should be set");
+ if (!TTI->isLegalNTStore(VecTy, *Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
+ Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of load type");
+ const MaybeAlign Alignment = getLoadStoreAlignment(LD);
+ assert(Alignment && "Alignment should be set");
+ if (!TTI->isLegalNTLoad(VecTy, *Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
+ return false;
+ }
+ }
+
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+      // We can safely vectorize loops where instructions within the loop are
+      // used outside the loop only if the SCEV predicate within the loop is
+      // the same as outside the loop. Allowing the exit means reusing the SCEV
+      // outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(&I);
+ continue;
+ }
+ reportVectorizationFailure("Value cannot be used outside the loop",
+ "value cannot be used outside the loop",
+ "ValueUsedOutsideLoop", ORE, TheLoop, &I);
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ if (Inductions.empty()) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "loop induction variable could not be identified",
+ "NoInductionVariable", ORE, TheLoop);
+ return false;
+ } else if (!WidestIndTy) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "integer loop induction variable could not be identified",
+ "NoIntegerInductionVariable", ORE, TheLoop);
+ return false;
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ reportVectorizationFailure("Stores to a uniform address",
+ "write to a loop invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ // Otherwise, record that the load needs (real or emulated) masking
+ // and let the cost model decide.
+ if (!IsAnnotatedParallel || PreserveGuards)
+ MaskedOp.insert(LI);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so,
+ // be aware on the race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
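+
+// For illustration (hypothetical source): in a loop body such as
+//
+//   if (Cond[i])
+//     A[i] = X;
+//
+// the store to A[i] sits in a block that needs predication, so it is recorded
+// in MaskedOp above and later lowered via one of the three masking strategies
+// listed in the comment, as chosen by the cost model.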
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ reportVectorizationFailure("If-conversion is disabled",
+ "if-conversion is disabled",
+ "IfConversionDisabled",
+ ORE, TheLoop);
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers which are known to be dereferenceable within scope of
+ // the loop body for each iteration of the loop which executes. That is,
+ // the memory pointed to can be dereferenced (with the access size implied by
+ // the value's type) unconditionally within the loop header without
+ // introducing a new fault.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB)) {
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+          SafePointers.insert(Ptr);
+ continue;
+ }
+
+    // For a block which requires predication, an address may be safe to access
+ // in the loop w/o predication if we can prove dereferenceability facts
+ // sufficient to ensure it'll never fault within the loop. For the moment,
+ // we restrict this to loads; stores are more complicated due to
+ // concurrency restrictions.
+ ScalarEvolution &SE = *PSE.getSE();
+ for (Instruction &I : *BB) {
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (LI && !mustSuppressSpeculation(*LI) &&
+ isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
+        SafePointers.insert(LI->getPointerOperand());
+ }
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ reportVectorizationFailure("Loop contains a switch statement",
+ "loop contains a switch statement",
+ "LoopContainsSwitch", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointers)) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
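+
+// For illustration (hypothetical source): if-conversion flattens control flow
+// such as
+//
+//   for (int i = 0; i < N; ++i)
+//     A[i] = Cond[i] ? X : Y;
+//
+// into a single-block loop body in which the phi merging X and Y becomes a
+// select, which can then be widened into a vector select.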
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
+ assert((UseVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ reportVectorizationFailure("Loop doesn't have a legal pre-header",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ reportVectorizationFailure("The loop must have a single backedge",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!Lp->getExitingBlock()) {
+ reportVectorizationFailure("The loop must have an exiting block",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+  // We only handle bottom-tested loops, i.e., loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
+ reportVectorizationFailure("The exiting block is not the loop latch",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
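+
+// For illustration (hypothetical CFG): a bottom-tested loop of the shape
+// accepted above keeps its exit test in the latch,
+//
+//   preheader -> header/body -> latch: br %cond, exit, header
+//
+// whereas a top-tested loop whose exiting block is the header rather than the
+// latch fails the last check.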
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ reportVectorizationFailure("Unsupported outer loop",
+ "unsupported outer loop",
+ "UnsupportedOuterLoop",
+ ORE, TheLoop);
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ reportVectorizationFailure("Too many SCEV checks needed",
+ "Too many SCEV assumptions need to be made and checked at runtime",
+ "TooManySCEVRunTimeChecks", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
+
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+
+ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+ if (!PrimaryInduction) {
+ reportVectorizationFailure(
+ "No primary induction, cannot fold tail by masking",
+ "Missing a primary induction variable in the loop, which is "
+ "needed in order to fold tail by masking as required.",
+ "NoPrimaryInduction", ORE, TheLoop);
+ return false;
+ }
+
+ SmallPtrSet<const Value *, 8> ReductionLiveOuts;
+
+ for (auto &Reduction : *getReductionVars())
+ ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+
+ // TODO: handle non-reduction outside users when tail is folded by masking.
+ for (auto *AE : AllowedExit) {
+ // Check that all users of allowed exit values are inside the loop or
+ // are the live-out of a reduction.
+ if (ReductionLiveOuts.count(AE))
+ continue;
+ for (User *U : AE->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (TheLoop->contains(UI))
+ continue;
+ reportVectorizationFailure(
+ "Cannot fold tail by masking, loop has an outside user for",
+ "Cannot fold tail by masking in the presence of live outs.",
+ "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+ return false;
+ }
+ }
+
+ // The list of pointers that we can safely read and write to remains empty.
+ SmallPtrSet<Value *, 8> SafePointers;
+
+ // Check and mark all blocks for predication, including those that ordinarily
+ // do not need predication such as the header block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) {
+ reportVectorizationFailure(
+ "Cannot fold tail by masking as required",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+ return true;
+}
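+
+// For illustration (hypothetical trip count and VF): with a trip count of 17
+// and VF = 8, folding the tail by masking runs 3 masked vector iterations with
+// lane predicates of the form (i < 17), instead of 2 unmasked vector
+// iterations plus a 1-iteration scalar epilogue; every block, including the
+// header, must then tolerate predication, which is what the loop above
+// verifies.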
+
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
new file mode 100644
index 000000000000..a5e85f27fabf
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -0,0 +1,287 @@
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a LoopVectorizationPlanner class.
+/// LoopVectorizationPlanner drives the vectorization process after having
+/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaving of access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+private:
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
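+
+// A usage sketch for VPBuilder (hypothetical; assumes an existing VPBasicBlock
+// *VPBB and two mask values A and B already in the plan):
+//
+//   VPBuilder Builder;
+//   Builder.setInsertPoint(VPBB);              // append to the end of VPBB
+//   VPValue *NotA = Builder.createNot(A);
+//   VPValue *Mask = Builder.createOr(NotA, B); // materializes (!A | B)
+//
+// Both VPInstructions are inserted in order at the current insert point.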
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
+ unsigned Width;
+ // Cost of the loop with that width
+ unsigned Cost;
+
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ static VectorizationFactor Disabled() { return {1, 0}; }
+
+ bool operator==(const VectorizationFactor &rhs) const {
+ return Width == rhs.Width && Cost == rhs.Cost;
+ }
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost, or None if
+ /// vectorization and interleaving should be avoided up front.
+ Optional<VectorizationFactor> plan(unsigned UserVF);
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(unsigned UserVF);
+
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
+ getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gathered by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr
+ buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 000000000000..8f0bf70f873c
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,7914 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has four parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+// There is a development effort going on to migrate loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// The interleaved access vectorization is based on the paper:
+// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
+// Data for SIMD
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanHCFGBuilder.h"
+#include "VPlanHCFGTransforms.h"
+#include "VPlanPredicator.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopVectorizeFollowupAll =
+ "llvm.loop.vectorize.followup_all";
+static const char *const LLVMLoopVectorizeFollowupVectorized =
+ "llvm.loop.vectorize.followup_vectorized";
+static const char *const LLVMLoopVectorizeFollowupEpilogue =
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
+static cl::opt<unsigned> TinyTripCountVectorThreshold(
+ "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
+
+// Indicates that an epilogue is undesired, predication is preferred.
+// This means that the vectorizer will try to fold the loop-tail (epilogue)
+// into the loop and predicate the loop body accordingly.
+static cl::opt<bool> PreferPredicateOverEpilog(
+ "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
+ cl::desc("Indicate that an epilogue is undesired, predication should be "
+ "used instead."));
+
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
+static cl::opt<bool> EnableInterleavedMemAccesses(
+ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps.
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+ "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
+/// We don't interleave loops with a known constant trip count below this
+/// number.
+static const unsigned TinyTripCountInterleaveThreshold = 128;
+
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when interleaving"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(true), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
+ "reduction in a nested loop."));
+
+cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// FIXME: Remove this switch once we have divergence analysis. Currently we
+// assume divergent non-backedge branches when this switch is true.
+cl::opt<bool> EnableVPlanPredication(
+ "enable-vplan-predication", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path predicator with "
+ "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
+
+cl::opt<bool> llvm::EnableLoopInterleaving(
+ "interleave-loops", cl::init(true), cl::Hidden,
+ cl::desc("Enable loop interleaving in Loop vectorization passes"));
+cl::opt<bool> llvm::EnableLoopVectorization(
+ "vectorize-loops", cl::init(true), cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
+
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type *ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
+
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns true if the given type is irregular. The
+/// type is irregular if its allocated size doesn't equal the store size of an
+/// element of the corresponding vector type at the given vectorization factor.
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
+ // Determine if an array of VF elements of type Ty is "bitcast compatible"
+ // with a <VF x Ty> vector.
+ if (VF > 1) {
+ auto *VectorTy = VectorType::get(Ty, VF);
+ return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
+ }
+
+ // If the vectorization factor is one, we just check if an array of type Ty
+ // requires padding between elements.
+ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
+}
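+
+// For illustration: i1 is irregular at VF = 1, since its allocated size (8
+// bits) differs from its type size (1 bit), so an array of i1 is not "bitcast
+// compatible" with the corresponding vector type.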
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+/// we always assume predicated blocks have a 50% chance of executing.
+static unsigned getReciprocalPredBlockProb() { return 2; }
+
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+ return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FMF);
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+/// Returns "best known" trip count for the specified loop \p L as defined by
+/// the following procedure:
+/// 1) Returns exact trip count if it is known.
+/// 2) Returns the expected trip count according to profile data, if available.
+/// 3) Returns upper bound estimate if it is known.
+/// 4) Returns None if all of the above failed.
+static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+ // Check if exact trip count is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
+ return ExpectedTC;
+
+ // Check if there is an expected trip count available from profile data.
+ if (LoopVectorizeWithBlockFrequency)
+ if (auto EstimatedTC = getLoopEstimatedTripCount(L))
+ return EstimatedTC;
+
+ // Check if upper bound estimate is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
+ return ExpectedTC;
+
+ return None;
+}
+
+namespace llvm {
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found for a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned VecWidth,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()),
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ virtual ~InnerLoopVectorizer() = default;
+
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ /// Return the pre-header block of the new loop.
+ BasicBlock *createVectorizedLoopSkeleton();
+
+ /// Widen a single instruction within the innermost loop.
+ void widenInstruction(Instruction &I);
+
+  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
+ void fixVectorizedLoop();
+
+ // Return true if any runtime check is added.
+ bool areSafetyChecksAdded() { return AddedSafetyChecks; }
+
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in the
+ /// new unrolled loop, where UF is the unroll factor.
+ using VectorParts = SmallVector<Value *, 2>;
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+
+ /// A helper function to scalarize a single Instruction in the innermost loop.
+ /// Generates a sequence of scalar instances for each lane between \p MinLane
+ /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
+  /// inclusive.
+ void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
+ bool IfPredicateInstr);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
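+
+  // Editor's sketch (not part of the original patch): if a definition %x was
+  // scalarized with VF = 4 and one of its uses needs a vector, the lanes are
+  // packed on demand with an insertelement sequence, roughly (hypothetical
+  // names):
+  //   %p0 = insertelement <4 x i32> undef, i32 %x.0, i32 0
+  //   %p1 = insertelement <4 x i32> %p0,   i32 %x.1, i32 1
+  //   %p2 = insertelement <4 x i32> %p1,   i32 %x.2, i32 2
+  //   %p3 = insertelement <4 x i32> %p2,   i32 %x.3, i32 3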
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to,
+ /// optionally masking the vector operations if \p BlockInMask is non-null.
+ void vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
+
+ /// Vectorize Load and Store instructions, optionally masking the vector
+ /// operations if \p BlockInMask is non-null.
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
+
+ /// Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+ /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+ void fixNonInductionPHIs(void);
+
+protected:
+ friend class LoopVectorizationPlanner;
+
+ /// A small list of PHINodes.
+ using PhiVector = SmallVector<PHINode *, 4>;
+
+ /// A type for scalarized values in the new loop. Each value from the
+ /// original loop, when scalarized, is represented by UF x VF scalar values
+ /// in the new unrolled loop, where UF is the unroll factor and VF is the
+ /// vectorization factor.
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+
+ /// Set up the values of the IVs correctly when exiting the vector loop.
+ void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock);
+
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
+
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
+ /// Fix a first-order recurrence. This is the second phase of vectorizing
+ /// this phi node.
+ void fixFirstOrderRecurrence(PHINode *Phi);
+
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
+  /// The loop exit block may have single-value PHI nodes with some
+  /// incoming value. While vectorizing we only handled real values
+  /// that were defined inside the loop, and we should have one value for
+  /// each predecessor of its parent basic block. See PR14725.
+ void fixLCSSAPHIs();
+
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it.
+ void sinkScalarOperands(Instruction *PredInst);
+
+ /// Shrinks vector element sizes to the smallest bitwidth they can be legally
+ /// represented as.
+ void truncateToMinimalBitwidths();
+
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...,
+  /// since each iteration in the loop corresponds to a SIMD element.
+ virtual Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of Val. The sequence starts at StartIdx.
+ /// \p Opcode is relevant for FP induction variable.
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd);
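+
+  // Editor's sketch (not part of the original patch): for an integer IV with
+  // VF = 4 and StartIdx = 0, lane i of the result is Val[i] + i * Step, i.e.
+  // the constant vector <0, 1, 2, 3> scaled by Step is added to Val.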
+
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step, and
+ /// \p EntryVal is the value from the original loop that maps to the steps.
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ Value *Step, Instruction *EntryVal);
+
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
+ /// Returns true if we should generate a scalar version of \p IV.
+ bool needsScalarInduction(Instruction *IV) const;
+
+ /// If there is a cast involved in the induction variable \p ID, which should
+ /// be ignored in the vectorized loop body, this function records the
+ /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+ /// cast. We had already proved that the casted Phi is equal to the uncasted
+ /// Phi in the vectorized loop (under a runtime guard), and therefore
+ /// there is no need to vectorize the cast - the same value can be used in the
+ /// vector loop for both the Phi and the cast.
+  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+ ///
+ /// \p EntryVal is the value from the original loop that maps to the vector
+ /// phi node and is used to distinguish what is the IV currently being
+ /// processed - original one (if \p EntryVal is a phi corresponding to the
+ /// original IV) or the "newly-created" one based on the proof mentioned above
+  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
+ /// latter case \p EntryVal is a TruncInst and we must not record anything for
+ /// that IV, but it's error-prone to expect callers of this routine to care
+ /// about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ virtual Value *reverseVector(Value *Vec);
+
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Returns a bitcasted value to the requested vector type.
+ /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+ Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL);
+
+ /// Emit a bypass check to see if the vector trip count is zero, including if
+ /// it overflows.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Compute the transformed value of Index at offset StartValue using step
+ /// StepValue.
+ /// For integer induction, returns StartValue + Index * StepValue.
+ /// For pointer induction, returns StartValue[Index * StepValue].
+ /// FIXME: The newly created binary instructions should contain nsw/nuw
+ /// flags, which can be found from the original scalar operations.
+ Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
+ const DataLayout &DL,
+ const InductionDescriptor &ID) const;
+
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
+ /// The original loop.
+ Loop *OrigLoop;
+
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info.
+ LoopInfo *LI;
+
+ /// Dominator Tree.
+ DominatorTree *DT;
+
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// Assumption Cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+  /// memchecks. The actual versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
+ unsigned VF;
+
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+  /// The builder that we use.
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+
+ /// The ExitBlock of the scalar loop.
+ BasicBlock *LoopExitBlock;
+
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+
+ /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction = nullptr;
+
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction = nullptr;
+
+ /// Maps values from the original loop to their corresponding values in the
+ /// vectorized loop. A key value can map to either vector values, scalar
+ /// values or both kinds of values, depending on whether the key was
+ /// vectorized and scalarized.
+ VectorizerValueMap VectorLoopValueMap;
+
+ /// Store instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
+
+ /// Trip count of the original loop.
+ Value *TripCount = nullptr;
+
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount = nullptr;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel *Cost;
+
+ // Record whether runtime checks are added.
+ bool AddedSafetyChecks = false;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix-up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
+
+ // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+ // fixed up at the end of vector code generation.
+ SmallVector<PHINode *, 8> OrigPHIsToFix;
+};
+
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
+ LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
+ UnrollFactor, LVL, CM) {}
+
+private:
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd) override;
+ Value *reverseVector(Value *Vec) override;
+};
+
+} // end namespace llvm
+
+/// Look for a meaningful debug location on the instruction or its
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+ if (!I)
+ return I;
+
+ DebugLoc Empty;
+ if (I->getDebugLoc() != Empty)
+ return I;
+
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ if (OpInst->getDebugLoc() != Empty)
+ return OpInst;
+ }
+
+ return I;
+}
+
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst)) {
+ auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
+ if (NewDIL)
+ B.SetCurrentDebugLocation(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
+ B.SetCurrentDebugLocation(DebugLoc());
+}
+
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: Not vectorizing: " << DebugMsg;
+ if (I != nullptr)
+ dbgs() << " " << *I;
+ else
+ dbgs() << '.';
+ dbgs() << '\n';
+}
+#endif
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
+ StringRef RemarkName, Loop *TheLoop, Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+    // If there is no debug location attached to the instruction, fall back to
+    // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
+namespace llvm {
+
+void reportVectorizationFailure(const StringRef DebugMsg,
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+ LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+ ORETag, TheLoop, I) << OREMsg);
+}
+
+} // end namespace llvm
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+ std::string Result;
+ if (L) {
+ raw_string_ostream OS(Result);
+ if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+ LoopDbgLoc.print(OS);
+ else
+ // Just print the module name.
+ OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+ OS.flush();
+ }
+ return Result;
+}
+#endif
+
+void InnerLoopVectorizer::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
+
+void InnerLoopVectorizer::addMetadata(Instruction *To,
+ Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
+}
+
+void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
+ Instruction *From) {
+ for (Value *V : To) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ addMetadata(I, From);
+ }
+}
+
+namespace llvm {
+
+// Hints for the loop vectorization cost model about how the scalar epilogue
+// loop should be lowered.
+enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
+ CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ CM_ScalarEpilogueNotAllowedOptSize,
+
+  // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ CM_ScalarEpilogueNotNeededUsePredicate
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen for a
+/// number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
+
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization and interleaving should be avoided up front.
+ Optional<unsigned> computeMaxVF();
+
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
+ /// \return The most profitable vectorization factor and the cost of that VF.
+  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
+ VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+ /// Setup cost-based decisions for user vectorization factor.
+ void selectUserVectorizationFactor(unsigned UserVF) {
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+ /// \return The desired interleave count.
+ /// If interleave count has been specified by metadata it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
+ unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
+
+  /// A memory access instruction may be vectorized in more than one way.
+  /// The form of the instruction after vectorization depends on its cost.
+  /// This function takes cost-based decisions for Load/Store instructions
+  /// and collects them in a map. This decision map is used for building
+  /// the lists of loop-uniform and loop-scalar instructions.
+  /// The calculated cost is saved with the widening decision in order to
+  /// avoid redundant calculations.
+ void setCostBasedWideningDecision(unsigned VF);
+
+ /// A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
+ };
+
+ /// \return Returns information about the register usages of the loop for the
+ /// given vectorization factors.
+ SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
+ bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.find(I) != Scalars->second.end();
+ }
+
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto UniformsPerVF = Uniforms.find(VF);
+ assert(UniformsPerVF != Uniforms.end() &&
+ "VF not yet analyzed for uniformity");
+ return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto ScalarsPerVF = Scalars.find(VF);
+ assert(ScalarsPerVF != Scalars.end() &&
+ "Scalar values are not calculated for VF");
+ return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
+ bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
+ return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
+ !isProfitableToScalarize(I, VF) &&
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Decision that was taken during cost calculation for memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+ unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
+ InstWidening W, unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+    /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return CM_GatherScatter;
+
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ unsigned getWideningCost(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
+ bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionPhi(Op);
+ }
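+
+  // Editor's sketch (not part of the original patch): a typical case is a
+  // 64-bit induction variable whose value is only used through a narrowing
+  // cast such as
+  //   %idx = trunc i64 %iv to i32
+  // Rather than widening %iv and the truncate, the vectorizer can introduce a
+  // new i32 induction variable of the destination type and drop the truncate.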
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI.isLegalMaskedScatter(DataType);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI.isLegalMaskedGather(DataType);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ /// If a non-zero VF has been calculated, we check if I will be scalarized
+  /// with predication for that VF.
+ bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
+
+ // Returns true if \p I is an instruction that will be predicated either
+ // through scalar predication or masked load/store or masked gather/scatter.
+ // Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ // Loads and stores that need some form of masked operation are predicated
+ // instructions.
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ return Legal->isMaskRequired(I);
+ return isScalarWithPredication(I);
+ }
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
+ bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup<Instruction> *
+ getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps, and there is nothing preventing us from
+ /// creating a scalar epilogue.
+ bool requiresScalarEpilogue() const {
+ return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
+ }
+
+  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited by
+  /// optsize or a loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
+
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const { return FoldTailByMasking; }
+
+ bool blockNeedsPredication(BasicBlock *BB) {
+ return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+ }
+
+ /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+ /// with factor VF. Return the cost of the instruction, including
+ /// scalarization overhead if it's needed.
+ unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+ /// Estimate cost of a call instruction CI if it were vectorized with factor
+ /// VF. Return the cost of the instruction, including scalarization overhead
+ /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+ /// scalarized -
+ /// i.e. either vector version isn't available, or is too expensive.
+ unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
+private:
+ unsigned NumPredStores = 0;
+
+ /// \return An upper bound for the vectorization factor, larger than zero.
+ /// One is returned if vectorization should best be avoided due to cost.
+ unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+
+  /// The vectorization cost is a combination of the cost itself and a boolean
+  /// indicating whether any of the contributing operations will actually
+  /// operate on vector values after type legalization in the backend. If this
+  /// latter value is false, then all operations will be scalarized (i.e. no
+  /// vectorization has actually taken place).
+ using VectorizationCostTy = std::pair<unsigned, bool>;
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ VectorizationCostTy expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter.
+ unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+
+ /// Calculate vectorization cost of memory instruction \p I.
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for scalarized memory instruction.
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for interleaving group of memory instructions.
+ unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for Gather/Scatter instruction.
+ unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+ /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+ /// Load: scalar load + broadcast.
+ /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+ /// element)
+ unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
+ /// Estimate the overhead of scalarizing an instruction. This is a
+ /// convenience wrapper for the type-based getScalarizationOverhead API.
+ unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
+  /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction *, uint64_t> MinBWs;
+
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
+ using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
+
+  /// A set containing all BasicBlocks that are known to be present after
+  /// vectorization as predicated blocks.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or is not a multiple of the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+
+ /// All blocks of loop are to be masked to fold tail of scalar iterations.
+ bool FoldTailByMasking = false;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+ DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ unsigned VF);
+
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(unsigned VF);
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars(unsigned VF);
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
+ using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>;
+
+ DecisionList WideningDecisions;
+
+ /// Returns true if \p V is expected to be vectorized and it needs to be
+ /// extracted.
+ bool needsExtract(Value *V, unsigned VF) const {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+ return false;
+
+ // Assume we can vectorize V (and hence we need extraction) if the
+ // scalars are not computed yet. This can happen, because it is called
+ // via getScalarizationOverhead from setCostBasedWideningDecision, before
+ // the scalars are collected. That should be a safe assumption in most
+ // cases, because we check if the operands have vectorizable types
+ // beforehand in LoopVectorizationLegality.
+ return Scalars.find(VF) == Scalars.end() ||
+ !isScalarAfterVectorization(I, VF);
+  }
+
+ /// Returns a range containing only operands needing to be extracted.
+ SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+ unsigned VF) {
+ return SmallVector<Value *, 4>(make_filter_range(
+ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+ }
+
+public:
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+
+ /// Assumption cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ const Function *TheFunction;
+
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+};
+
+} // end namespace llvm
+
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp,
+ true /*VectorizeOnlyWhenForced*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
+ }
+
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ return true;
+}
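+
+// Editor's sketch (not part of the original patch): an outer loop accepted by
+// this predicate would look roughly like
+//   #pragma omp simd simdlen(8)
+//   for (int i = 0; i < N; ++i)      // explicitly annotated outer loop
+//     for (int j = 0; j < M; ++j)
+//       A[i][j] += B[i][j];
+// i.e. the hint carries an explicit vector length and no interleave count.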
+
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
+ if (L.empty() || VPlanBuildStressTest ||
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
+ V.push_back(&L);
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
+ }
+ for (Loop *InnerL : L)
+ collectSupportedLoops(*InnerL, LI, ORE, V);
+}
+
+namespace {
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public FunctionPass {
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ LoopVectorizePass Impl;
+
+ explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
+ bool VectorizeOnlyWhenForced = false)
+ : FunctionPass(ID) {
+ Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
+ Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
+ GetLAA, *ORE, PSI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
+//===----------------------------------------------------------------------===//
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Else, broadcast will be inside
+ // vector loop body.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+}
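+
+// Editor's sketch (not part of the original patch): CreateVectorSplat expands
+// to the usual insertelement + shufflevector splat pattern, roughly
+//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
+//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
+//                                    <4 x i32> undef, <4 x i32> zeroinitializer
+// placed in the vector preheader when the value is loop invariant and hoisting
+// is known to be safe, and in the vector body otherwise.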
+
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+ Value *Start = II.getStartValue();
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ // Move the last step to the end of the latch block. This ensures consistent
+ // placement of all induction updates.
+ auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
+ auto *ICmp = cast<Instruction>(Br->getCondition());
+ LastInduction->moveBefore(ICmp);
+ LastInduction->setName("vec.ind.next");
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorLatch);
+}
+
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Cost->isScalarAfterVectorization(I, VF) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+ if (shouldScalarizeInstruction(IV))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
+ };
+ return llvm::any_of(IV->users(), isScalarInst);
+}
+
+void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV, based on the proof that the casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
+ // reuses the same InductionDescriptor that the original IV uses, but we
+ // don't have to do any recording in this case - that is done when the
+ // original IV is processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (Casts.empty())
+ return;
+ // Only the first Cast instruction in the Casts vector is of interest.
+ // The rest of the Casts (if they exist) have no uses outside the
+ // induction update chain itself.
+ Instruction *CastInst = *Casts.begin();
+ if (Lane < UINT_MAX)
+ VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ else
+ VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+}
+
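+// Widen an integer or floating-point induction variable. For an integer IV
+// starting at %i0 with step 1 and VF = 4, the widened IV starts as
+// <%i0, %i0+1, %i0+2, %i0+3>, and each unroll part (and the backedge value)
+// is formed by repeatedly adding the splat <4, 4, 4, 4> (illustrative only).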
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
+
+ auto II = Legal->getInductionVars()->find(IV);
+ assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
+
+ auto ID = II->second;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+
+ // The scalar value to broadcast. This will be derived from the canonical
+ // induction variable.
+ Value *ScalarIV = nullptr;
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // True if we have vectorized the induction variable.
+ auto VectorizedIV = false;
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant.
+ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+ "Induction step should be loop invariant");
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ Value *Step = nullptr;
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ LoopVectorPreHeader->getTerminator());
+ } else {
+ Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+ }
+
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ VectorizedIV = true;
+ }
+
+ // If we haven't yet vectorized the induction variable, or if we will create
+ // a scalar one, we need to define the scalar induction variable and step
+ // values. If we were given a truncation type, truncate the canonical
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
+ if (!VectorizedIV || NeedsScalarIV) {
+ ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
+ ScalarIV->setName("offset.idx");
+ }
+ if (Trunc) {
+ auto *TruncType = cast<IntegerType>(Trunc->getType());
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
+ }
+ }
+
+ // If we haven't yet vectorized the induction variable, splat the scalar
+ // induction variable, and build the necessary step vectors.
+ // TODO: Don't do it unless the vectorized IV is really required.
+ if (!VectorizedIV) {
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart =
+ getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
+ }
+ }
+
+ // If an induction variable is only used for counting loop iterations or
+ // calculating addresses, it doesn't need to be widened. Create scalar steps
+ // that can be used by instructions we will later scalarize. Note that the
+ // addition of the scalar steps will not increase the number of instructions
+ // in the loop in the common case prior to InstCombine. We will be trading
+ // one vector extract for each scalar step.
+ if (NeedsScalarIV)
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+}
+
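+// Produce a vector whose lane i holds Val[i] + (StartIdx + i) * Step. For an
+// integer induction with Step = 1, StartIdx = 0 and VF = 4 this is roughly
+// (illustrative only):
+//   %offsets   = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %step.splat
+//   %induction = add <4 x i32> %val, %offsets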
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // Create and check the types.
+ assert(Val->getType()->isVectorTy() && "Must be a vector");
+ int VLen = Val->getType()->getVectorNumElements();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ if (STy->isIntegerTy()) {
+ // Create a vector of consecutive numbers from StartIdx to StartIdx + VLen - 1.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ // Create a vector of consecutive numbers from StartIdx to StartIdx + VLen - 1.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulOp = Builder.CreateFMul(Cv, Step);
+ if (isa<Instruction>(MulOp))
+ // We have to check because MulOp may be a constant.
+ cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+ Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+ return BOp;
+}
+
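+// Compute the per-lane scalar values of the induction: for each unroll part
+// and lane, ScalarIV + (VF * Part + Lane) * Step. E.g. with VF = 4 and UF = 2
+// this produces scalars at offsets 0..3 (part 0) and 4..7 (part 1) times Step
+// from ScalarIV; uniform values only need lane 0 (illustrative only).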
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+ Instruction *EntryVal,
+ const InductionDescriptor &ID) {
+ // We shouldn't have to build scalar steps if we aren't vectorizing.
+ assert(VF > 1 && "VF should be greater than one");
+
+ // Get the value type and ensure it and the step have the same integer type.
+ Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform, we only need to generate the first
+ // lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
+ : VF;
+ // Compute the scalar steps and save the results in VectorLoopValueMap.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
+ }
+ }
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ // If we have a stride that is replaced by one, do it here. Defer this for
+ // the VPlan-native path until we start running Legal checks in that path.
+ if (!EnableVPlanNativePath && Legal->hasStride(V))
+ V = ConstantInt::get(V->getType(), 1);
+
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
+
+ // If the value has not been vectorized, check if it has been scalarized
+ // instead. If it has been scalarized, and we actually need the value in
+ // vector form, we will construct the vector values on demand.
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
+
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
+ // If we aren't vectorizing, we can just copy the scalar map values over to
+ // the vector map.
+ if (VF == 1) {
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
+ }
+
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
+
+ // Set the insert point after the last scalarized instruction. This ensures
+ // the insertelement sequence will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP = std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
+ } else {
+ // Initialize packing with insertelements to start from undef.
+ Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Undef);
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
+ }
+ Builder.restoreIP(OldIP);
+ return VectorValue;
+ }
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
+}
+
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
+ // If the value is not an instruction contained in the loop, it should
+ // already be scalar.
+ if (OrigLoop->isLoopInvariant(V))
+ return V;
+
+ assert(Instance.Lane > 0
+ ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
+
+ // If the value from the original loop has not been vectorized, it is
+ // represented by UF x VF scalar values in the new loop. Return the requested
+ // scalar value.
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
+
+ // If the value has not been scalarized, get its entry in VectorLoopValueMap
+ // for the given unroll part. If this entry is not a vector type (i.e., the
+ // vectorization factor is one), there is no need to generate an
+ // extractelement instruction.
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
+ if (!U->getType()->isVectorTy()) {
+ assert(VF == 1 && "Value not scalarized has non-vector type");
+ return U;
+ }
+
+ // Otherwise, the value from the original loop has been vectorized and is
+ // represented by UF vector values. Extract and return the requested scalar
+ // value from the appropriate vector lane.
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
+}
+
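+// Reverse the lanes of a vector; e.g. for VF = 4 the shuffle mask is
+// <3, 2, 1, 0> (illustrative).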
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
+ SmallVector<Constant *, 8> ShuffleMask;
+ for (unsigned i = 0; i < VF; ++i)
+ ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+ return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "reverse");
+}
+
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// R = Pic[i]; // Member of index 0
+// G = Pic[i+1]; // Member of index 1
+// B = Pic[i+2]; // Member of index 2
+// ... // do something to R, G, B
+// }
+// To:
+// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
+// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
+// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
+// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
+//
+// Or translate following interleaved store group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// ... do something to R, G, B
+// Pic[i] = R; // Member of index 0
+// Pic[i+1] = G; // Member of index 1
+// Pic[i+2] = B; // Member of index 2
+// }
+// To:
+// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
+// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
+// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask) {
+ const InterleaveGroup<Instruction> *Group =
+ Cost->getInterleavedAccessGroup(Instr);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Skip if current instruction is not the insert position.
+ if (Instr != Group->getInsertPos())
+ return;
+
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+ Value *Ptr = getLoadStorePointerOperand(Instr);
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getMemInstValueType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+ Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
+
+ // Prepare for the new pointers.
+ setDebugLocFromInst(Builder, Ptr);
+ SmallVector<Value *, 2> NewPtrs;
+ unsigned Index = Group->getIndex(Instr);
+
+ VectorParts Mask;
+ bool IsMaskForCondRequired = BlockInMask;
+ if (IsMaskForCondRequired) {
+ Mask = *BlockInMask;
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert(!Group->isReverse() && "Reversed masked interleave-group "
+ "not supported.");
+ }
+
+ // If the group is reverse, adjust the index to refer to the last vector lane
+ // instead of the first. We adjust the index from the first vector lane,
+ // rather than directly getting the pointer for lane VF - 1, because the
+ // pointer operand of the interleaved access is supposed to be uniform. For
+ // uniform instructions, we're only required to generate a value for the
+ // first vector lane in each unroll iteration.
+ if (Group->isReverse())
+ Index += (VF - 1) * Group->getFactor();
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
+
+ // Note that the current instruction could be at any index in the group.
+ // We need to adjust the address to that of the member at index 0.
+ //
+ // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
+ // b = A[i]; // Member of index 0
+ // The current pointer points to A[i+1]; adjust it to A[i].
+ //
+ // E.g. A[i+1] = a; // Member of index 1
+ // A[i] = b; // Member of index 0
+ // A[i+2] = c; // Member of index 2 (Current instruction)
+ // The current pointer points to A[i+2]; adjust it to A[i].
+ NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
+ if (InBounds)
+ cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
+
+ // Cast to the vector pointer type.
+ NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+ }
+
+ setDebugLocFromInst(Builder, Instr);
+ Value *UndefVec = UndefValue::get(VecTy);
+
+ Value *MaskForGaps = nullptr;
+ if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+ MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ // For each unroll part, create a wide load for the group.
+ SmallVector<Value *, 2> NewLoads;
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Instruction *NewLoad;
+ if (IsMaskForCondRequired || MaskForGaps) {
+ assert(useMaskedInterleavedAccesses(*TTI) &&
+ "masked interleaved groups are not allowed.");
+ Value *GroupMask = MaskForGaps;
+ if (IsMaskForCondRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ GroupMask = MaskForGaps
+ ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ }
+ NewLoad =
+ Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+ GroupMask, UndefVec, "wide.masked.vec");
+ } else
+ NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+ Group->getAlignment(), "wide.vec");
+ Group->addMetadata(NewLoad);
+ NewLoads.push_back(NewLoad);
+ }
+
+ // For each member in the group, shuffle out the appropriate data from the
+ // wide loads.
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *StridedVec = Builder.CreateShuffleVector(
+ NewLoads[Part], UndefVec, StrideMask, "strided.vec");
+
+ // If this member has a different type, cast the result to that type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+ }
+
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
+ VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
+ }
+ }
+ return;
+ }
+
+ // The sub-vector type for the current instruction.
+ VectorType *SubVT = VectorType::get(ScalarTy, VF);
+
+ // Vectorize the interleaved store group.
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ // An interleaved store group doesn't allow gaps, so each index has a member.
+ Instruction *Member = Group->getMember(i);
+ assert(Member && "Fail to get a member from an interleaved store group");
+
+ Value *StoredVec = getOrCreateVectorValue(
+ cast<StoreInst>(Member)->getValueOperand(), Part);
+ if (Group->isReverse())
+ StoredVec = reverseVector(StoredVec);
+
+ // If this member has a different type, cast it to the unified type.
+
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Concatenate all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
+
+ // Interleave the elements in the wide vector.
+ Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
+ Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
+ "interleaved.vec");
+
+ Instruction *NewStoreInstr;
+ if (IsMaskForCondRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ NewStoreInstr = Builder.CreateMaskedStore(
+ IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+ } else
+ NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
+ Group->getAlignment());
+
+ Group->addMetadata(NewStoreInstr);
+ }
+}
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask) {
+ // Attempt to issue a wide load or store.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
+ return vectorizeInterleaveGroup(Instr);
+
+ Type *ScalarDataTy = getMemInstValueType(Instr);
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
+ // An alignment of 0 means target ABI alignment. We need to use the scalar's
+ // target ABI alignment in such a case.
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+ const Align Alignment =
+ DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
+ unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
+
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+ bool ConsecutiveStride =
+ Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
+ bool CreateGatherScatter =
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
+
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise Decision should have been to Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
+
+ // Handle consecutive loads/stores.
+ if (ConsecutiveStride)
+ Ptr = getOrCreateScalarValue(Ptr, {0, 0});
+
+ VectorParts Mask;
+ bool isMaskRequired = BlockInMask;
+ if (isMaskRequired)
+ Mask = *BlockInMask;
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
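+ // E.g. with VF = 4 and Part = 1, the pointer is first moved back by
+ // Part * VF = 4 elements and then by another VF - 1 = 3, so the wide
+ // access covers A[i-7 .. i-4] and is reversed afterwards (illustrative).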
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
+ } else {
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
+ // Handle Stores:
+ if (SI) {
+ setDebugLocFromInst(Builder, SI);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
+ Alignment.value(), MaskPart);
+ } else {
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+ }
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
+ if (isMaskRequired)
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
+ Alignment.value(), Mask[Part]);
+ else
+ NewSI =
+ Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
+ }
+ addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
+ nullptr, "wide.masked.gather");
+ addMetadata(NewLI, LI);
+ } else {
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
+ if (isMaskRequired)
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
+ UndefValue::get(DataTy),
+ "wide.masked.load");
+ else
+ NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
+ "wide.load");
+
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
+ }
+ VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
+ }
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ const VPIteration &Instance,
+ bool IfPredicateInstr) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
+ setDebugLocFromInst(Builder, Instr);
+
+ // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // Add the cloned scalar to the scalar map entry.
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
+
+ // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
+}
+
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ // As we're just creating this loop, it's possible no latch exists
+ // yet. If so, use the header as this will be a single block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
+ setDebugLocFromInst(Builder, OldInst);
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(Builder, OldInst);
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
+}
+
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
+
+ assert(L && "Create Trip Count for null loop.");
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ "Invalid loop count");
+
+ Type *IdxTy = Legal->getWidestInductionType();
+ assert(IdxTy && "No type for induction");
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way we can get a backedge-taken count in that situation
+ // is if the induction variable was signed, and as such it will not overflow.
+ // In such a case truncation is legal.
+ if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
+}
+
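+// Compute the trip count of the vector loop. E.g. with N = 21, VF = 4 and
+// UF = 2 (Step = 8): N % Step = 5, so the vector loop covers 16 iterations
+// and the scalar remainder loop the other 5. With tail folding, N is first
+// rounded up to 24 so the vector loop covers everything (illustrative
+// arithmetic only).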
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ Type *Ty = TC->getType();
+ Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ if (Cost->foldTailByMasking()) {
+ assert(isPowerOf2_32(VF * UF) &&
+ "VF*UF must be a power of 2 when folding tail by masking");
+ TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+ // If there is a non-reversed interleaved group that may speculatively access
+ // memory out-of-bounds, we need to ensure that there will be at least one
+ // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
+ // check ensures that N >= Step.
+ if (VF > 1 && Cost->requiresScalarEpilogue()) {
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
+
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL) {
+ // Verify that V is a vector type with same number of elements as DstVTy.
+ unsigned VF = DstVTy->getNumElements();
+ VectorType *SrcVecTy = cast<VectorType>(V->getType());
+ assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ Type *SrcElemTy = SrcVecTy->getElementType();
+ Type *DstElemTy = DstVTy->getElementType();
+ assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+ "Vector elements must have same size");
+
+ // Do a direct cast if element types are castable.
+ if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+ return Builder.CreateBitOrPointerCast(V, DstVTy);
+ }
+ // V cannot be directly casted to desired vector type.
+ // May happen when V is a floating point vector but DstVTy is a vector of
+ // pointers or vice-versa. Handle this using a two-step bitcast using an
+ // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+ "Only one type should be a pointer type");
+ assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+ "Only one type should be a floating point type");
+ Type *IntTy =
+ IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+ VectorType *VecIntTy = VectorType::get(IntTy, VF);
+ Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+ return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
+
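+// Emit a check that branches to the scalar loop when the trip count is too
+// small for even one vector iteration. E.g. with VF = 4 and UF = 2 the scalar
+// loop is taken when the trip count is below 8 (at or below 8 when a scalar
+// epilogue is required); with tail folding the condition is a constant false
+// and the vector loop handles all iterations (illustrative).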
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
+
+ // If tail is to be folded, vector loop takes care of all iterations.
+ Value *CheckMinIters = Builder.getFalse();
+ if (!Cost->foldTailByMasking())
+ CheckMinIters = Builder.CreateICmp(
+ P, Count, ConstantInt::get(Count->getType(), VF * UF),
+ "min.iters.check");
+
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, CheckMinIters));
+ LoopBypassBlocks.push_back(BB);
+}
+
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+ // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck =
+ Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
+ assert(!BB->getParent()->hasOptSize() &&
+ "Cannot SCEV check stride or overflow when optimizing for size");
+
+ // Create a new block containing the stride check.
+ BB->setName("vector.scevcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, SCEVCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+ // VPlan-native path does not do any analysis for runtime checks currently.
+ if (EnableVPlanNativePath)
+ return;
+
+ BasicBlock *BB = L->getLoopPreheader();
+
+ // Generate the code that checks at runtime whether arrays overlap. We put
+ // the checks into a separate block to make the more common case of few
+ // elements faster.
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+ if (!MemRuntimeCheck)
+ return;
+
+ if (BB->getParent()->hasOptSize()) {
+ assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ L->getStartLoc(), L->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+
+ // Create a new block containing the memory check.
+ BB->setName("vector.memcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
+ LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+ PSE.getSE());
+ LVer->prepareNoAliasMetadata();
+}
+
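+// Compute the value the original induction variable would have at iteration
+// `Index`: StartValue + Index * Step for integer inductions, a GEP of
+// StartValue by Index * Step for pointer inductions, and
+// StartValue fadd/fsub Index * Step for floating-point inductions (a
+// descriptive summary of the cases handled below).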
+Value *InnerLoopVectorizer::emitTransformedIndex(
+ IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
+ const InductionDescriptor &ID) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto Step = ID.getStep();
+ auto StartValue = ID.getStartValue();
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+ // more optimal code. Unfortunately, attempting to do so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is use the builder and rely
+ // on InstCombine for future simplifications. Here we handle only some
+ // trivial cases.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ return B.CreateMul(X, Y);
+ };
+
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(
+ Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(
+ StartValue->getType()->getPointerElementType(), StartValue,
+ CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
+ &*B.GetInsertPoint())));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+ // We have to check because MulExp may be a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- loop iteration number check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+ assert(VectorPH && "Invalid loop structure");
+ assert(ExitBlock && "Must have an exit block");
+
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is C++ iterators, which often have multiple pointer
+ // induction variables. In the code below we also support the case where we
+ // don't have a single induction variable.
+ //
+ // We try as hard as possible to obtain an induction variable from the
+ // original loop. However, if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+
+ // Split the single block loop into the two loop structure described above.
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+ // Create and register the new vector loop.
+ Loop *Lp = LI->AllocateLoop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
+ ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
+ } else {
+ LI->addTopLevelLoop(Lp);
+ }
+ Lp->addBasicBlockToLoop(VecBody, *LI);
+
+ // Find the loop boundaries.
+ Value *Count = getOrCreateTripCount(Lp);
+
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop. This check also covers the case where the
+ // backedge-taken count is uint##_max: adding one to it will overflow leading
+ // to an incorrect trip count of zero. In this (rare) case we will also jump
+ // to the scalar loop.
+ emitMinimumIterationCountCheck(Lp, ScalarPH);
+
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, ScalarPH);
+
+ // Generate the code that checks at runtime whether arrays overlap. We put
+ // the checks into a separate block to make the more common case of few
+ // elements faster.
+ emitMemRuntimeChecks(Lp, ScalarPH);
+
+ // Generate the induction variable.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+
+ // This variable saves the new starting index for the scalar loop. It is used
+ // to test if there are any tail iterations left once the vector loop has
+ // completed.
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (auto &InductionEntry : *List) {
+ PHINode *OrigPhi = InductionEntry.first;
+ InductionDescriptor II = InductionEntry.second;
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(
+ OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ Value *&EndValue = IVEndValues[OrigPhi];
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
+ EndValue = CountRoundDown;
+ } else {
+ IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
+ Type *StepType = II.getStep()->getType();
+ Instruction::CastOps CastOp =
+ CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+ Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValue->setName("ind.end");
+ }
+
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, MiddleBlock);
+
+ // Fix the scalar body counter (PHI node).
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : LoopBypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
+ OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
+ }
+
+ // We need the OrigLoop (scalar loop part) latch terminator to help
+ // produce correct debug info for the middle block BB instructions.
+ // The legality check stage guarantees that the loop will have a single
+ // latch.
+ assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
+ "Scalar loop latch terminator isn't a branch");
+ BranchInst *ScalarLatchBr =
+ cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
+
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ // If tail is to be folded, we know we don't need to run the remainder.
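+ // E.g. with N = 16 and VF * UF = 8, CountRoundDown == N and the remainder
+ // is skipped; with N = 21, CountRoundDown = 16 != 21 and the scalar loop
+ // runs the remaining 5 iterations (illustrative, assuming no scalar
+ // epilogue is required).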
+ Value *CmpN = Builder.getTrue();
+ if (!Cost->foldTailByMasking()) {
+ CmpN =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+
+ // Here we use the same DebugLoc as the scalar loop latch branch instead
+ // of the corresponding compare because they may have ended up with
+ // different line numbers and we want to avoid awkward line stepping while
+ // debugging, e.g. if the compare has a line number inside the loop.
+ cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
+ }
+
+ BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+ BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
+ ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
+
+ // Get ready to start creating new instructions into the vectorized body.
+ Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
+
+ // Save the state.
+ LoopVectorPreHeader = Lp->getLoopPreheader();
+ LoopScalarPreHeader = ScalarPH;
+ LoopMiddleBlock = MiddleBlock;
+ LoopExitBlock = ExitBlock;
+ LoopVectorBody = VecBody;
+ LoopScalarBody = OldBasicBlock;
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+ if (VectorizedLoopID.hasValue()) {
+ Lp->setLoopID(VectorizedLoopID.getValue());
+
+ // Do not setAlreadyVectorized if loop attributes have been defined
+ // explicitly.
+ return LoopVectorPreHeader;
+ }
+
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ Lp->setLoopID(LID);
+
+ LoopVectorizeHints Hints(Lp, true, *ORE);
+ Hints.setAlreadyVectorized();
+
+ return LoopVectorPreHeader;
+}
+
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
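+// For example, for a canonical IV starting at 0 with unit step and vector
+// trip count VTC, an external user of the value fed into the phi from the
+// latch (the post-increment) sees VTC, while an external user of the phi
+// itself sees VTC - 1 (illustrative).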
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they, obviously, have different values.
+
+ assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+
+ DenseMap<Value *, Value *> MissingVals;
+
+ // An external user of the last iteration's value should see the value that
+ // the remainder loop uses to initialize its own IV.
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ for (User *U : PostInc->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ MissingVals[UI] = EndValue;
+ }
+ }
+
+ // An external user of the penultimate value needs to see EndValue - Step.
+ // The simplest way to get this is to recompute it from the constituent SCEVs,
+ // that is Start + (Step * (CRD - 1)).
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ const DataLayout &DL =
+ OrigLoop->getHeader()->getModule()->getDataLayout();
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ IRBuilder<> B(MiddleBlock->getTerminator());
+ Value *CountMinusOne = B.CreateSub(
+ CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ Value *CMO =
+ !II.getStep()->getType()->isIntegerTy()
+ ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
+ II.getStep()->getType())
+ : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
+ CMO->setName("cast.cmo");
+ Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
+ Escape->setName("ind.escape");
+ MissingVals[UI] = Escape;
+ }
+ }
+
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+ // One corner case we have to handle is two IVs "chasing" each other,
+ // that is, %IV2 = phi [...], [ %IV1, %latch ].
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the middle block.
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ PHI->addIncoming(I.second, MiddleBlock);
+ }
+}
+
+namespace {
+
+struct CSEDenseMapInfo {
+ static bool canHandle(const Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(const Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+
+} // end anonymous namespace
+
+/// Perform CSE of induction variable instructions.
+static void cse(BasicBlock *BB) {
+ // Perform simple CSE.
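+ // For example (illustrative), two identical extracts such as
+ //   %a = extractelement <4 x i32> %v, i32 0
+ //   %b = extractelement <4 x i32> %v, i32 0
+ // are folded by replacing all uses of %b with %a and erasing %b.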
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
+
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+ unsigned VF,
+ bool &NeedToScalarize) {
+ Function *F = CI->getCalledFunction();
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
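+ // For example (hypothetical numbers): with VF = 4, a scalar call cost of 10
+ // and a scalarization overhead of 6, the scalarized estimate computed below
+ // is 10 * 4 + 6 = 46; it is returned unless a cheaper vector variant exists.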
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ if (VF == 1)
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
+
+ unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
+ if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
+ return VectorCallCost;
+ }
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+ unsigned VF) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+}
+
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
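+ //
+ // For example (illustrative): if MinBWs records that an i32 add only needs
+ // 8 bits, then
+ //   %a = add <4 x i32> %x, %y
+ // becomes
+ //   %x.tr = trunc <4 x i32> %x to <4 x i8>
+ //   %y.tr = trunc <4 x i32> %y to <4 x i8>
+ //   %a.tr = add <4 x i8> %x.tr, %y.tr
+ //   %a    = zext <4 x i8> %a.tr to <4 x i32>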
+ SmallPtrSet<Value *, 4> Erased;
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ if (Erased.find(I) != Erased.end() || I->use_empty() ||
+ !isa<Instruction>(I))
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy =
+ IntegerType::get(OriginalTy->getContext(), KV.second);
+ Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
+ OriginalTy->getVectorNumElements());
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value * {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to NewI.
+ cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
+ } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
+ NewI =
+ B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ }
+ } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ auto *O1 = B.CreateZExtOrTrunc(
+ SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
+ NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ NewI = B.CreateExtractElement(O0, EE->getOperand(2));
+ } else {
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ Erased.insert(I);
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
+ }
+ }
+
+ // We'll have created a bunch of ZExts that are now dead (unused). Clean up.
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
+ }
+ }
+ }
+}
+
+void InnerLoopVectorizer::fixVectorizedLoop() {
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
+ if (VF > 1)
+ truncateToMinimalBitwidths();
+
+ // Fix widened non-induction PHIs by setting up the PHI operands.
+ if (OrigPHIsToFix.size()) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+ fixNonInductionPHIs();
+ }
+
+ // At this point every instruction in the original loop is widened to a
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
+ // nodes are currently empty because we did not want to introduce cycles.
+ // This is the second stage of vectorizing recurrences.
+ fixCrossIterationPHIs();
+
+ // Update the dominator tree.
+ //
+ // FIXME: After creating the structure of the new loop, the dominator tree is
+ // no longer up-to-date, and it remains that way until we update it
+ // here. An out-of-date dominator tree is problematic for SCEV,
+ // because SCEVExpander uses it to guide code generation. The
+ // vectorizer uses SCEVExpander in several places. Instead, we should
+ // keep the dominator tree up-to-date as we go.
+ updateAnalysis();
+
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : *Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(&Phi))
+ fixFirstOrderRecurrence(&Phi);
+ else if (Legal->isReductionVariable(&Phi))
+ fixReduction(&Phi);
+ }
+}
+
+void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+ // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // temporary value for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+ // After the vector loop finishes executing, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop.
+
+ // Get the original loop preheader and single loop latch.
+ auto *Preheader = OrigLoop->getLoopPreheader();
+ auto *Latch = OrigLoop->getLoopLatch();
+
+ // Get the initial and previous values of the scalar recurrence.
+ auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
+ auto *Previous = Phi->getIncomingValueForBlock(Latch);
+
+ // Create a vector from the initial value.
+ auto *VectorInit = ScalarInit;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ VectorInit = Builder.CreateInsertElement(
+ UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
+ Builder.getInt32(VF - 1), "vector.recur.init");
+ }
+
+ // We constructed a temporary phi node in the first phase of vectorization.
+ // This phi node will eventually be deleted.
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
+
+ // Create a phi node for the new recurrence. The current value will either be
+ // the initial value inserted into a vector or loop-varying vector value.
+ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
+ VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
+
+ // Get the vectorized previous value of the last part UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
+
+ // Set the insertion point after the previous value if it is an instruction.
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
+ isa<PHINode>(PreviousLastPart))
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
+
+ // We will construct a vector for the recurrence by combining the values for
+ // the current and previous iterations. This is the required shuffle mask.
+ SmallVector<Constant *, 8> ShuffleMask(VF);
+ ShuffleMask[0] = Builder.getInt32(VF - 1);
+ for (unsigned I = 1; I < VF; ++I)
+ ShuffleMask[I] = Builder.getInt32(I + VF - 1);
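+ // For VF = 4 this mask is <3, 4, 5, 6>: lane 3 of the first shuffle operand
+ // (the previous iteration's value) followed by lanes 0..2 of the second
+ // (shufflevector numbers the second operand's lanes starting at VF).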
+
+ // The vector from which to take the initial value for the current iteration
+ // (actual or unrolled). Initially, this is the vector phi node.
+ Value *Incoming = VecPhi;
+
+ // Shuffle the current and previous vector and update the vector parts.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
+ auto *Shuffle =
+ VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ConstantVector::get(ShuffleMask))
+ : Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
+ }
+
+ // Fix the latch value of the new recurrence in the vector loop.
+ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+
+ // Extract the last vector element in the middle block. This will be the
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *ExtractForScalar = Incoming;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second-to-last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When the loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
+ // of `Incoming`. This is analogous to the vectorized case above: extracting
+ // the second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+
+ // Fix the initial value of the original recurrence in the scalar loop.
+ Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ for (auto *BB : predecessors(LoopScalarPreHeader)) {
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
+ Start->addIncoming(Incoming, BB);
+ }
+
+ Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
+ Phi->setName("scalar.recur");
+
+ // Finally, fix users of the recurrence outside the loop. The users will need
+ // either the last value of the scalar recurrence or the last value of the
+ // vector recurrence we extracted in the middle block. Since the loop is in
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
+ // recurrence in the exit block, and then add an edge for the middle block.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getIncomingValue(0) == Phi) {
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ }
+ }
+}
+
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ Constant *Zero = Builder.getInt32(0);
+
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+
+ // Find the reduction identity value: zero for addition, or, and xor;
+ // one for multiplication; -1 for and.
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
+ }
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If tail is folded by masking, the vector value to leave the loop should be
+ // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+ // instead of the former.
+ if (Cost->foldTailByMasking()) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecLoopExitInst =
+ VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Sel = nullptr;
+ for (User *U : VecLoopExitInst->users()) {
+ if (isa<SelectInst>(U)) {
+ assert(!Sel && "Reduction exit feeding two selects");
+ Sel = U;
+ } else
+ assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+ }
+ assert(Sel && "Reduction exit feeds no select");
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+ }
+ }
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
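+ // For example (illustrative), with UF = 2 and an integer add reduction the
+ // loop below emits
+ //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
+ // before the remaining single vector is reduced to a scalar.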
+
+ // The middle block terminator has already been assigned a DebugLoc here (the
+ // OrigLoop's single latch terminator). We want the whole middle block to
+ // appear to execute on this line because: (a) it is all compiler generated,
+ // (b) these instructions are always executed after evaluating the latch
+ // conditional branch, and (c) other passes may add new predecessors which
+ // terminate on this line. This is the easiest way to ensure we don't
+ // accidentally cause an extra step back into the loop while debugging.
+ setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
+ ReducedPartRdx, "bin.rdx"),
+ RdxDesc.getFastMathFlags());
+ else
+ ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
+ RdxPart);
+ }
+
+ if (VF > 1) {
+ bool NoNaN = Legal->hasFunNoNaNAttr();
+ ReducedPartRdx =
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found a reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ } // end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getNumIncomingValues() == 1) {
+ auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+ // Non-instruction incoming values will have only one value.
+ unsigned LastLane = 0;
+ if (isa<Instruction>(IncomingValue))
+ LastLane = Cost->isUniformAfterVectorization(
+ cast<Instruction>(IncomingValue), VF)
+ ? 0
+ : VF - 1;
+ // Can be a loop invariant incoming value or the last scalar value to be
+ // extracted from the vectorized loop.
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ Value *lastIncomingValue =
+ getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+ LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
+ }
+ }
+}
+
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+ // The basic block and loop containing the predicated instruction.
+ auto *PredBB = PredInst->getParent();
+ auto *VectorLoop = LI->getLoopFor(PredBB);
+
+ // Initialize a worklist with the operands of the predicated instruction.
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+ // Holds instructions that we need to analyze again. An instruction may be
+ // reanalyzed if we don't yet know if we can sink it or not.
+ SmallVector<Instruction *, 8> InstsToReanalyze;
+
+ // Returns true if a given use occurs in the predicated block. Phi nodes use
+ // their operands in their corresponding predecessor blocks.
+ auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+ auto *I = cast<Instruction>(U.getUser());
+ BasicBlock *BB = I->getParent();
+ if (auto *Phi = dyn_cast<PHINode>(I))
+ BB = Phi->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ return BB == PredBB;
+ };
+
+ // Iteratively sink the scalarized operands of the predicated instruction
+ // into the block we created for it. When an instruction is sunk, its
+ // operands are then added to the worklist. The algorithm ends when a full
+ // pass through the worklist doesn't sink a single instruction.
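+ //
+ // For example (illustrative): a getelementptr whose only use is a scalarized
+ // load inside the predicated block is moved into that block; its operands
+ // are then added to the worklist and considered for sinking in turn.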
+ bool Changed;
+ do {
+ // Add the instructions that need to be reanalyzed to the worklist, and
+ // reset the changed indicator.
+ Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+ InstsToReanalyze.clear();
+ Changed = false;
+
+ while (!Worklist.empty()) {
+ auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+ // We can't sink an instruction if it is a phi node, is already in the
+ // predicated block, is not in the loop, or may have side effects.
+ if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+ !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ continue;
+
+ // It's legal to sink the instruction if all its uses occur in the
+ // predicated block. Otherwise, there's nothing to do yet, and we may
+ // need to reanalyze the instruction.
+ if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
+ InstsToReanalyze.push_back(I);
+ continue;
+ }
+
+ // Move the instruction to the beginning of the predicated block, and add
+ // its operands to the worklist.
+ I->moveBefore(&*PredBB->getFirstInsertionPt());
+ Worklist.insert(I->op_begin(), I->op_end());
+
+ // The sinking may have enabled other instructions to be sunk, so we will
+ // need to iterate.
+ Changed = true;
+ }
+ } while (Changed);
+}
+
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+ for (PHINode *OrigPhi : OrigPHIsToFix) {
+ PHINode *NewPhi =
+ cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+ unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+ SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
+ predecessors(OrigPhi->getParent()));
+ SmallVector<BasicBlock *, 2> VectorBBPredecessors(
+ predecessors(NewPhi->getParent()));
+ assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+ "Scalar and Vector BB should have the same number of predecessors");
+
+ // The insertion point in Builder may be invalidated by the time we get
+ // here. Force the Builder insertion point to something valid so that we do
+ // not run into issues during insertion point restore in
+ // getOrCreateVectorValue calls below.
+ Builder.SetInsertPoint(NewPhi);
+
+ // The predecessor order is preserved and we can rely on mapping between
+ // scalar and vector block predecessors.
+ for (unsigned i = 0; i < NumIncomingValues; ++i) {
+ BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+ // When looking up the new scalar/vector values to fix up, use incoming
+ // values from original phi.
+ Value *ScIncV =
+ OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+ // Scalar incoming value may need a broadcast
+ Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+ NewPhi->addIncoming(NewIncV, NewPredBB);
+ }
+ }
+}
+
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
+ unsigned VF) {
+ PHINode *P = cast<PHINode>(PN);
+ if (EnableVPlanNativePath) {
+ // Currently we enter here in the VPlan-native path for non-induction
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
+ // Create a vector phi with no operands - the vector phi operands will be
+ // set at the end of vector code generation.
+ Type *VecTy =
+ (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+ VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+ OrigPHIsToFix.push_back(P);
+
+ return;
+ }
+
+ assert(PN->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled elsewhere");
+
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
+ if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy =
+ (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ Value *EntryPart = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ }
+ return;
+ }
+
+ setDebugLocFromInst(Builder, P);
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
+
+ InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction:
+ case InductionDescriptor::IK_FpInduction:
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
+ case InductionDescriptor::IK_PtrInduction: {
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = Induction;
+ PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
+ // These are the scalar results. Notice that we don't generate vector GEPs
+ // because scalar GEPs result in better code.
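+ // For example (illustrative), with VF = 4 and UF = 1 a non-uniform pointer
+ // IV gets four scalar "next.gep" values at indices PtrInd + 0 .. PtrInd + 3;
+ // if the IV is uniform after vectorization, only the lane-0 value is created.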
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep =
+ emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+ SclrGep->setName("next.gep");
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+ }
+ }
+ return;
+ }
+ }
+}
+
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Non-zero divisors that are not compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so w/o predication.
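+/// For example (hypothetical source): in
+///   if (b[i] != 0) a[i] = x / b[i];
+/// the divisor is not a compile-time constant, so the scalarized division must
+/// stay predicated, whereas `x / 7` can be scalarized without predication.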
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
+}
+
+void InnerLoopVectorizer::widenInstruction(Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ case Instruction::PHI:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::GetElementPtr: {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
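+ // For example (illustrative): %g = getelementptr i32, i32* %base, i64 %iv
+ // with a loop-invariant %base keeps %base scalar, widens %iv into a vector
+ // of indices, and produces a GEP whose result is <VF x i32*> (for VF > 1).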
+ auto *GEP = cast<GetElementPtrInst>(&I);
+
+ if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr =
+ OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+ if (OrigLoop->isLoopInvariant(U.get()))
+ Indices.push_back(U.get());
+ else
+ Indices.push_back(getOrCreateVectorValue(U.get(), Part));
+ }
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP =
+ GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+ Indices)
+ : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
+ addMetadata(NewGEP, GEP);
+ }
+ }
+
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ setDebugLocFromInst(Builder, &I);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (Value *Op : I.operands())
+ Ops.push_back(getOrCreateVectorValue(Op, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V))
+ VecOp->copyIRFlags(&I);
+
+ // Use this vector value for all users of the original instruction.
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
+ Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
+ Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
+ Value *Sel =
+ Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
+ VectorLoopValueMap.setVectorValue(&I, Part, C);
+ addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Vectorize casts.
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ addMetadata(Cast, &I);
+ }
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ break;
+ setDebugLocFromInst(Builder, &I);
+
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
+
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // The flag shows whether we use an intrinsic or an ordinary call for the
+ // vectorized version of the instruction: is the intrinsic cheaper than the
+ // library call?
+ bool NeedToScalarize;
+ unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+ Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
+ Args.push_back(Arg);
+ }
+
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
+ }
+ }
+ assert(VectorF && "Can't create vector function.");
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
+
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, &I);
+ }
+
+ break;
+ }
+
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
+
+ // DT is not kept up-to-date for outer loop vectorization
+ if (EnableVPlanNativePath)
+ return;
+
+ // Update the dominator tree information.
+ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+ "Entry does not dominate exit.");
+
+ DT->addNewBlock(LoopMiddleBlock,
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
+ DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+ DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+}
+
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
+ assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
+ "Ptr is neither a value or pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
+
+ // A helper that evaluates a memory access's use of a pointer. If the use
+ // will be a scalar use, and the pointer is only used by memory accesses, we
+ // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
+ // instructions marked uniform-after-vectorization, (2) bitcast and
+ // getelementptr instructions used by memory accesses requiring a scalar use,
+ // and (3) pointer induction variables and their update instructions (we
+ // currently only scalarize these).
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+ // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
+ }
+ }
+ for (auto *I : ScalarPtrs)
+ if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
+ }
+
+ // (3) Add to the worklist all pointer induction variables and their update
+ // instructions.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we should
+ // no longer insert them into the worklist here.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+ continue;
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
+ }
+
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (ForcedScalar != ForcedScalars.end())
+ for (auto *I : ForcedScalar->second)
+ Worklist.insert(I);
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (llvm::all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // We already considered pointer induction variables, so there's no reason
+ // to look at their users again.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we
+ // should no longer skip over them here.
+ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+ continue;
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
+ }
+
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
+}
+
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ // We have already decided how to vectorize this instruction, get that
+ // result.
+ if (VF > 1) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ return WideningDecision == CM_Scalarize;
+ }
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ?
+ !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return mayDivideByZero(*I);
+ }
+ return false;
+}
+
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+ unsigned VF) {
+ assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+ assert(getWideningDecision(I, VF) == CM_Unknown &&
+ "Decision should not be set yet.");
+ auto *Group = getInterleavedAccessGroup(I);
+ assert(Group && "Must have a group.");
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = getMemInstValueType(I);
+ if (hasIrregularType(ScalarTy, DL, VF))
+ return false;
+
+ // Check if masking is required.
+ // A Group may need masking for one of two reasons: it resides in a block that
+ // needs predication, or it was decided to use masking to deal with gaps.
+ bool PredicatedAccessRequiresMasking =
+ Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ return true;
+
+ // If masked interleaving is required, we expect that the user/target had
+ // enabled it, because otherwise it either wouldn't have been created or
+ // it should have been invalidated by the CostModel.
+ assert(useMaskedInterleavedAccesses(TTI) &&
+ "Masked interleave-groups for predicated accesses are not enabled.");
+
+ auto *Ty = getMemInstValueType(I);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
+ : TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
+ // Get and ensure we have a valid memory instruction.
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ assert((LI || SI) && "Invalid memory instruction");
+
+ auto *Ptr = getLoadStorePointerOperand(I);
+
+ // In order to be widened, the pointer should be consecutive, first of all.
+ if (!Legal->isConsecutivePtr(Ptr))
+ return false;
+
+ // If the instruction is a store located in a predicated block, it will be
+ // scalarized.
+ if (isScalarWithPredication(I))
+ return false;
+
+  // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ if (hasIrregularType(ScalarTy, DL, VF))
+ return false;
+
+ return true;
+}
+
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
+ assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
+ "This function should not be visited twice for the same VF");
+
+  // Create an entry for this VF up front. Even if no uniform values are
+  // found, the (possibly empty) entry ensures we will not analyze this VF
+  // again: Uniforms.count(VF) will return 1.
+ Uniforms[VF].clear();
+
+ // We now know that the loop is vectorizable!
+ // Collect instructions inside the loop that will remain uniform after
+ // vectorization.
+
+ // Global values, params and instructions outside of current loop are out of
+ // scope.
+ auto isOutOfScope = [&](Value *V) -> bool {
+ Instruction *I = dyn_cast<Instruction>(V);
+ return (!I || !TheLoop->contains(I));
+ };
+
+ SetVector<Instruction *> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // Start with the conditional branch. If the branch condition is an
+ // instruction contained in the loop that is only used by the branch, it is
+ // uniform.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
+ Worklist.insert(Cmp);
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ }
+
+ // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
+ // are pointers that are treated like consecutive pointers during
+ // vectorization. The pointer operands of interleaved accesses are an
+ // example.
+ SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
+
+ // Holds pointer operands of instructions that are possibly non-uniform.
+ SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+
+ auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave);
+ };
+ // Iterate over the instructions in the loop, and collect all
+ // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
+ // that a consecutive-like pointer operand will be scalarized, we collect it
+ // in PossibleNonUniformPtrs instead. We use two sets here because a single
+ // getelementptr instruction can be used by both vectorized and scalarized
+ // memory instructions. For example, if a loop loads and stores from the same
+ // location, but the store is conditional, the store will be scalarized, and
+ // the getelementptr won't remain uniform.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ // If there's no pointer operand, there's nothing to do.
+ auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ if (!Ptr)
+ continue;
+
+ // True if all users of Ptr are memory accesses that have Ptr as their
+ // pointer operand.
+ auto UsersAreMemAccesses =
+ llvm::all_of(Ptr->users(), [&](User *U) -> bool {
+ return getLoadStorePointerOperand(U) == Ptr;
+ });
+
+ // Ensure the memory instruction will not be scalarized or used by
+ // gather/scatter, making its pointer operand non-uniform. If the pointer
+ // operand is used by any instruction other than a memory access, we
+ // conservatively assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
+ PossibleNonUniformPtrs.insert(Ptr);
+
+      // If the memory instruction will be vectorized and its pointer operand
+      // is consecutive-like or used by an interleaved access, the pointer
+      // operand should remain uniform.
+ else
+ ConsecutiveLikePtrs.insert(Ptr);
+ }
+
+ // Add to the Worklist all consecutive and consecutive-like pointers that
+ // aren't also identified as possibly non-uniform.
+ for (auto *V : ConsecutiveLikePtrs)
+ if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ Worklist.insert(V);
+ }
+
+  // Expand Worklist in topological order: whenever a new instruction
+  // is added, its users should already be inside Worklist. This ensures
+  // a uniform instruction will only be used by uniform instructions.
+ unsigned idx = 0;
+ while (idx != Worklist.size()) {
+ Instruction *I = Worklist[idx++];
+
+ for (auto OV : I->operand_values()) {
+ // isOutOfScope operands cannot be uniform instructions.
+ if (isOutOfScope(OV))
+ continue;
+      // First-order recurrence phis should typically be considered
+      // non-uniform.
+ auto *OP = dyn_cast<PHINode>(OV);
+ if (OP && Legal->isFirstOrderRecurrence(OP))
+ continue;
+ // If all the users of the operand are uniform, then add the
+ // operand into the uniform worklist.
+ auto *OI = cast<Instruction>(OV);
+ if (llvm::all_of(OI->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return Worklist.count(J) ||
+ (OI == getLoadStorePointerOperand(J) &&
+ isUniformDecision(J, VF));
+ })) {
+ Worklist.insert(OI);
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ }
+ }
+ }
+
+ // Returns true if Ptr is the pointer operand of a memory access instruction
+ // I, and I is known to not require scalarization.
+ auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ };
+
+ // For an instruction to be added into Worklist above, all its users inside
+ // the loop should also be in Worklist. However, this condition cannot be
+ // true for phi nodes that form a cyclic dependence. We must process phi
+ // nodes separately. An induction variable will remain uniform if all users
+ // of the induction variable and induction variable update remain uniform.
+ // The code below handles both pointer and non-pointer induction variables.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are uniform after
+ // vectorization.
+ auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, Ind);
+ });
+ if (!UniformInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // uniform after vectorization.
+ auto UniformIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
+ if (!UniformIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain uniform.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
+ << "\n");
+ }
+
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
+}
+
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
+
+ if (Legal->getRuntimePointerChecking()->Need) {
+ reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+ "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+ "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ // FIXME: Avoid specializing for stride==1 instead of bailing out.
+ if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+ reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+ "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to insert the check anyway, since the
+    // resulting branch is still likely to be dynamically uniform if the
+    // target can skip it.
+ reportVectorizationFailure(
+ "Not inserting runtime ptr check for divergent target",
+ "runtime pointer checks needed. Not enabled for divergent target",
+ "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
+ return None;
+ }
+
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ if (TC == 1) {
+ reportVectorizationFailure("Single iteration (non) loop",
+ "loop trip count is one, irrelevant for vectorization",
+ "SingleIterationLoop", ORE, TheLoop);
+ return None;
+ }
+
+ switch (ScalarEpilogueStatus) {
+ case CM_ScalarEpilogueAllowed:
+ return computeFeasibleMaxVF(TC);
+ case CM_ScalarEpilogueNotNeededUsePredicate:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint/switch found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // fallthrough as a special case of OptForSize
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail if runtime checks are required, which are not good when optimising
+ // for size.
+ if (runtimeChecksRequired())
+ return None;
+ break;
+ }
+
+ // Now try the tail folding
+
+ // Invalidate interleave groups that require an epilogue if we can't mask
+ // the interleave-group.
+ if (!useMaskedInterleavedAccesses(TTI))
+ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+
+ unsigned MaxVF = computeFeasibleMaxVF(TC);
+ if (TC > 0 && TC % MaxVF == 0) {
+ // Accept MaxVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxVF;
+ }
+
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ if (Legal->prepareToFoldTailByMasking()) {
+ FoldTailByMasking = true;
+ return MaxVF;
+ }
+
+ if (TC == 0) {
+ reportVectorizationFailure(
+ "Unable to calculate the loop count due to complex control flow",
+ "unable to calculate the loop count due to complex control flow",
+ "UnknownLoopCountComplexCFG", ORE, TheLoop);
+ return None;
+ }
+
+ reportVectorizationFailure(
+ "Cannot optimize for size and vectorize at the same time.",
+ "cannot optimize for size and vectorize at the same time. "
+ "Enable vectorization of this loop with '#pragma clang loop "
+ "vectorize(enable)' when compiling with -Os/-Oz",
+ "NoTailLoopWithOptForSize", ORE, TheLoop);
+ return None;
+}
+
+unsigned
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
+  // the memory access that is most restrictive (involved in the smallest
+  // dependence distance).
+ unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+
+ WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
+
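+  // The default maximum VF is the number of elements of the widest type that
+  // fit into the widest safe register.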
+ unsigned MaxVectorSize = WidestRegister / WidestType;
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
+
+ assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
+ " into one vector!");
+ if (MaxVectorSize == 0) {
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
+ return MaxVectorSize;
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+ isPowerOf2_32(ConstTripCount)) {
+ // We need to clamp the VF to be the ConstTripCount. There is no point in
+ // choosing a higher viable VF as done in the loop below.
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
+ MaxVectorSize = ConstTripCount;
+ return MaxVectorSize;
+ }
+
+ unsigned MaxVF = MaxVectorSize;
+ if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ // Collect all viable vectorization factors larger than the default MaxVF
+ // (i.e. MaxVectorSize).
+ SmallVector<unsigned, 8> VFs;
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
+ VFs.push_back(VS);
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ bool Selected = true;
+ for (auto& pair : RUs[i].MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ if (pair.second > TargetNumRegisters)
+ Selected = false;
+ }
+ if (Selected) {
+ MaxVF = VFs[i];
+ break;
+ }
+ }
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
+ }
+ return MaxVF;
+}
+
+VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
+ float Cost = expectedCost(1).first;
+ const float ScalarCost = Cost;
+ unsigned Width = 1;
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+ if (ForceVectorization && MaxVF > 1) {
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+ // evaluation.
+ Cost = std::numeric_limits<float>::max();
+ }
+
+ for (unsigned i = 2; i <= MaxVF; i *= 2) {
+    // Notice that the vector loop needs to be executed fewer times, so
+ // we need to divide the cost of the vector loops by the width of
+ // the vector elements.
+ VectorizationCostTy C = expectedCost(i);
+ float VectorCost = C.first / (float)i;
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
+ if (!C.second && !ForceVectorization) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Not considering vector loop of width " << i
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ reportVectorizationFailure("There are conditional stores.",
+ "store that is conditionally executed prevents vectorization",
+ "ConditionalStore", ORE, TheLoop);
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
+ return Factor;
+}
+
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
+ unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Type *T = I.getType();
+
+ // Skip ignored values.
+ if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
+ continue;
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
+ continue;
+
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (auto *PN = dyn_cast<PHINode>(&I)) {
+ if (!Legal->isReductionVariable(PN))
+ continue;
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ T = RdxDesc.getRecurrenceType();
+ }
+
+ // Examine the stored values.
+ if (auto *ST = dyn_cast<StoreInst>(&I))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
+ continue;
+
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return {MinWidth, MaxWidth};
+}
+
+unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
+ unsigned LoopCost) {
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example, frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small, then we interleave to reduce the loop
+ // overhead.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
+
+ if (!isScalarEpilogueAllowed())
+ return 1;
+
+  // If a maximum safe dependence distance limits vectorization, do not
+  // interleave.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ return 1;
+
+ // Do not interleave loops with a relatively small trip count.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+ return 1;
+
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
+  // We divide by these values below, so make sure each register class has at
+  // least one local user.
+ for (auto& pair : R.MaxLocalUsers) {
+ pair.second = std::max(pair.second, 1U);
+ }
+
+ // We calculate the interleave count using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the interleaved instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two interleave count to simplify any
+ // addressing operations or alignment considerations.
+ // We also want power of two interleave counts to ensure that the induction
+ // variable of the vector loop wraps to zero, when tail is folded by masking;
+ // this currently happens when OptForSize, in which case IC is set to 1 above.
+ unsigned IC = UINT_MAX;
+
+ for (auto& pair : R.MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers of "
+ << TTI.getRegisterClassName(pair.first) << " register class\n");
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+ unsigned MaxLocalUsers = pair.second;
+ unsigned LoopInvariantRegs = 0;
+ if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
+ LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
+
+ unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur) {
+ TmpIC =
+ PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
+ }
+
+ IC = std::min(IC, TmpIC);
+ }
+
+ // Clamp the interleave ranges to reasonable counts.
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+
+ // Check if the user has overridden the max.
+ if (VF == 1) {
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
+ } else {
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
+ }
+
+ // If the trip count is constant, limit the interleave count to be less than
+ // the trip count divided by VF.
+ if (TC > 0) {
+ assert(TC >= VF && "VF exceeds trip count?");
+ if ((TC / VF) < MaxInterleaveCount)
+ MaxInterleaveCount = (TC / VF);
+ }
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0)
+ LoopCost = expectedCost(VF).first;
+
+ assert(LoopCost && "Non-zero loop cost expected");
+
+ // Clamp the calculated IC to be between the 1 and the max interleave count
+ // that the target and trip count allows.
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
+ else if (IC < 1)
+ IC = 1;
+
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
+ if (VF > 1 && !Legal->getReductionVars()->empty()) {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
+ }
+
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerChecking()->Need);
+
+ // We want to interleave small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and interleave until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+ // Interleave until store/load ports (estimated by max interleave count) are
+ // saturated.
+ unsigned NumStores = Legal->getNumStores();
+ unsigned NumLoads = Legal->getNumLoads();
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+
+ // If we have a scalar reduction (vector reductions are already dealt with
+ // by this point), we can increase the critical path length if the loop
+    // we're interleaving is inside another loop. Limit, by default, to 2, so
+    // the critical path only gets increased by one reduction operation.
+ if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
+ }
+
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
+ }
+
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
+ bool HasReductions = !Legal->getReductionVars()->empty();
+ if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ return 1;
+}
+
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+ // This function calculates the register usage by measuring the highest number
+ // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in a topological order in order to
+  // assign a number to each instruction. We use RPO to ensure that defs are
+ // met before their users. We assume that each instruction that has in-loop
+ // users starts an interval. We record every time that an in-loop value is
+ // used, so we have a list of the first and last occurrences of each
+ // instruction. Next, we transpose this data structure into a multi map that
+ // holds the list of intervals that *end* at a specific location. This multi
+ // map allows us to perform a linear search. We scan the instructions linearly
+ // and record each time that a new interval starts, by placing it in a set.
+ // If we find this value in the multi-map then we remove it from the set.
+ // The max register usage is the maximum size of the set.
+ // We also search for instructions that are defined outside the loop, but are
+ // used inside the loop. We need this number separately from the max-interval
+ // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage RU;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ using IntervalMap = DenseMap<Instruction *, unsigned>;
+
+ // Maps instruction to its index.
+ SmallVector<Instruction *, 64> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+ // Saves the list of instruction indices that are used in the loop.
+ SmallPtrSet<Instruction *, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value *, 8> LoopInvariants;
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ IdxToInstr.push_back(&I);
+
+ // Save the end location of each USE.
+ for (Value *U : I.operands()) {
+ auto *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr)
+ continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = IdxToInstr.size();
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ using InstrList = SmallVector<Instruction *, 2>;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
+
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
+
+ // Get the size of the widest register.
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ unsigned WidestRegister =
+ std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
+ auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+ if (Ty->isTokenTy())
+ return 0U;
+ unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+ return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ };
+
+ for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
+ Instruction *I = IdxToInstr[i];
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (Instruction *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Ignore instructions that are never used within the loop.
+ if (Ends.find(I) == Ends.end())
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live intervals.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
+ if (VFs[j] == 1) {
+ for (auto Inst : OpenIntervals) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ }
+ } else {
+ collectUniformsAndScalars(VFs[j]);
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
+ continue;
+ if (isScalarAfterVectorization(Inst, VFs[j])) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ } else {
+ unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
+ else
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ }
+ }
+
+ for (auto& pair : RegUsage) {
+ if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
+ MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
+ else
+ MaxUsages[j][pair.first] = pair.second;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+ for (auto Inst : LoopInvariants) {
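+      // An invariant uses a single scalar register at VF = 1; for vector
+      // factors it is counted at its widened (broadcast) width.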
+ unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
+ if (Invariant.find(ClassID) == Invariant.end())
+ Invariant[ClassID] = Usage;
+ else
+ Invariant[ClassID] += Usage;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
+ dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
+ << " item\n";
+ for (const auto &pair : MaxUsages[i]) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
+ << " item\n";
+ for (const auto &pair : Invariant) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ });
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
+}
+
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
+ // TODO: Cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially
+ // high enough value to practically disable vectorization with such
+ // operations, except where previously deployed legality hack allowed
+ // using very low cost values. This is to avoid regressions coming simply
+ // from moving "masked load/store" check from legality to cost model.
+ // Masked Load/Gather emulation was previously never allowed.
+  // Only a limited number of emulated Masked Store/Scatter operations
+  // were allowed.
+ assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
+ if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
+ return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+ // Find all the instructions that are scalar with predication in the loop and
+ // determine if it would be better to not if-convert the blocks they are in.
+ // If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ // Do not apply discount logic if hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+ Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+ unsigned VF) {
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
+ int Discount = 0;
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ isScalarAfterVectorization(I, VF))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (isUniformAfterVectorization(J, VF))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.find(I) != ScalarCosts.end())
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
+ unsigned VectorCost = getInstructionCost(I, VF).first;
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
+ unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+ true, false);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
+ else if (needsExtract(J, VF))
+ ScalarCost += TTI.getScalarizationOverhead(
+ ToVectorTy(J->getType(),VF), false, true);
+ }
+
+ // Scale the total scalar cost by block probability.
+ ScalarCost /= getReciprocalPredBlockProb();
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
+ return Discount;
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ VectorizationCostTy Cost;
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ VectorizationCostTy BlockCost;
+
+ // For each instruction in the old loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip ignored values.
+ if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
+ (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
+ continue;
+
+ VectorizationCostTy C = getInstructionCost(&I, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C.first = ForceTargetInstructionCost;
+
+ BlockCost.first += C.first;
+ BlockCost.second |= C.second;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
+ }
+
+ // If we are vectorizing a predicated block, it will have been
+ // if-converted. This means that the block's instructions (aside from
+ // stores and instructions that may divide by zero) will now be
+ // unconditionally executed. For the scalar case, we may not always execute
+ // the predicated block. Thus, scale the block's cost by the probability of
+ // executing it.
+ if (VF == 1 && blockNeedsPredication(BB))
+ BlockCost.first /= getReciprocalPredBlockProb();
+
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
+ }
+
+ return Cost;
+}
+
+/// Gets Address Access SCEV after verifying that the access pattern
+/// is loop invariant except the induction variable dependence.
+///
+/// This SCEV can be sent to the Target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+ Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ PredicatedScalarEvolution &PSE,
+ const Loop *TheLoop) {
+
+ auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return nullptr;
+
+ // We are looking for a gep with all loop invariant indices except for one
+ // which should be an induction variable.
+ auto SE = PSE.getSE();
+ unsigned NumOperands = Gep->getNumOperands();
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ Value *Opd = Gep->getOperand(i);
+ if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+ !Legal->isInductionVariable(Opd))
+ return nullptr;
+ }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
+ return PSE.getSCEV(Ptr);
+}
+
+static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
+}
+
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ unsigned VF) {
+ assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned AS = getLoadStoreAddressSpace(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+  // Figure out whether the access is strided and get the stride value
+  // if it's known at compile time.
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+ // Don't pass *I here, since it is scalar but will actually be part of a
+ // vectorized loop where the user of it is a vectorized instruction.
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment ? Alignment->value() : 0, AS);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (isPredicatedInst(I)) {
+ Cost /= getReciprocalPredBlockProb();
+
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially setting to a high enough value to practically disable
+ // vectorization with such operations.
+ Cost = 3000000;
+ }
+
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ unsigned Cost = 0;
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
+ Alignment ? Alignment->value() : 0, AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
+ Alignment ? Alignment->value() : 0, AS, I);
+
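+  // A negative stride means the access runs backwards through memory, so an
+  // extra reverse shuffle of the loaded or stored vector is needed.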
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ if (isa<LoadInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy,
+ Alignment ? Alignment->value() : 0, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ }
+ StoreInst *SI = cast<StoreInst>(I);
+
+ bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
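+  // A uniform store writes only the value of the last vector lane, so unless
+  // the stored value is loop-invariant we also pay for extracting lane VF - 1.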
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy,
+ Alignment ? Alignment->value() : 0, AS) +
+ (isLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+ VF - 1));
+}
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I),
+ Alignment ? Alignment->value() : 0);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ auto Group = getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
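+  // The wide vector type spans the whole group: VF lanes for each of the
+  // InterleaveFactor members.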
+ Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+ Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+
+ if (Group->isReverse()) {
+ // TODO: Add support for reversed masked interleaved access.
+ assert(!Legal->isMaskRequired(I) &&
+ "Reverse masked interleaved access not supported.");
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ }
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ unsigned VF) {
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
+ if (VF == 1) {
+ Type *ValTy = getMemInstValueType(I);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy,
+ Alignment ? Alignment->value() : 0, AS, I);
+ }
+ return getWideningCost(I, VF);
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = 1;
+
+ if (VF > 1 && isProfitableToScalarize(I, VF))
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.find(I) != InstSet.end())
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+ }
+
+ Type *VectorTy;
+ unsigned C = getInstructionCost(I, VF, VectorTy);
+
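+  // The type is considered "not scalarized" when the target splits the vector
+  // into fewer than VF parts, i.e. at least some lanes share a register.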
+ bool TypeNotScalarized =
+ VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ unsigned VF) {
+
+ if (VF == 1)
+ return 0;
+
+ unsigned Cost = 0;
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+ // Some targets keep addresses scalar.
+ if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+ return Cost;
+
+ // Some targets support efficient element stores.
+ if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+ return Cost;
+
+ // Collect operands to consider.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+ // Skip operands that do not require extraction/scalarization and do not incur
+ // any overhead.
+ return Cost + TTI.getOperandsScalarizationOverhead(
+ filterExtractingOperands(Ops, VF), VF);
+}
+
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+ if (VF == 1)
+ return;
+ NumPredStores = 0;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ // TODO: We should generate better code and update the cost model for
+ // predicated uniform stores. Today they are treated as any other
+ // predicated store (see added test cases in
+ // invariant-store-vectorization.ll).
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
+
+ if (Legal->isUniform(Ptr) &&
+ // Conditional loads and stores should be scalarized and predicated.
+ // isScalarWithPredication cannot be used here since masked
+ // gather/scatters are not considered scalar with predication.
+ !Legal->blockNeedsPredication(I.getParent())) {
+ // TODO: Avoid replicating loads and stores instead of
+ // relying on instcombine to remove them.
+ // Load: Scalar load + broadcast
+ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
+ unsigned Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (memoryInstructionCanBeWidened(&I, VF)) {
+ unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Expected consecutive stride.");
+ InstWidening Decision =
+ ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ setWideningDecision(&I, VF, Decision, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
+ unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
+ unsigned NumAccesses = 1;
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ if (interleavedAccessCanBeWidened(&I, VF))
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
+ unsigned GatherScatterCost =
+ isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
+ : std::numeric_limits<unsigned>::max();
+
+ unsigned ScalarizationCost =
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+ // Choose better solution for the current VF,
+ // write down this decision and use it during vectorization.
+ unsigned Cost;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+      // If the instruction belongs to an interleave group, the whole group
+ // receives the same decision. The whole group receives the cost, but
+ // the cost will actually be assigned to one instruction.
+ if (auto Group = getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+
+ // Make sure that any load of address and any other address computation
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
+ for (auto *I : AddrDefs)
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+      // Setting the desired widening decision should ideally be handled by
+      // cost functions, but since this involves the task of finding out
+ // if the loaded register is involved in an address computation, it is
+ // instead changed here when we know this is the case.
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
+ // Scalarize a widened load of address.
+ setWideningDecision(I, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(I, 1)));
+ else if (auto Group = getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
+ setWideningDecision(Member, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(Member, 1)));
+ }
+ }
+ } else
+ // Make sure I gets scalarized and a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
+}
+
+unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ unsigned VF,
+ Type *&VectorTy) {
+ Type *RetTy = I->getType();
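+  // If the value can be truncated to a narrower bit width, cost it at that
+  // width; instructions that stay scalar keep their scalar type.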
+ if (canTruncateToMinimalBitwidth(I, VF))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+ VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+    // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
+ if (VF > 1 && BI->isConditional() &&
+ (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
+ PredicatedBBsAfterVectorization.end() ||
+ PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
+ PredicatedBBsAfterVectorization.end()))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
+ Type *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+ (TTI.getCFInstrCost(Instruction::Br) * VF));
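+      // E.g., for VF = 4 this models extracting four i1 lanes from the
+      // vector compare plus four scalar branches around the predicated
+      // blocks (a sketch of the accounting, not an exact target cost).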
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
+ }
+ case Instruction::PHI: {
+ auto *Phi = cast<PHINode>(I);
+
+ // First-order recurrences are replaced by vector shuffles inside the loop.
+ // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
+ if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
+ return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ VectorTy, VF - 1, VectorType::get(RetTy, 1));
+
+ // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
+ // converted into select instructions. We require N - 1 selects per phi
+ // node, where N is the number of incoming values.
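+    // For example, a phi with three incoming values lowers to two nested
+    // selects, so it costs twice the select cost.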
+ if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
+ return (Phi->getNumIncomingValues() - 1) *
+ TTI.getCmpSelInstrCost(
+ Instruction::Select, ToVectorTy(Phi->getType(), VF),
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+
+ return TTI.getCFInstrCost(Instruction::PHI);
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // If we have a predicated instruction, it may not be executed for each
+ // vector lane. Get the scalarization cost and scale this amount by the
+ // probability of executing the predicated block. If the instruction is not
+ // predicated, we fall through to the next case.
+ if (VF > 1 && isScalarWithPredication(I)) {
+ unsigned Cost = 0;
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
+ Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+
+ // The cost of the non-predicated instruction.
+ Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
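+      // E.g., with the current reciprocal probability of 2, the cost is
+      // halved, modelling a 50% chance that each lane executes the block.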
+ return Cost / getReciprocalPredBlockProb();
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+    // Since we will replace the stride by 1, the multiplication should go away.
+ if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ return 0;
+ // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this is shifts on x86.
+ Value *Op2 = I->getOperand(1);
+ TargetTransformInfo::OperandValueProperties Op2VP;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TTI.getOperandInfo(Op2, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2VK = TargetTransformInfo::OK_UniformValue;
+
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
+ }
+ case Instruction::FNeg: {
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+ I->getOperand(0));
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+ VectorTy = ToVectorTy(ValTy, VF);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
+ unsigned Width = VF;
+ if (Width > 1) {
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == CM_Scalarize)
+ Width = 1;
+ }
+ VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ return getMemoryInstructionCost(I, VF);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ Trunc->getSrcTy(), Trunc);
+ }
+
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy =
+ VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ if (canTruncateToMinimalBitwidth(I, VF)) {
+ // This cast is going to be shrunk. This may remove the cast or it might
+      // turn it into a slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
+ if (I->getOpcode() == Instruction::Trunc) {
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ } else if (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt) {
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ }
+ }
+
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+ }
+ case Instruction::Call: {
+ bool NeedToScalarize;
+ CallInst *CI = cast<CallInst>(I);
+ unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ if (getVectorIntrinsicIDForCall(CI, TLI))
+ return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
+ return CallCost;
+ }
+ default:
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ getScalarizationOverhead(I, VF);
+ } // end of switch.
+}
+
+char LoopVectorize::ID = 0;
+
+static const char lv_name[] = "Loop Vectorization";
+
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
+ bool VectorizeOnlyWhenForced) {
+ return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
+}
+
+} // end namespace llvm
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check if the pointer operand of a load or store instruction is
+ // consecutive.
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
+ return Legal->isConsecutivePtr(Ptr);
+ return false;
+}
+
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
+ SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+ // Ignore type-casting instructions we identified during induction
+ // detection.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+}
+
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
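+// For example, with 256-bit vector registers and a widest scalar type of
+// 32 bits, this returns a VF of 8.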
+static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+ LoopVectorizationCostModel &CM) {
+ unsigned WidestType;
+ std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+ return WidestVectorRegBits / WidestType;
+}
+
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
+ unsigned VF = UserVF;
+  // Outer loop handling: outer loops may require CFG and instruction-level
+  // transformations before we can even evaluate whether vectorization is
+  // profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ // If the user doesn't provide a vectorization factor, determine a
+ // reasonable one.
+ if (!UserVF) {
+ VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+ LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+ // Make sure we have a VF > 1 for stress testing.
+ if (VPlanBuildStressTest && VF < 2) {
+ LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+ << "overriding computed VF.\n");
+ VF = 4;
+ }
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
+ << " to build VPlans.\n");
+ buildVPlans(VF, VF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return VectorizationFactor::Disabled();
+
+ return {VF, 0};
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return VectorizationFactor::Disabled();
+}
+
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
+  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
+ return None;
+
+ // Invalidate interleave groups if all blocks of loop will be predicated.
+ if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ !useMaskedInterleavedAccesses(*TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+ "which requires masked-interleaved support.\n");
+ CM.InterleaveInfo.reset();
+ }
+
+ if (UserVF) {
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ CM.selectUserVectorizationFactor(UserVF);
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
+ return {{UserVF, 0}};
+ }
+
+ unsigned MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF != 0 && "MaxVF is zero.");
+
+ for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ if (VF > 1)
+ CM.collectInstsToScalarize(VF);
+ }
+
+ buildVPlansWithVPRecipes(1, MaxVF);
+ LLVM_DEBUG(printPlans(dbgs()));
+ if (MaxVF == 1)
+ return VectorizationFactor::Disabled();
+
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
+}
+
+void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ erase_if(VPlans, [VF](const VPlanPtr &Plan) {
+ return !Plan->hasVF(VF);
+ });
+  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ VPCallbackILV CallbackILV(ILV);
+
+ VPTransformState State{BestVF, BestUF, LI,
+ DT, ILV.Builder, ILV.VectorLoopValueMap,
+ &ILV, CallbackILV};
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+ State.TripCount = ILV.getOrCreateTripCount(nullptr);
+
+ //===------------------------------------------------===//
+ //
+  // Notice: any optimization or new instruction that goes
+  // into the code below should also be implemented in
+  // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlans.front()->execute(&State);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
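+  // For example, a latch compare such as "icmp eq i64 %iv.next, %n" that
+  // only feeds the backedge branch becomes dead once the new control flow
+  // is in place.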
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
+ DeadInstructions.end();
+ }))
+ DeadInstructions.insert(IndUpdate);
+
+    // We also record as "Dead" the type-casting instructions we had identified
+ // during induction analysis. We don't need any handling for them in the
+ // vectorized loop because we have proven that, under a proper runtime
+ // test guarding the vectorized loop, the value of the phi, and the casted
+ // value of the phi, are the same. The last instruction in this casting chain
+ // will get its scalar/vector/widened def from the scalar/vector/widened def
+ // of the respective phi node. Any other casts in the induction def-use chain
+ // have no other uses outside the phi update chain, and will be ignored.
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ DeadInstructions.insert(Casts.begin(), Casts.end());
+ }
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
+
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *Ty = Val->getType();
+ assert(!Ty->isVectorTy() && "Val must be a scalar");
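+  // E.g., when unrolling by UF = 4 with Step = 1, successive parts call
+  // this with StartIdx = 0, 1, 2, 3 and receive Val, Val + 1, Val + 2 and
+  // Val + 3, respectively.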
+
+ if (Ty->isFloatingPointTy()) {
+ Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+ // Floating point operations had to be 'fast' to enable the unrolling.
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+ return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ }
+ Constant *C = ConstantInt::get(Ty, StartIdx);
+ return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+}
+
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
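+    // The loop ID now has the form (a sketch; node numbers are illustrative):
+    //   !0 = distinct !{!0, ..., !1}
+    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}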
+ }
+}
+
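+// For example, given Range = {2, 16} and a predicate that holds for VF = 2
+// and VF = 4 but not for VF = 8, Predicate(8) differs from Predicate(2), so
+// Range.End is clamped to 8 and the value of the predicate at Range.Start
+// (true) is returned.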
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
+ const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
+ assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
+ for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
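+///
+/// For example, with MinVF = 2 and MaxVF = 16, if the first call to
+/// buildVPlan() clamps its sub-range to {2, 8}, that VPlan covers VFs 2 and
+/// 4, and a second VPlan is then built for the remaining sub-range {8, 17}.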
+void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(buildVPlan(SubRange));
+ VF = SubRange.End;
+ }
+}
+
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VPValue *SrcMask = createBlockInMask(Src, Plan);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional())
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask);
+
+ if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
+ EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
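+  // E.g., the mask of the edge to the false successor of a block guarded
+  // by mask M, with branch condition C, is thus M & !C.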
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+
+ if (OrigLoop->getHeader() == BB) {
+ if (!CM.blockNeedsPredication(BB))
+ return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC.
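+    // E.g., with trip count 10 (BTC = 9) and VF = 4, the third vector
+    // iteration compares IV = <8, 9, 10, 11> against the broadcast of 9,
+    // yielding mask <1, 1, 0, 0>, which disables the two excess lanes.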
+ VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ return BlockMaskCache[BB] = BlockMask;
+ }
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
+
+VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
+ if (!IG)
+ return nullptr;
+
+ // Now check if IG is relevant for VF's in the given range.
+ auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
+ return [=](unsigned VF) -> bool {
+ return (VF >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ };
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
+ return nullptr;
+
+ // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
+  // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
+  // Otherwise it's an adjunct member of the IG; do not construct any Recipe.
+ assert(I == IG->getInsertPos() &&
+ "Generating a recipe for an adjunct member of an interleave group");
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPInterleaveRecipe(IG, Mask);
+}
+
+VPWidenMemoryInstructionRecipe *
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (VF == 1)
+ return false;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+ "Interleave memory opportunity should be caught earlier.");
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPWidenMemoryInstructionRecipe(*I, Mask);
+}
+
+VPWidenIntOrFpInductionRecipe *
+VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
+
+ return nullptr;
+ }
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
+ [&](Instruction *K) -> std::function<bool(unsigned)> {
+ return
+ [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ };
+
+ if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ cast<TruncInst>(I));
+ return nullptr;
+}
+
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi || Phi->getParent() == OrigLoop->getHeader())
+ return nullptr;
+
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ SmallVector<VPValue *, 2> Masks;
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VPValue *EdgeMask =
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ assert((EdgeMask || NumIncoming == 1) &&
+ "Multiple predecessors with one having a full mask");
+ if (EdgeMask)
+ Masks.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(Phi, Masks);
+}
+
+bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
+ if (IsPredicated)
+ return false;
+
+ auto IsVectorizableOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::Br:
+ case Instruction::Call:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::GetElementPtr:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::Load:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PHI:
+ case Instruction::PtrToInt:
+ case Instruction::SDiv:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::SRem:
+ case Instruction::Store:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UDiv:
+ case Instruction::UIToFP:
+ case Instruction::URem:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ return true;
+ }
+ return false;
+ };
+
+ if (!IsVectorizableOpcode(I->getOpcode()))
+ return false;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ return false;
+ }
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF)))
+ return false;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+      // The flag shows whether we use an intrinsic or a plain call for the
+      // vectorized version of the instruction.
+      // Is it beneficial to perform the intrinsic call rather than the lib call?
+ bool NeedToScalarize;
+ unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ }
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ assert(CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Scalarize &&
+ "Memory widening decisions should have been taken care by now");
+ return false;
+ }
+ return true;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return false;
+
+ // Success: widen this instruction. We optimize the common case where
+ // consecutive instructions can be represented by a single recipe.
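+  // E.g., a widenable add immediately followed by a widenable mul is
+  // appended to the same VPWidenRecipe rather than starting a new one.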
+ if (!VPBB->empty()) {
+ VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
+ if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
+ return true;
+ }
+
+ VPBB->appendRecipe(new VPWidenRecipe(I));
+ return true;
+}
+
+VPBasicBlock *VPRecipeBuilder::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan) {
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ Range);
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
+ auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+ // Finalize the recipe for Instr, first if it is not predicated.
+ if (!IsPredicated) {
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
+}
+
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Generate recipes to compute the block mask for this region.
+ VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *PHIRecipe =
+ Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
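+
+  // The resulting region is a triangle (a sketch):
+  //
+  //     <RegionName>.entry
+  //       |        \
+  //       |    <RegionName>.if
+  //       |        /
+  //     <RegionName>.continue
+  //
+  // where the entry branches on the block mask either into the predicated
+  // block or directly to the continue block.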
+
+ return Region;
+}
+
+bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan, VPBasicBlock *VPBB) {
+ VPRecipeBase *Recipe = nullptr;
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ return true;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ return true;
+
+ return false;
+}
+
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
+ unsigned MaxVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
+
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
+
+ // If the tail is to be folded by masking, the primary induction variable
+ // needs to be represented in VPlan for it to model early-exit masking.
+ // Also, both the Phi and the live-out instruction of each reduction are
+ // required in order to introduce a select between them in VPlan.
+ if (CM.foldTailByMasking()) {
+ NeedDef.insert(Legal->getPrimaryInduction());
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ NeedDef.insert(Reduction.first);
+ NeedDef.insert(Reduction.second.getLoopExitInstr());
+ }
+ }
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VF = SubRange.End;
+ }
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+  // scalar instead of a vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ auto Plan = std::make_unique<VPlan>(VPBB);
+
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
+ // Represent values that will have defs inside VPlan.
+ for (Value *V : NeedDef)
+ Plan->addVPValue(V);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ // Relevant instructions from basic block BB will be grouped into VPRecipe
+    // ingredients and used to fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ VPBB = FirstVPBBForBB;
+ Builder.setInsertPoint(VPBB);
+
+ std::vector<Instruction *> Ingredients;
+
+ // Organize the ingredients to vectorize from current basic block in the
+ // right order.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) ||
+ DeadInstructions.find(Instr) != DeadInstructions.end())
+ continue;
+
+ // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
+ // member of the IG, do not construct any Recipe for it.
+ const InterleaveGroup<Instruction> *IG =
+ CM.getInterleavedAccessGroup(Instr);
+ if (IG && Instr != IG->getInsertPos() &&
+ Range.Start >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(Instr, Range.Start) ==
+ LoopVectorizationCostModel::CM_Interleave) {
+ auto SinkCandidate = SinkAfterInverse.find(Instr);
+ if (SinkCandidate != SinkAfterInverse.end())
+ Ingredients.push_back(SinkCandidate->second);
+ continue;
+ }
+
+ // Move instructions to handle first-order recurrences, step 1: avoid
+ // handling this instruction until after we've handled the instruction it
+ // should follow.
+ auto SAIt = SinkAfter.find(Instr);
+ if (SAIt != SinkAfter.end()) {
+ LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
+ << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
+ SinkAfterInverse[SAIt->second] = Instr;
+ continue;
+ }
+
+ Ingredients.push_back(Instr);
+
+ // Move instructions to handle first-order recurrences, step 2: push the
+ // instruction to be sunk at its insertion point.
+ auto SAInvIt = SinkAfterInverse.find(Instr);
+ if (SAInvIt != SinkAfterInverse.end())
+ Ingredients.push_back(SAInvIt->second);
+ }
+
+ // Introduce each ingredient into VPlan.
+ for (Instruction *Instr : Ingredients) {
+ if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
+ continue;
+
+ // Otherwise, if all widening options failed, Instruction is to be
+ // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+ // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+  // may also be empty, such as the last one (VPBB), reflecting original
+  // basic blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
+ delete PreEntry;
+
+ // Finally, if tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
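+  // For a sum reduction, e.g., select(header-mask, Red, Phi) keeps the
+  // previous iteration's partial sum in lanes that the tail mask folds away.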
+ if (CM.foldTailByMasking()) {
+ Builder.setInsertPoint(VPBB);
+ auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ VPValue *Phi = Plan->getVPValue(Reduction.first);
+ VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+ }
+ }
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
+ unsigned VF = Range.Start;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
+ for (VF *= 2; VF < Range.End; VF *= 2) {
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+  // Outer loop handling: outer loops may require CFG and instruction-level
+  // transformations before we can even evaluate whether vectorization is
+  // profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ assert(!OrigLoop->empty());
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = std::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+ HCFGBuilder.buildHierarchicalCFG();
+
+ for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+ Plan->addVF(VF);
+
+ if (EnableVPlanPredication) {
+ VPlanPredicator VPP(*Plan);
+ VPP.predicate();
+
+    // Avoid running the transformation to recipes until masked code
+    // generation in the VPlan-native path is in place.
+ return Plan;
+ }
+
+ SmallPtrSet<Instruction *, 1> DeadInstructions;
+ VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ Plan, Legal->getInductionVars(), DeadInstructions);
+
+ return Plan;
+}
+
+Value* LoopVectorizationPlanner::VPCallbackILV::
+getOrCreateVectorValues(Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ if (User) {
+ O << ", ";
+ User->getOperand(0)->printAsOperand(O);
+ }
+ O << "\\l\"";
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << " +\n"
+ << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
+ for (auto &Instr : make_range(Begin, End))
+ State.ILV->widenInstruction(Instr);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+ State.ILV->widenIntOrFpInduction(IV, Trunc);
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
+ State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+
+ assert((User || NumIncoming == 1) &&
+ "Multiple predecessors with predecessors having a full mask");
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 =
+ State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(User->getOperand(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Interleave group being replicated.");
+ if (!User)
+ return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+ if (State.Instance) { // Generate a single instance.
+ State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ // Insert scalar instance packing it into a vector.
+ if (AlsoPack && State.VF > 1) {
+ // If we're constructing lane 0, initialize to start from undef.
+ if (State.Instance->Lane == 0) {
+ Value *Undef =
+ UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ }
+ State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ }
+ return;
+ }
+
+  // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case only the first lane of each UF
+  // part is generated.
+ unsigned EndLane = IsUniform ? 1 : State.VF;
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ Value *ConditionBit = nullptr;
+ if (!User) // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+ else {
+ VPValue *BlockInMask = User->getOperand(0);
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ }
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
+ Instruction *ScalarPredInst = cast<Instruction>(
+ State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+ if (!User)
+ return State.ILV->vectorizeMemoryInstruction(&Instr);
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+}
+
+static ScalarEpilogueLowering
+getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ else if (PreferPredicateOverEpilog || Hints.getPredicate())
+ SEL = CM_ScalarEpilogueNotNeededUsePredicate;
+
+ return SEL;
+}
+
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows applying
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ const unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
+
+ // If we are stress testing VPlan builds, do not attempt to generate vector
+ // code. Masked vector code generation support will follow soon.
+ // Also, do not attempt to vectorize if no vector code will be produced.
+ if (VPlanBuildStressTest || EnableVPlanPredication ||
+ VectorizationFactor::Disabled() == VF)
+ return false;
+
+ LVP.setBestPlan(VF.Width, 1);
+
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
+ &CM);
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+ << L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.executePlan(LB, DT);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+}
+
+bool LoopVectorizePass::processLoop(Loop *L) {
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated as OptimizationRemarkAnalysis. Remarks
+ // generated as OptimizationRemark and OptimizationRemarkMissed are
+  // less verbose, reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
+ &Requirements, &Hints, DB, AC);
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, BFI, PSI, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
+
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ LLVM_DEBUG(dbgs() << "\n");
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ }
+ }
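+ // For instance (an illustrative source-level example), a loop such as
+ //   for (int i = 0; i < 4; ++i) a[i] += b[i];
+ // has a tiny known trip count; unless vectorization is explicitly forced,
+ // it is only worth vectorizing when the scalar epilogue can be avoided
+ // entirely.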
+
+ // Check the function attributes to see if implicit floats are allowed.
+ // FIXME: This check doesn't seem like it could possibly be correct -- what
+ // if the loop is an integer loop and the vector instructions selected are
+ // purely integer vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check if the target supports potentially unsafe FP vectorization.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ reportVectorizationFailure(
+ "Potentially unsafe FP op prevents vectorization",
+ "loop not vectorized due to unsafe FP support.",
+ "UnsafeFP", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved) {
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
+
+ VectorizationFactor VF = VectorizationFactor::Disabled();
+ unsigned IC = 1;
+ unsigned UserIC = Hints.getInterleave();
+
+ if (MaybeVF) {
+ VF = *MaybeVF;
+ // Select the interleave count.
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+ }
+
+ // Identify the diagnostic messages that should be produced.
+ std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ if (VF.Width == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg = std::make_pair(
+ "VectorizationNotBeneficial",
+ "the cost-model indicates that vectorization is not beneficial");
+ VectorizeLoop = false;
+ }
+
+ if (!MaybeVF && UserIC > 1) {
+ // Tell the user interleaving was avoided up-front, despite being explicitly
+ // requested.
+ LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
+ "interleaving should be avoided up front\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingAvoided",
+ "Ignoring UserIC, because interleaving was avoided up front");
+ InterleaveLoop = false;
+ } else if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingNotBeneficial",
+ "the cost-model indicates that interleaving is not beneficial");
+ InterleaveLoop = false;
+ if (UserIC == 1) {
+ IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
+ IntDiagMsg.second +=
+ " and is explicitly disabled or interleave count is set to 1";
+ }
+ } else if (IC > 1 && UserIC == 1) {
+ // Tell the user interleaving is beneficial, but it is explicitly disabled.
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = std::make_pair(
+ "InterleavingBeneficialButDisabled",
+ "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1");
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+ // Do not vectorize or interleave the loop.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ } else if (VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ LVP.setBestPlan(VF.Width, IC);
+
+ using namespace ore;
+ bool DisableRuntimeUnroll = false;
+ MDNode *OrigLoopID = L->getLoopID();
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not worthwhile to vectorize the loop (it is
+ // legal, but the cost model chose a width of 1), then interleave it.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
+ &CM);
+ LVP.executePlan(Unroller, DT);
+
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+ } else {
+ // If we decided that it is *worthwhile* to vectorize the loop, then do it.
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM);
+ LVP.executePlan(LB, DT);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling a scalar loop when there are
+ // no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+
+ // Report the vectorization decision.
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
+ }
+
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID.hasValue()) {
+ L->setLoopID(RemainderLoopID.getValue());
+ } else {
+ if (DisableRuntimeUnroll)
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+ }
+
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+}
+
+bool LoopVectorizePass::runImpl(
+ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
+ SE = &SE_;
+ LI = &LI_;
+ TTI = &TTI_;
+ DT = &DT_;
+ BFI = &BFI_;
+ TLI = TLI_;
+ AA = &AA_;
+ AC = &AC_;
+ GetLAA = &GetLAA_;
+ DB = &DB_;
+ ORE = &ORE_;
+ PSI = PSI_;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ TTI->getMaxInterleaveFactor(1) < 2)
+ return false;
+
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+ // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |=
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ collectSupportedLoops(*L, LI, ORE, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= processLoop(L);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+PreservedAnalyses LoopVectorizePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ ProfileSummaryInfo *PSI =
+ MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ bool Changed =
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for the non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ }
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 000000000000..974eff9974d9
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,7147 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable
+// tree is found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+using namespace slpvectorizer;
+
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
+cl::opt<bool>
+ llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
+
+static cl::opt<int>
+ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize if you gain more than this "
+ "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+static cl::opt<unsigned> RecursionMaxDepth(
+ "slp-recursion-max-depth", cl::init(12), cl::Hidden,
+ cl::desc("Limit the recursion depth when building a vectorizable tree"));
+
+static cl::opt<unsigned> MinTreeSize(
+ "slp-min-tree-size", cl::init(3), cl::Hidden,
+ cl::desc("Only vectorize small trees if they are fully vectorizable"));
+
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
+/// Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+ BasicBlock *BB = I0->getParent();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ return false;
+
+ if (BB != I->getParent())
+ return false;
+ }
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are constants (but not
+/// globals/constant expressions).
+static bool allConstant(ArrayRef<Value *> VL) {
+ // Constant expressions and globals can't be vectorized like normal integer/FP
+ // constants.
+ for (Value *i : VL)
+ if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
+/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
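+/// For example, 'icmp eq' is commutative even though the generic
+/// Instruction::isCommutative() does not cover compare opcodes.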
+static bool isCommutative(Instruction *I) {
+ if (auto *IC = dyn_cast<CmpInst>(I))
+ return IC->isCommutative();
+ return I->isCommutative();
+}
+
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
+/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
+/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul.
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode { Unknown, Select, Permute };
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
+ if (Vec->getType()->getVectorNumElements() != Size)
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ // We can extractelement from an undef vector.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we need at most 2 different vector operands
+ // across all extractelement instructions.
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ CommonShuffleMode = Select;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
+ // If Vec2 was never used, we have a permutation of a single vector;
+ // otherwise we have a permutation of 2 vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+ /// The very first instruction in the list with the main opcode.
+ Value *OpValue = nullptr;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
+
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+};
+
+} // end anonymous namespace
+
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
+/// \returns analysis of the Instructions in \p VL described in
+/// InstructionsState: the opcode with which we suppose the whole list could
+/// be vectorized, even if its structure is diverse.
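+/// For example (illustrative), for VL = {add, sub, add, sub} this returns a
+/// state with MainOp set to the first 'add' and AltOp to the first 'sub';
+/// such an alternating sequence can be vectorized as one vector 'add', one
+/// vector 'sub', and a shufflevector blending the two.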
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
+ for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ }
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
+}
+
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[i]->getType() != Ty)
+ return false;
+
+ return true;
+}
+
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
+ if (Opcode == Instruction::ExtractElement) {
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
+ }
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
+}
+
+/// \returns True if the in-tree use also needs an extract. This refers to a
+/// possible scalar operand in a vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ return (CI->getArgOperand(i) == Scalar);
+ }
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
+static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
+
+namespace llvm {
+
+namespace slpvectorizer {
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+ struct TreeEntry;
+ struct ScheduleData;
+
+public:
+ using ValueList = SmallVector<Value *, 8>;
+ using InstrList = SmallVector<Instruction *, 16>;
+ using ValueSet = SmallPtrSet<Value *, 16>;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using ExtraValueToDebugLocsMap =
+ MapVector<Value *, SmallVector<Instruction *, 2>>;
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
+ : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+ DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+ }
+
+ /// Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+
+ /// Vectorize the tree but with the list of externally used values \p
+ /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+ /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
+
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
+ int getSpillCost() const;
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
+ int getTreeCost();
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
+ /// into account (and updating, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
+ MinBWs.clear();
+ }
+
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
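+ /// For example (illustrative): if three tree entries voted for the order
+ /// {1, 0, 3, 2} and NumOpsWantToKeepOriginalOrder is only 1, this returns
+ /// {1, 0, 3, 2}; if the original order wins (or ties), None is returned.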
+ Optional<ArrayRef<unsigned>> bestOrder() const {
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
+
+ return makeArrayRef(I->getFirst());
+ }
+
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V) const;
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+ /// \returns maximum vector register size as set by TTI or overridden by
+ /// cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+ /// \returns minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
+ /// Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable() const;
+
+ /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
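+ /// An illustrative IR sketch of such a pattern (value names hypothetical):
+ ///   %z0 = zext i8 %b0 to i32
+ ///   %z1 = zext i8 %b1 to i32
+ ///   %s1 = shl i32 %z1, 8
+ ///   %o0 = or i32 %z0, %s1
+ ///   ; ... likewise for %b2 (shl 16) and %b3 (shl 24) ...
+ /// The backend can fold the whole chain into a single i32 load, which a
+ /// partial vectorization of the 'or' reduction would make impossible.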
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
+ /// This structure holds any data we need about the edges being traversed
+ /// during buildTree_rec(). We keep track of:
+ /// (i) the user TreeEntry index, and
+ /// (ii) the index of the edge.
+ struct EdgeInfo {
+ EdgeInfo() = default;
+ EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
+ : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
+ /// The user TreeEntry.
+ TreeEntry *UserTE = nullptr;
+ /// The operand index of the use.
+ unsigned EdgeIdx = UINT_MAX;
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::EdgeInfo &EI) {
+ EI.dump(OS);
+ return OS;
+ }
+ /// Debug print.
+ void dump(raw_ostream &OS) const {
+ OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
+ << " EdgeIdx:" << EdgeIdx << "}";
+ }
+ LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
+#endif
+ };
+
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
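+ /// For example (illustrative), in (A - B) the APO of A is 'false' because
+ /// the LHS is never inverted, while the APO of B is 'true' because the
+ /// path from B to the root crosses the RHS of the '-'.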
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+ /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul).
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at each
+ /// lane that best matches the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ // Search all operands in Ops[*][Lane] for the one that best matches
+ // Ops[OpIdx][LastLane] and return its operand index.
+ // If no good match can be found, return None.
+ Optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes) {
+ unsigned NumOperands = getNumOperands();
+
+ // The operand of the previous lane at OpIdx.
+ Value *OpLastLane = getData(OpIdx, LastLane).V;
+
+ // Our strategy mode for OpIdx.
+ ReorderingMode RMode = ReorderingModes[OpIdx];
+
+ // The linearized opcode of the operand at OpIdx, Lane.
+ bool OpIdxAPO = getData(OpIdx, Lane).APO;
+
+ const unsigned BestScore = 2;
+ const unsigned GoodScore = 1;
+
+ // The best operand index and its score.
+ // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
+ // are using the score to differentiate between the two.
+ struct BestOpData {
+ Optional<unsigned> Idx = None;
+ unsigned Score = 0;
+ } BestOp;
+
+ // Iterate through all unused operands and look for the best.
+ for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+ // Get the operand at Idx and Lane.
+ OperandData &OpData = getData(Idx, Lane);
+ Value *Op = OpData.V;
+ bool OpAPO = OpData.APO;
+
+ // Skip already selected operands.
+ if (OpData.IsUsed)
+ continue;
+
+ // Skip if we are trying to move the operand to a position with a
+ // different opcode in the linearized tree form. This would break the
+ // semantics.
+ if (OpAPO != OpIdxAPO)
+ continue;
+
+ // Look for an operand that matches the current mode.
+ switch (RMode) {
+ case ReorderingMode::Load:
+ if (isa<LoadInst>(Op)) {
+ // Figure out which is left and right, so that we can check for
+ // consecutive loads
+ bool LeftToRight = Lane > LastLane;
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+ if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
+ cast<LoadInst>(OpRight), DL, SE))
+ BestOp.Idx = Idx;
+ }
+ break;
+ case ReorderingMode::Opcode:
+ // We accept both Instructions and Undefs, but with different scores.
+ if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
+ cast<Instruction>(Op)->getOpcode() ==
+ cast<Instruction>(OpLastLane)->getOpcode()) ||
+ (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
+ isa<UndefValue>(Op)) {
+ // An instruction has a higher score than an undef.
+ unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ }
+ break;
+ case ReorderingMode::Constant:
+ if (isa<Constant>(Op)) {
+ unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ }
+ break;
+ case ReorderingMode::Splat:
+ if (Op == OpLastLane)
+ BestOp.Idx = Idx;
+ break;
+ case ReorderingMode::Failed:
+ return None;
+ }
+ }
+
+ if (BestOp.Idx) {
+ getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ return BestOp.Idx;
+ }
+ // If we could not find a good match return None.
+ return None;
+ }
+
+ /// Helper for reorder(). \Returns the lane that we should start
+ /// reordering from. This is the one with the least number of operands
+ /// that can freely move about.
+ unsigned getBestLaneToStartReordering() const {
+ unsigned BestLane = 0;
+ unsigned Min = UINT_MAX;
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane) {
+ unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
+ if (NumFreeOps < Min) {
+ Min = NumFreeOps;
+ BestLane = Lane;
+ }
+ }
+ return BestLane;
+ }
+
+ /// \Returns the maximum number of operands that are allowed to be reordered
+ /// for \p Lane. This is used as a heuristic for selecting the first lane to
+ /// start operand reordering.
+ unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ unsigned CntTrue = 0;
+ unsigned NumOperands = getNumOperands();
+ // Operands with the same APO can be reordered. We therefore need to count
+ // how many of them we have for each APO, like this: Cnt[APO] = x.
+ // Since we only have two APOs, namely true and false, we can avoid using
+ // a map. Instead we can simply count the number of operands that
+ // correspond to one of them (in this case the 'true' APO), and calculate
+ // the other by subtracting it from the total number of operands.
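+ // For example (illustrative): an 'add' lane has APO == false for both
+ // operands, giving max(0, 2) == 2, while a 'sub' lane gives
+ // max(1, 1) == 1; reorder() therefore starts from 'sub'-like lanes,
+ // whose operands cannot move.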
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
+ if (getData(OpIdx, Lane).APO)
+ ++CntTrue;
+ unsigned CntFalse = NumOperands - CntTrue;
+ return std::max(CntTrue, CntFalse);
+ }
+
+ /// Go through the instructions in VL and append their operands.
+ void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ assert(!VL.empty() && "Bad VL");
+ assert((empty() || VL.size() == getNumLanes()) &&
+ "Expected same number of lanes");
+ assert(isa<Instruction>(VL[0]) && "Expected instruction");
+ unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ OpsVec.resize(NumOperands);
+ unsigned NumLanes = VL.size();
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ OpsVec[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+ // Our tree has just 3 nodes: the root and two operands.
+ // It is therefore trivial to get the APO. We only need to check the
+ // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
+ // RHS operand. The LHS operand of both add and sub is never attached
+ // to an inversese operation in the linearized form, therefore its APO
+ // is false. The RHS is true only if VL[Lane] is an inverse operation.
+
+ // Since operand reordering is performed on groups of commutative
+ // operations or alternating sequences (e.g., +, -), we can safely
+ // tell the inverse operations by checking commutativity.
+ bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+ bool APO = (OpIdx == 0) ? false : IsInverseOperation;
+ OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
+ APO, false};
+ }
+ }
+ }
+
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return OpsVec.size(); }
+
+ /// \returns the number of lanes.
+ unsigned getNumLanes() const { return OpsVec[0].size(); }
+
+ /// \returns the operand value at \p OpIdx and \p Lane.
+ Value *getValue(unsigned OpIdx, unsigned Lane) const {
+ return getData(OpIdx, Lane).V;
+ }
+
+ /// \returns true if the data structure is empty.
+ bool empty() const { return OpsVec.empty(); }
+
+ /// Clears the data.
+ void clear() { OpsVec.clear(); }
+
+ /// \Returns true if there are enough operands identical to \p Op to fill
+ /// the whole vector.
+ /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
+ bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ bool OpAPO = getData(OpIdx, Lane).APO;
+ for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ // This is set to true if we found a candidate for broadcast at Lane.
+ bool FoundCandidate = false;
+ for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
+ OperandData &Data = getData(OpI, Ln);
+ if (Data.APO != OpAPO || Data.IsUsed)
+ continue;
+ if (Data.V == Op) {
+ FoundCandidate = true;
+ Data.IsUsed = true;
+ break;
+ }
+ }
+ if (!FoundCandidate)
+ return false;
+ }
+ return true;
+ }
+
+ public:
+ /// Initialize with all the operands of the instruction vector \p RootVL.
+ VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : DL(DL), SE(SE) {
+ // Append all the operands of RootVL.
+ appendOperandsOfVL(RootVL);
+ }
+
+ /// \Returns a value vector with the operands across all lanes for the
+ /// operand at \p OpIdx.
+ ValueList getVL(unsigned OpIdx) const {
+ ValueList OpVL(OpsVec[OpIdx].size());
+ assert(OpsVec[OpIdx].size() == getNumLanes() &&
+ "Expected same num of lanes across all operands");
+ for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
+ OpVL[Lane] = OpsVec[OpIdx][Lane].V;
+ return OpVL;
+ }
+
+ // Performs operand reordering for 2 or more operands.
+ // The operands are stored in OpsVec[OpIdx][Lane] and are reordered in
+ // place.
+ void reorder() {
+ unsigned NumOperands = getNumOperands();
+ unsigned NumLanes = getNumLanes();
+ // Each operand has its own mode. We are using this mode to help us select
+ // the instructions for each lane, so that they match best with the ones
+ // we have selected so far.
+ SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+
+ // This is a greedy single-pass algorithm. We are going over each lane
+ // once and deciding on the best order right away with no back-tracking.
+ // However, in order to increase its effectiveness, we start with the lane
+ // that has operands that can move the least. For example, given the
+ // following lanes:
+ // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
+ // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
+ // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
+ // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
+ // we will start at Lane 1, since the operands of the subtraction cannot
+ // be reordered. Then we will visit the rest of the lanes in a circular
+ // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
+
+ // Find the first lane that we will start our search from.
+ unsigned FirstLane = getBestLaneToStartReordering();
+
+ // Initialize the modes.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ Value *OpLane0 = getValue(OpIdx, FirstLane);
+ // Keep track if we have instructions with all the same opcode on one
+ // side.
+ if (isa<LoadInst>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
+ else if (isa<Instruction>(OpLane0)) {
+ // Check if OpLane0 should be broadcast.
+ if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ ReorderingModes[OpIdx] = ReorderingMode::Opcode;
+ }
+ else if (isa<Constant>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Constant;
+ else if (isa<Argument>(OpLane0))
+ // Our best hope is a Splat. It may save some cost in some cases.
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ // NOTE: This should be unreachable.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ }
+
+ // If the initial strategy fails for any of the operand indexes, then we
+ // perform reordering again in a second pass. This helps avoid assigning
+ // high priority to the failed strategy, and should improve reordering for
+ // the non-failed operand indexes.
+ for (int Pass = 0; Pass != 2; ++Pass) {
+ // Skip the second pass if the first pass did not fail.
+ bool StrategyFailed = false;
+ // Mark all operand data as free to use.
+ clearUsed();
+ // We keep the original operand order for the FirstLane, so reorder the
+ // rest of the lanes. We are visiting the nodes in a circular fashion,
+ // using FirstLane as the center point and increasing the radius
+ // distance.
+ for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
+ // Visit the lane on the right and then the lane on the left.
+ for (int Direction : {+1, -1}) {
+ int Lane = FirstLane + Direction * Distance;
+ if (Lane < 0 || Lane >= (int)NumLanes)
+ continue;
+ int LastLane = Lane - Direction;
+ assert(LastLane >= 0 && LastLane < (int)NumLanes &&
+ "Out of bounds");
+ // Look for a good match for each operand.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ // Search for the operand that best matches the one at LastLane.
+ Optional<unsigned> BestIdx =
+ getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ // By not selecting a value, we allow the operands that follow to
+ // select a better matching value. We will get a non-null value in
+ // the next run of getBestOperand().
+ if (BestIdx) {
+ // Swap the current operand with the one returned by
+ // getBestOperand().
+ swap(OpIdx, BestIdx.getValue(), Lane);
+ } else {
+ // We failed to find a best operand, set mode to 'Failed'.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ // Enable the second pass.
+ StrategyFailed = true;
+ }
+ }
+ }
+ }
+ // Skip second pass if the strategy did not fail.
+ if (!StrategyFailed)
+ break;
+ }
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
+ switch (RMode) {
+ case ReorderingMode::Load:
+ return "Load";
+ case ReorderingMode::Opcode:
+ return "Opcode";
+ case ReorderingMode::Constant:
+ return "Constant";
+ case ReorderingMode::Splat:
+ return "Splat";
+ case ReorderingMode::Failed:
+ return "Failed";
+ }
+ llvm_unreachable("Unimplemented Reordering Type");
+ }
+
+ LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
+ raw_ostream &OS) {
+ return OS << getModeStr(RMode);
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
+ printMode(RMode, dbgs());
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
+ return printMode(RMode, OS);
+ }
+
+ LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
+ const unsigned Indent = 2;
+ unsigned Cnt = 0;
+ for (const OperandDataVec &OpDataVec : OpsVec) {
+ OS << "Operand " << Cnt++ << "\n";
+ for (const OperandData &OpData : OpDataVec) {
+ OS.indent(Indent) << "{";
+ if (Value *V = OpData.V)
+ OS << *V;
+ else
+ OS << "null";
+ OS << ", APO:" << OpData.APO << "}\n";
+ }
+ OS << "\n";
+ }
+ return OS;
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+ };
+
+ /// Checks if the instruction is marked for deletion.
+ bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+ /// Marks the values' operands for later deletion by replacing them with
+ /// Undefs.
+ void eraseInstructions(ArrayRef<Value *> AV);
+
+ ~BoUpSLP();
+
+private:
+ /// Checks if all users of \p I are the part of the vectorization tree.
+ bool areAllUsersVectorized(Instruction *I) const;
+
+ /// \returns the cost of the vectorizable entry.
+ int getEntryCost(TreeEntry *E);
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
+ const EdgeInfo &EI);
+
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+ /// non-identity permutation that allows to reuse extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
+ int getGatherCost(ArrayRef<Value *> VL) const;
+
+ /// Set the Builder insert point to one after the last instruction in
+ /// the bundle.
+ void setInsertPointAfterBundle(TreeEntry *E);
+
+ /// \returns a vector from a collection of scalars in \p VL.
+ Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree() const;
+
+ /// Reorder commutative or alt operands to get better probability of
+ /// generating vectorized code.
+ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE);
+ struct TreeEntry {
+ using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
+ TreeEntry(VecTreeTy &Container) : Container(Container) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
+ }
+
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue = nullptr;
+
+ /// Do we need to gather this sequence?
+ bool NeedToGather = false;
+
+ /// Does this sequence require some shuffling?
+ SmallVector<unsigned, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
+ ArrayRef<unsigned> ReorderIndices;
+
+ /// Points back to the VectorizableTree.
+ ///
+ /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ VecTreeTy &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<EdgeInfo, 1> UserTreeIndices;
+
+ /// The index of this treeEntry in VectorizableTree.
+ int Idx = -1;
+
+ private:
+    /// The operands of each instruction in each lane, indexed as
+    /// Operands[op_index][lane].
+ /// Note: This helps avoid the replication of the code that performs the
+ /// reordering of operands during buildTree_rec() and vectorizeTree().
+ SmallVector<ValueList, 2> Operands;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ public:
+ /// Set this bundle's \p OpIdx'th operand to \p OpVL.
+ void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
+ if (Operands.size() < OpIdx + 1)
+ Operands.resize(OpIdx + 1);
+ assert(Operands[OpIdx].size() == 0 && "Already resized?");
+ Operands[OpIdx].resize(Scalars.size());
+ for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
+ Operands[OpIdx][Lane] = OpVL[Lane];
+ }
+
+ /// Set the operands of this bundle in their original order.
+ void setOperandsInOrder() {
+ assert(Operands.empty() && "Already initialized?");
+ auto *I0 = cast<Instruction>(Scalars[0]);
+ Operands.resize(I0->getNumOperands());
+ unsigned NumLanes = Scalars.size();
+ for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ Operands[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ auto *I = cast<Instruction>(Scalars[Lane]);
+ assert(I->getNumOperands() == NumOperands &&
+ "Expected same number of operands");
+ Operands[OpIdx][Lane] = I->getOperand(OpIdx);
+ }
+ }
+ }
+
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ValueList &getOperand(unsigned OpIdx) {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ return Operands[OpIdx];
+ }
+
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return Operands.size(); }
+
+ /// \return the single \p OpIdx operand.
+ Value *getSingleOperand(unsigned OpIdx) const {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ assert(!Operands[OpIdx].empty() && "No operand available");
+ return Operands[OpIdx][0];
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const {
+ return getOpcode() != getAltOpcode();
+ }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return (getOpcode() == CheckedOpcode ||
+ getAltOpcode() == CheckedOpcode);
+ }
+
+ /// Chooses the correct key for scheduling data. If \p Op has the same (or
+ /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
+ /// \p OpValue.
+ Value *isOneOf(Value *Op) const {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && isOpcodeOrAlt(I))
+ return Op;
+ return MainOp;
+ }
+
+ void setOperations(const InstructionsState &S) {
+ MainOp = S.MainOp;
+ AltOp = S.AltOp;
+ }
+
+ Instruction *getMainOp() const {
+ return MainOp;
+ }
+
+ Instruction *getAltOp() const {
+ return AltOp;
+ }
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+    /// Updates the operations state of this entry if a reorder occurred.
+ bool updateStateIfReorder() {
+ if (ReorderIndices.empty())
+ return false;
+ InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
+ setOperations(S);
+ return true;
+ }
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << Idx << ".\n";
+ for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
+ dbgs() << "Operand " << OpI << ":\n";
+ for (const Value *V : Operands[OpI])
+ dbgs().indent(2) << *V << "\n";
+ }
+ dbgs() << "Scalars: \n";
+ for (Value *V : Scalars)
+ dbgs().indent(2) << *V << "\n";
+ dbgs() << "NeedToGather: " << NeedToGather << "\n";
+ dbgs() << "MainOp: " << *MainOp << "\n";
+ dbgs() << "AltOp: " << *AltOp << "\n";
+ dbgs() << "VectorizedValue: ";
+ if (VectorizedValue)
+ dbgs() << *VectorizedValue;
+ else
+ dbgs() << "NULL";
+ dbgs() << "\n";
+ dbgs() << "ReuseShuffleIndices: ";
+ if (ReuseShuffleIndices.empty())
+ dbgs() << "Emtpy";
+ else
+ for (unsigned ReuseIdx : ReuseShuffleIndices)
+ dbgs() << ReuseIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "ReorderIndices: ";
+ for (unsigned ReorderIdx : ReorderIndices)
+ dbgs() << ReorderIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "UserTreeIndices: ";
+ for (const auto &EInfo : UserTreeIndices)
+ dbgs() << EInfo << ", ";
+ dbgs() << "\n";
+ }
+#endif
+ };
+
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
+ bool Vectorized = (bool)Bundle;
+ VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
+ TreeEntry *Last = VectorizableTree.back().get();
+ Last->Idx = VectorizableTree.size() - 1;
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ Last->NeedToGather = !Vectorized;
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
+ Last->ReorderIndices = ReorderIndices;
+ Last->setOperations(S);
+ if (Vectorized) {
+ for (int i = 0, e = VL.size(); i != e; ++i) {
+ assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
+ ScalarToTreeEntry[VL[i]] = Last;
+ }
+ // Update the scheduler bundle to point to this TreeEntry.
+ unsigned Lane = 0;
+ for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ BundleMember->TE = Last;
+ BundleMember->Lane = Lane;
+ ++Lane;
+ }
+ assert((!Bundle.getValue() || Lane == VL.size()) &&
+ "Bundle and VL out of sync");
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
+
+ if (UserTreeIdx.UserTE)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+
+ return Last;
+ }
+
+ /// -- Vectorization State --
+ /// Holds all of the tree entries.
+ TreeEntry::VecTreeTy VectorizableTree;
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dumpVectorizableTree() const {
+ for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
+ VectorizableTree[Id]->dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
+
+ TreeEntry *getTreeEntry(Value *V) {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return I->second;
+ return nullptr;
+ }
+
+ const TreeEntry *getTreeEntry(Value *V) const {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return I->second;
+ return nullptr;
+ }
+
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+
+ /// A list of scalars that we found that we need to keep as scalars.
+ ValueSet MustGather;
+
+ /// This POD struct describes one external user in the vectorized tree.
+ struct ExternalUser {
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+
+    // The scalar in our function.
+ Value *Scalar;
+
+    // The user that uses the scalar.
+ llvm::User *User;
+
+    // The lane that the scalar belongs to.
+ int Lane;
+ };
+ using UserList = SmallVector<ExternalUser, 16>;
+
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
+ auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
+ It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ DenseMap<Instruction *, bool> DeletedInstructions;
+
+  /// A list of values that need to be extracted out of the tree.
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, it means that this Internal Scalar will be used later,
+ /// after vectorization.
+ UserList ExternalUses;
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> GatherSeq;
+
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
+
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
+ struct ScheduleData {
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData() = default;
+
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ OpValue = OpVal;
+ TE = nullptr;
+ Lane = -1;
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst = nullptr;
+
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle = nullptr;
+
+    /// Singly linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle = nullptr;
+
+    /// Singly linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore = nullptr;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID = 0;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority = 0;
+
+    /// The number of dependencies. Consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If InvalidDeps, the number of dependencies is not calculated yet.
+ int Dependencies = InvalidDeps;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = InvalidDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle = InvalidDeps;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled = false;
+
+ /// Opcode of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
+ /// The lane of this node in the TreeEntry.
+ int Lane = -1;
+ };
+
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
+#endif
+
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
+
+ /// Contains all scheduling data for a basic block.
+ struct BlockScheduling {
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ if (V == Key)
+ return getScheduleData(V);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second[Key];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ }
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ if (BundleMember->Inst != BundleMember->OpValue) {
+ BundleMember = BundleMember->NextInBundle;
+ continue;
+ }
+ // Handle the def-use chain dependencies.
+
+ // Decrement the unscheduled counter and insert to ready list if ready.
+ auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
+ doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
+ };
+
+ // If BundleMember is a vector bundle, its operands may have been
+        // reordered during buildTree(). We therefore need to get its operands
+ // through the TreeEntry.
+ if (TreeEntry *TE = BundleMember->TE) {
+ int Lane = BundleMember->Lane;
+ assert(Lane >= 0 && "Lane not set");
+ for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
+ DecrUnsched(I);
+ } else {
+ // If BundleMember is a stand-alone instruction, no operand reordering
+ // has taken place, so we directly access its operands.
+ for (Use &U : BundleMember->Inst->operands())
+ if (auto *I = dyn_cast<Instruction>(U.get()))
+ DecrUnsched(I);
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (P.second->SchedulingRegionID == SchedulingRegionID)
+ Action(P.second);
+ }
+
+ /// Put all instructions into the ReadyList which are ready for scheduling.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
+ }
+ });
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+ /// cyclic dependencies. This is only a dry-run, no instructions are
+ /// actually moved at this stage.
+ /// \returns the scheduling bundle. The returned Optional value is non-None
+ /// if \p VL is allowed to be scheduled.
+ Optional<ScheduleData *>
+ tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+
+ /// Allocates schedule data chunk.
+ ScheduleData *allocateScheduleDataChunks();
+
+ /// Extends the scheduling region so that V is inside the region.
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+    /// Sets all instructions in the scheduling region to un-scheduled.
+ void resetSchedule();
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to Instruction.
+ /// Note that the mapping survives during all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ /// Attaches ScheduleData to Instruction with the leading key.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart = nullptr;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd = nullptr;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion = nullptr;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion = nullptr;
+
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize = 0;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+    /// is incremented, which "removes" all ScheduleData from the region.
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ int SchedulingRegionID = 1;
+ };
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
+
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
+
+ using OrdersType = SmallVector<unsigned, 4>;
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
+
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+  /// reordering; if reordering is not required, the bundle is counted via \a
+  /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+ // Analysis and block reference.
+ Function *F;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+
+ /// Instruction builder to construct the vectorized tree.
+ IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be signed-extended, rather than zero-extended, back to its
+ /// original width.
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+};
+
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ using NodeRef = TreeEntry *;
+
+ using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+
+ /// Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<
+ ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
+ ContainerTy &VectorizableTree;
+
+ ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
+ ContainerTy &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return I->UserTE; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) {
+ return R.VectorizableTree[0].get();
+ }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ class nodes_iterator {
+ using ItTy = ContainerTy::iterator;
+ ItTy It;
+
+ public:
+ nodes_iterator(const ItTy &It2) : It(It2) {}
+ NodeRef operator*() { return It->get(); }
+ nodes_iterator operator++() {
+ ++It;
+ return *this;
+ }
+ bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
+ };
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
+
+} // end namespace llvm
+
+BoUpSLP::~BoUpSLP() {
+ for (const auto &Pair : DeletedInstructions) {
+    // Replace operands of ignored instructions with Undefs if they were
+    // marked for deletion.
+ if (Pair.getSecond()) {
+ Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+ Pair.getFirst()->replaceAllUsesWith(Undef);
+ }
+ Pair.getFirst()->dropAllReferences();
+ }
+ for (const auto &Pair : DeletedInstructions) {
+ assert(Pair.getFirst()->use_empty() &&
+ "trying to erase instruction with users.");
+ Pair.getFirst()->eraseFromParent();
+ }
+}
+
+void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
+ for (auto *V : AV) {
+ if (auto *I = dyn_cast<Instruction>(V))
+      eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
+  }
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+
+ // Collect the values that we need to extract from the tree.
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
+
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+ }
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst)
+ continue;
+
+ // Skip in-tree scalars that become vectors
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!UseEntry->NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
+ // Ignore users in the user ignore list.
+ if (is_contained(UserIgnoreList, UserInst))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+ }
+ }
+ }
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx) {
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+ InstructionsState S = getSameOpcode(VL);
+ if (Depth == RecursionMaxDepth) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (S.OpValue->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+  // If all of the operands are identical or constant, we have a simple
+  // solution.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Don't vectorize ephemeral values.
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check if this is a duplicate of another entry.
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+    // Record the reuse of the tree node. FIXME: currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (getTreeEntry(I)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is already in tree.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // If any of the scalars is marked as a value that needs to stay scalar, then
+ // we need to gather the scalars.
+  // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
+ for (Value *V : VL) {
+ if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ BasicBlock *BB = VL0->getParent();
+
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
+
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef)
+ BSRef = std::make_unique<BlockScheduling>(BB);
+
+ BlockScheduling &BS = *BSRef.get();
+
+ Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ if (!Bundle) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ assert((!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
+ for (unsigned j = 0; j < VL.size(); ++j)
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ Instruction *Term = dyn_cast<Instruction>(
+ cast<PHINode>(VL[j])->getIncomingValueForBlock(
+ PH->getIncomingBlock(i)));
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ // Keeps the reordered operands to avoid code duplication.
+ SmallVector<ValueList, 2> OperandsVec;
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
+ PH->getIncomingBlock(i)));
+ TE->setOperand(i, Operands);
+ OperandsVec.push_back(Operands);
+ }
+ for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
+ buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
+ return;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse) {
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ auto StoredCurrentOrderAndNum =
+ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++StoredCurrentOrderAndNum->getSecond();
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies,
+ StoredCurrentOrderAndNum->getFirst());
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+      // than 8-bit. Even if we have a packed struct {<i2, i2, i2, i2>}, LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+            // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++I->getSecond();
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ }
+ return;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ ValueList Left, Right;
+ if (cast<CmpInst>(VL0)->isCommutative()) {
+ // Commutative predicate - collect + sort operands of the instructions
+ // so that each side is more likely to have the same opcode.
+ assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ } else {
+ // Collect operands - commute if it uses the swapped predicate.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ if (Cmp->getPredicate() != P0)
+ std::swap(LHS, RHS);
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
+
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ if (cast<Instruction>(V)->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (Value *V : VL) {
+ auto Op = cast<Instruction>(V)->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return;
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+
+ ValueList Operands;
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(0));
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic.
+ CallInst *CI = cast<CallInst>(VL0);
+ // Check if this is an Intrinsic call or something that can be
+      // represented by an intrinsic call.
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (!isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+ Function *Int = CI->getCalledFunction();
+ unsigned NumArgs = CI->getNumArgOperands();
+ SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned j = 0; j != NumArgs; ++j)
+ if (hasVectorInstrinsicScalarOpd(ID, j))
+ ScalarArgs[j] = CI->getArgOperand(j);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != Int ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return;
+ }
+        // Some intrinsics have scalar arguments, and those arguments must be
+        // the same across the bundle for the calls to be vectorized.
+ for (unsigned j = 0; j != NumArgs; ++j) {
+ if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ Value *A1J = CI2->getArgOperand(j);
+ if (ScalarArgs[j] != A1J) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[j] << "!=" << A1J
+ << "\n");
+ return;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *V << '\n');
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
+ Operands.push_back(CI2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::ShuffleVector: {
+      // If this is not an alternate sequence of opcodes like add-sub,
+      // then do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ default:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+      VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
+ Instruction *E0 = cast<Instruction>(OpValue);
+  assert((E0->getOpcode() == Instruction::ExtractElement ||
+          E0->getOpcode() == Instruction::ExtractValue) &&
+         "Expected extractelement or extractvalue instruction");
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *Vec = E0->getOperand(0);
+
+ CurrentOrder.clear();
+
+  // We have to extract from a vector/aggregate with the same number of
+  // elements.
+ unsigned NElts;
+ if (E0->getOpcode() == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
+ if (Inst->getOperand(0) != Vec)
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
+ }
+
+ return ShouldKeepOrder;
+}
+
+bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
+ return I->hasOneUse() ||
+ std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ });
+}
+
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+ ArrayRef<Value*> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = VectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ int ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ if (E->NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ if (E->getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
+ int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ for (auto *V : VL) {
+            // If all users of the instruction are going to be vectorized and
+            // this instruction itself is not going to be vectorized, consider
+            // this instruction as dead and remove its cost from the final
+            // cost of the vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return ReuseShuffleCost + Cost;
+ }
+ }
+ return ReuseShuffleCost + getGatherCost(VL);
+ }
+ assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = E->getMainOp();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
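+  // E.g. a bundle {add, sub, add, sub} with alternating opcodes is treated as
+  // a ShuffleVector here and costed as two vector ops plus a blending shuffle
+  // (see the ShuffleVector case below).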
+ switch (ShuffleOrOp) {
+ case Instruction::PHI:
+ return 0;
+
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement:
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ if (!E->NeedToGather) {
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        Instruction *Inst = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, the instruction can be
+        // considered dead. Likewise, if it has only one user, that user will
+        // be vectorized for sure.
+        if (areAllUsersVectorized(Inst)) {
+          // Take credit for an instruction that will become dead.
+          if (Inst->hasOneUse()) {
+            Instruction *Ext = Inst->user_back();
+            if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+                all_of(Ext->users(),
+                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
+              // Use getExtractWithExtendCost() to calculate the cost of
+              // extractelement/ext pair.
+              DeadCost -= TTI->getExtractWithExtendCost(
+                  Ext->getOpcode(), Ext->getType(), VecTy, i);
+              // Add back the cost of s|zext which is subtracted separately.
+              DeadCost += TTI->getCastInstrCost(
+                  Ext->getOpcode(), Ext->getType(), Inst->getType(), Ext);
+ continue;
+ }
+ }
+ DeadCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
+ }
+ return DeadCost;
+ }
+ return ReuseShuffleCost + getGatherCost(VL);
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ int ScalarEltCost =
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VL.size() * ScalarEltCost;
+
+ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ int VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = ReuseShuffleCost +
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
+ }
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_PowerOf2;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
+ if (!CInt) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
+ break;
+ }
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
+ if (i == 0) {
+ CInt0 = CInt;
+ continue;
+ }
+ if (CInt0 != CInt)
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
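+      // E.g. {mul X, 8, mul Y, 8} keeps OK_UniformConstantValue with
+      // OP_PowerOf2; {mul X, 3, mul Y, 5} becomes
+      // OK_NonUniformConstantValue; and a non-constant second operand
+      // anywhere falls back to OK_AnyValue.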
+
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarEltCost = TTI->getArithmeticInstrCost(
+ E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarEltCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    int ScalarEltCost =
+        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0, VL0);
+    if (NeedToShuffleReuses) {
+      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+    }
+    int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+    int VecLdCost =
+        TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, VL0);
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+ }
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+    unsigned Alignment = cast<StoreInst>(VL0)->getAlignment();
+    int ScalarEltCost =
+        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
+    if (NeedToShuffleReuses) {
+      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+    }
+    int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+    int VecStCost =
+        TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, 0, VL0);
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ SmallVector<Type *, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
+ ScalarTys.push_back(CI->getArgOperand(op)->getType());
+
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ int ScalarEltCost =
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
+
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
+ int ScalarCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+    // VecCost is the sum of the cost of creating the two vector operations
+    // and the cost of the shuffle that blends their results.
+ int VecCost = 0;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
+ } else {
+ Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+ Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+ VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+ VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
+bool BoUpSLP::isFullyVectorizableTinyTree() const {
+  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+                    << VectorizableTree.size() << " is fully vectorizable.\n");
+
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather)
+ return true;
+
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Handle splat and all-constants stores.
+ if (!VectorizableTree[0]->NeedToGather &&
+ (allConstant(VectorizableTree[1]->Scalars) ||
+ isSplat(VectorizableTree[1]->Scalars)))
+ return true;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather)
+ return false;
+
+ return true;
+}
+
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+
+ // Look past the reduction to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
+ // shift-left-by-constant.
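+  // E.g. (hypothetical IR) with a reduced value like:
+  //   %z0 = zext i8 %l0 to i32
+  //   %z1 = zext i8 %l1 to i32
+  //   %s1 = shl i32 %z1, 8
+  //   %or = or i32 %s1, %z0
+  // following operand 0 peels the 'or' and the 'shl' and lands on %z1, a
+  // zero-extended load.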
+ Value *ZextLoad = FirstReduced;
+ while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input to the reduction is an extended load.
+ Value *LoadPtr;
+ if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return false;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+ LLVMContext &Context = FirstReduced->getContext();
+ if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ return false;
+
+ // Everything matched - assume that we can fold the whole sequence using
+ // load combining.
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
+ << *(cast<Instruction>(FirstReduced)) << "\n");
+
+ return true;
+}
+
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+  assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
+         "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
+int BoUpSLP::getSpillCost() const {
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
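+  // E.g. on a target whose vector registers are clobbered by calls, a
+  // non-intrinsic call between two bundles may force each live vector value
+  // into a stack slot and back.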
+ unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
+ int Cost = 0;
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
+ for (const auto &TEPtr : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
+ if (!Inst)
+ continue;
+
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ });
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ unsigned NumCalls = 0;
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
+ while (InstIt != PrevInstIt) {
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+      // Debug intrinsics don't impact the spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst)
+ NumCalls++;
+
+ ++PrevInstIt;
+ }
+
+ if (NumCalls) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(VectorType::get(II->getType(), BundleWidth));
+ Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ PrevInst = Inst;
+ }
+
+ return Cost;
+}
+
+int BoUpSLP::getTreeCost() {
+ int Cost = 0;
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
+
+ unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = *VectorizableTree[I].get();
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(
+ std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
+ [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+ return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
+ }))
+ continue;
+
+ int C = getEntryCost(&TE);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
+ << ".\n");
+ Cost += C;
+ }
+
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
+ int ExtractCost = 0;
+ for (ExternalUser &EU : ExternalUses) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(EU.User))
+ continue;
+
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ VecTy = VectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
+ }
+
+ int SpillCost = getSpillCost();
+ Cost += SpillCost + ExtractCost;
+
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ LLVM_DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(Type *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const {
+ int Cost = 0;
+ for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ if (!ShuffledIndices.count(i))
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+  // Iterate in reverse order so that, for each repeated value, the
+  // higher-index (typically more expensive) insertelement is kept and the
+  // lower-index duplicates become shuffle candidates.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
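+  // E.g. for VL = {A, B, A, C} the earlier A (index 0) is marked shuffled, so
+  // only three insertelements plus one permuting shuffle are costed.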
+ return getGatherCost(VecTy, ShuffledElements);
+}
+
+// Perform operand reordering on the instructions in VL and return the reordered
+// operands in Left and Right.
+void BoUpSLP::reorderInputsAccordingToOpcode(
+ ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ if (VL.empty())
+ return;
+ VLOperands Ops(VL, DL, SE);
+ // Reorder the operands in place.
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+}
+
+void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
+ // Get the basic block this bundle is in. All instructions in the bundle
+ // should be in this block.
+ auto *Front = E->getMainOp();
+ auto *BB = Front->getParent();
+ assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
+ [=](Value *V) -> bool {
+ auto *I = cast<Instruction>(V);
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ }));
+
+ // The last instruction in the bundle in program order.
+ Instruction *LastInst = nullptr;
+
+ // Find the last instruction. The common case should be that BB has been
+ // scheduled, and the last instruction is VL.back(). So we start with
+ // VL.back() and iterate over schedule data until we reach the end of the
+ // bundle. The end of the bundle is marked by null ScheduleData.
+ if (BlocksSchedules.count(BB)) {
+ auto *Bundle =
+ BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+ if (Bundle && Bundle->isPartOfBundle())
+ for (; Bundle; Bundle = Bundle->NextInBundle)
+ if (Bundle->OpValue == Bundle->Inst)
+ LastInst = Bundle->Inst;
+ }
+
+ // LastInst can still be null at this point if there's either not an entry
+ // for BB in BlocksSchedules or there's no ScheduleData available for
+ // VL.back(). This can be the case if buildTree_rec aborts for various
+ // reasons (e.g., the maximum recursion depth is reached, the maximum region
+ // size is reached, etc.). ScheduleData is initialized in the scheduling
+ // "dry-run".
+ //
+ // If this happens, we can still find the last instruction by brute force. We
+ // iterate forwards from Front (inclusive) until we either see all
+ // instructions in the bundle or reach the end of the block. If Front is the
+ // last instruction in program order, LastInst will be set to Front, and we
+ // will visit all the remaining instructions in the block.
+ //
+ // One of the reasons we exit early from buildTree_rec is to place an upper
+ // bound on compile-time. Thus, taking an additional compile-time hit here is
+ // not ideal. However, this should be exceedingly rare since it requires that
+ // we both exit early from buildTree_rec and that the bundle be out-of-order
+ // (causing us to iterate all the way to the end of the block).
+ if (!LastInst) {
+ SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
+ for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
+ if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
+ LastInst = &I;
+ if (Bundle.empty())
+ break;
+ }
+ }
+ assert(LastInst && "Failed to find last instruction in bundle");
+
+ // Set the insertion point after the last instruction in the bundle. Set the
+ // debug location to Front.
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+}
+
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+ Value *Vec = UndefValue::get(Ty);
+ // Generate the 'InsertElement' instruction.
+ for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+ Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+    if (auto *Insert = dyn_cast<InsertElementInst>(Vec)) {
+      GatherSeq.insert(Insert);
+      CSEBlocks.insert(Insert->getParent());
+
+ // Add to our 'need-to-extract' list.
+ if (TreeEntry *E = getTreeEntry(VL[i])) {
+ // Find which lane we need to extract.
+ int FoundLane = -1;
+ for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
+          // Is this the lane of the scalar that we are looking for?
+ if (E->Scalars[Lane] == VL[i]) {
+ FoundLane = Lane;
+ break;
+ }
+ }
+ assert(FoundLane >= 0 && "Could not find the correct lane");
+ if (!E->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(E->ReuseShuffleIndices.begin(),
+ llvm::find(E->ReuseShuffleIndices, FoundLane));
+ }
+        ExternalUses.push_back(ExternalUser(VL[i], Insert, FoundLane));
+ }
+ }
+ }
+
+ return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ InstructionsState S = getSameOpcode(VL);
+ if (S.getOpcode()) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+        // We need to get the vectorized value, but without the reuse shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<unsigned, 4> UniqueIdxs;
+ SmallSet<unsigned, 4> UsedIdxs;
+          for (unsigned Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ UniqueIdxs);
+ }
+ }
+ return V;
+ }
+ }
+ }
+
+ Type *ScalarTy = S.OpValue->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
+ ScalarTy = SI->getValueOperand()->getType();
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+    // Do not shuffle a single element, and do not shuffle if the number of
+    // unique values is not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
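+  // E.g. VL = {A, B, A, B} is gathered as UniqueValues = {A, B} and then
+  // re-expanded with ReuseShuffleIndicies = {0, 1, 0, 1}.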
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ Value *V = Gather(VL, VecTy);
+ if (!ReuseShuffleIndicies.empty()) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ return V;
+}
+
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<unsigned> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
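+  // E.g. Indices = {2, 0, 1} yields Mask = {1, 2, 0}; Mask is the inverse
+  // permutation of Indices (Mask[Indices[I]] == I).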
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Instruction *VL0 = E->getMainOp();
+ Type *ScalarTy = VL0->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+
+ if (E->NeedToGather) {
+ setInsertPointAfterBundle(E);
+ auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ Value *V = NewPhi;
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ if (!VisitedBBs.insert(IBB).second) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ Value *Vec = vectorizeTree(E->getOperand(i));
+ NewPhi->addIncoming(Vec, IBB);
+ }
+
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return V;
+ }
+
+ case Instruction::ExtractElement: {
+ if (!E->NeedToGather) {
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+ setInsertPointAfterBundle(E);
+ auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ExtractValue: {
+ if (!E->NeedToGather) {
+ LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(
+ NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
+ }
+ setInsertPointAfterBundle(E);
+ auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ setInsertPointAfterBundle(E);
+
+ Value *InVec = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ auto *CI = cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ setInsertPointAfterBundle(E);
+
+ Value *L = vectorizeTree(E->getOperand(0));
+ Value *R = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V;
+ if (E->getOpcode() == Instruction::FCmp)
+ V = Builder.CreateFCmp(P0, L, R);
+ else
+ V = Builder.CreateICmp(P0, L, R);
+
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ setInsertPointAfterBundle(E);
+
+ Value *Cond = vectorizeTree(E->getOperand(0));
+ Value *True = vectorizeTree(E->getOperand(1));
+ Value *False = vectorizeTree(E->getOperand(2));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FNeg: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateUnOp(
+ static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ setInsertPointAfterBundle(E);
+
+ Value *LHS = vectorizeTree(E->getOperand(0));
+ Value *RHS = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
+ RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ bool IsReorder = E->updateStateIfReorder();
+ if (IsReorder)
+ VL0 = E->getMainOp();
+ setInsertPointAfterBundle(E);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
+ Type *ScalarLoadTy = LI->getType();
+ unsigned AS = LI->getPointerAddressSpace();
+
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+
+    // The pointer operand uses an in-tree scalar, so we add the new BitCast
+    // to the ExternalUses list to make sure that an extract will be generated
+    // in the future.
+ Value *PO = LI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+
+ MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
+ LI = Builder.CreateLoad(VecTy, VecPtr);
+ if (!Alignment)
+ Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
+ LI->setAlignment(Alignment);
+ Value *V = propagateMetadata(LI, E->Scalars);
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ Mask, "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+ unsigned AS = SI->getPointerAddressSpace();
+
+ setInsertPointAfterBundle(E);
+
+ Value *VecValue = vectorizeTree(E->getOperand(0));
+ Value *ScalarPtr = SI->getPointerOperand();
+ Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
+ StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+
+ // The pointer operand uses an in-tree scalar, so add the new BitCast to
+ // ExternalUses to make sure that an extract will be generated in the
+ // future.
+ if (getTreeEntry(ScalarPtr))
+ ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+
+ if (!Alignment)
+ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+
+ ST->setAlignment(Align(Alignment));
+ Value *V = propagateMetadata(ST, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op0 = vectorizeTree(E->getOperand(0));
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ Value *OpVec = vectorizeTree(E->getOperand(j));
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E);
+
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (Function *FI = CI->getCalledFunction())
+ IID = FI->getIntrinsicID();
+
+ Value *ScalarArg = nullptr;
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ // Some intrinsics have scalar arguments. This argument should not be
+ // vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, j)) {
+ CallInst *CEI = cast<CallInst>(VL0);
+ ScalarArg = CEI->getArgOperand(j);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
+
+ Value *OpVec = vectorizeTree(E->getOperand(j));
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
+ Module *M = F->getParent();
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+ Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+    // The scalar argument uses an in-tree scalar, so we add the new
+    // vectorized call to the ExternalUses list to make sure that an extract
+    // will be generated in the future.
+ if (ScalarArg && getTreeEntry(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
+
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ RHS = vectorizeTree(E->getOperand(1));
+ } else {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ }
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+ }
+
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up main and alt scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OpScalars, AltScalars;
+ unsigned e = E->Scalars.size();
+ SmallVector<Constant *, 8> Mask(e);
+ for (unsigned i = 0; i < e; ++i) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == E->getAltOpcode()) {
+ Mask[i] = Builder.getInt32(e + i);
+ AltScalars.push_back(E->Scalars[i]);
+ } else {
+ Mask[i] = Builder.getInt32(i);
+ OpScalars.push_back(E->Scalars[i]);
+ }
+ }
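+      // E.g. for E->Scalars = {add, sub, add, sub} with e == 4, Mask becomes
+      // {0, 5, 2, 7}: even lanes take the result of V0 (the main opcode) and
+      // odd lanes take V1 (the alternate opcode).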
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return nullptr;
+}
+
+Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0]->Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0]->VectorizedValue = Trunc;
+ }
+
+  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+                    << " values.\n");
+
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
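+  // E.g. if the tree was demoted to i8 but the scalar is used externally as
+  // i32, each extracted element is sign- or zero-extended back to i32, as
+  // recorded in MinBWs.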
+
+ // Extract all of the elements with the external uses.
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
+
+    // Skip users that we already RAUW'd. This happens when one instruction
+    // has multiple uses of the same value.
+ if (User && !is_contained(Scalar->users(), User))
+ continue;
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
+ assert(!E->NeedToGather && "Extracting from a gather list");
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as an extra arg. Generate an
+    // ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ // Required to update internally referenced instructions.
+ Scalar->replaceAllUsesWith(Ex);
+ continue;
+ }
+
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ Instruction *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+#ifndef NDEBUG
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ // It is legal to delete users in the ignorelist.
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+ "Deleting out-of-tree value");
+ }
+ }
+#endif
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ eraseInstruction(cast<Instruction>(Scalar));
+ }
+ }
+
+ Builder.ClearInsertionPoint();
+
+ return VectorizableTree[0]->VectorizedValue;
+}
+
+void BoUpSLP::optimizeGatherSequence() {
+  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+                    << " gather sequence instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (Instruction *I : GatherSeq) {
+ if (isDeleted(I))
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(I->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+    // If the vector operand or the element that we insert into it is an
+    // instruction defined inside the loop, then we can't hoist the
+    // insertelement.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
+ continue;
+ if (Op1 && L->contains(Op1))
+ continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ I->moveBefore(PreHeader->getTerminator());
+ }
+
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ llvm::stable_sort(CSEWorkList,
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Perform O(N^2) search over the gather sequences and merge identical
+ // instructions. TODO: We can further optimize this scan if we split the
+ // instructions into different buckets based on the insert lane.
+ SmallVector<Instruction *, 16> Visited;
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = &*it++;
+ if (isDeleted(In))
+ continue;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+      for (Instruction *V : Visited) {
+        if (In->isIdenticalTo(V) &&
+            DT->dominates(V->getParent(), In->getParent())) {
+          In->replaceAllUsesWith(V);
+ eraseInstruction(In);
+ In = nullptr;
+ break;
+ }
+ }
+ if (In) {
+ assert(!is_contained(Visited, In));
+ Visited.push_back(In);
+ }
+ }
+ }
+ CSEBlocks.clear();
+ GatherSeq.clear();
+}
+
+// Groups the instructions into a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle becomes ready.
+Optional<BoUpSLP::ScheduleData *>
+BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
+ return nullptr;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V, S))
+ return None;
+ }
+
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+ // A bundle member was scheduled as single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions to a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
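+  // At this point the bundle members form a singly linked list through
+  // NextInBundle, and each member points back to the head via FirstInBundle.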
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It is seldom that this needs to be done a second time after adding the
+ // initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [](ScheduleData *SD) {
+ SD->clearDependencies();
+ });
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+ assert(Bundle && "Failed to find schedule bundle");
+
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+  // Now try to schedule the new bundle. The bundle becoming "ready" means
+  // that there are no cyclic dependencies and we can schedule it.
+  // Note that it's important that we don't actually "schedule" the bundle
+  // yet (see cancelScheduling).
+  while (!Bundle->isReady() && !ReadyInsts.empty()) {
+    ScheduleData *PickedSD = ReadyInsts.back();
+    ReadyInsts.pop_back();
+
+    if (PickedSD->isSchedulingEntity() && PickedSD->isReady()) {
+      schedule(PickedSD, ReadyInsts);
+ }
+ }
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL, S.OpValue);
+ return None;
+ }
+ return Bundle;
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
+ Value *OpValue) {
+ if (isa<PHINode>(OpValue))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(OpValue);
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
+ return true;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
+ return true;
+ };
+  if (CheckScheduleForI(I))
+ return true;
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+      CheckScheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return true;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
+ BasicBlock::iterator LowerEnd = BB->end();
+ while (true) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+          CheckScheduleForI(I);
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
+ return true;
+ }
+ ++UpIter;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+          CheckScheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
+ return true;
+ }
+ ++DownIter;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+ return true;
+}
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ SD = allocateScheduleDataChunks();
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID, I);
+
+ if (I->mayReadOrWriteMemory() &&
+ (!isa<IntrinsicInst>(I) ||
+ cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
+ ScheduleData *SD = WorkList.back();
+ WorkList.pop_back();
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+            // I'm not sure if this can ever happen. But we need to be safe.
+            // Counting an unresolvable dependency here keeps the
+            // instruction/bundle from ever becoming ready and eventually
+            // disables vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+
+ // Example, explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance let us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
+ }
+ }
+}
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ assert(isInSchedulingRegion(SD) &&
+ "ScheduleData not in scheduling region");
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ });
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+ if (!BS->ScheduleStart)
+ return;
+
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ assert(SD->isPartOfBundle() ==
+ (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer bundle mismatch");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ });
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
+
+unsigned BoUpSLP::getVectorElementSize(Value *V) const {
+ // If V is a store, just return the width of the stored value without
+ // traversing the expression tree. This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
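+  //
+  // For example, for %e = zext i8 %x to i32, where %x is loaded from memory,
+  // this returns 8 (the width of the load feeding %e) rather than 32.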
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+ Visited.insert(I);
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (!Visited.count(J))
+ Worklist.push_back(J);
+ }
+
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
+
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V.
+ if (!MaxWidth || FoundUnknownInst)
+ return DL->getTypeSizeInBits(V->getType());
+
+ // Otherwise, return the maximum width we found.
+ return MaxWidth;
+}
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
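+//
+// For example, given
+//   %x = zext i8 %a to i32
+//   %y = zext i8 %b to i32
+//   %s = add i32 %x, %y
+// the add and both extensions can be demoted, so the computation can be
+// performed on i8 values and widened again only at the roots.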
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
+ }
+
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
+
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ return false;
+ break;
+
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
+
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
+
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
+
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0]->Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
+
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
+
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &EntryPtr : VectorizableTree)
+ Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
+
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
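+  //
+  // For example, if the roots are i64 GEP indices whose values carry only 7
+  // significant bits, ValueTracking reports 57 sign bits, the scalars need
+  // 64 - 57 = 7 bits, and MaxBitWidth becomes 8.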
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
+ MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+ }
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
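+  // For example, a computed maximum width of 10 bits is rounded up to 16.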
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+  // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+  // Finally, map the values we can demote to the maximum bit width we
+  // computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+}
+
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses SLPVectorizerPass::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
+ TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
+
+  // If the target claims to have no vector registers, don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
+ return false;
+
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+
+  // Use the bottom-up SLP vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
+
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ LLVM_DEBUG(verifyFunction(F));
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned VecRegSize) {
+ const unsigned ChainLen = Chain.size();
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned VF = VecRegSize / Sz;
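+  // For example, with a 128-bit vector register and 32-bit stored elements,
+  // VF is 4, and each window of 4 consecutive stores below is a candidate.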
+
+ if (!isPowerOf2_32(Sz) || VF < 2)
+ return false;
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (llvm::any_of(Operands, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
+
+ R.buildTree(Operands);
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
+
+ int Cost = R.getTreeCost();
+
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+ << "\n");
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+
+ using namespace ore;
+
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ BoUpSLP &R) {
+ SetVector<StoreInst *> Heads;
+ SmallDenseSet<StoreInst *> Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ auto &&FindConsecutiveAccess =
+ [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {
+ if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ return false;
+
+ Tails.insert(Stores[Idx]);
+ Heads.insert(Stores[K]);
+ ConsecutiveChain[Stores[K]] = Stores[Idx];
+ return true;
+ };
+
+ // Do a quadratic search on all of the given stores in reverse order and find
+ // all of the pairs of stores that follow each other.
+ int E = Stores.size();
+ for (int Idx = E - 1; Idx >= 0; --Idx) {
+    // If a store has multiple consecutive store candidates, search according
+    // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
+    // This is because pairing with the immediately succeeding or preceding
+    // candidate usually creates the best chance to find an SLP vectorization
+    // opportunity.
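+    // For example, with E = 8 stores and Idx = 5, the candidates are visited
+    // in the order 4, 6, 3, 7, 2, 1, 0 until a consecutive pair is found.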
+ for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
+ if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
+ (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
+ break;
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (auto *SI : llvm::reverse(Heads)) {
+ if (Tails.count(SI))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ StoreInst *I = SI;
+ // Collect the chain into a list.
+ while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+ Size /= 2) {
+ if (vectorizeStoreChain(Operands, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
+
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
+ // Ignore store instructions that are volatile or have a pointer operand
+ // that doesn't point to a scalar type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ }
+
+ // Ignore getelementptr instructions that have more than one index, a
+ // constant index, or a pointer operand that doesn't point to a scalar
+ // type.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GEP->getPointerOperand()].push_back(GEP);
+ }
+ }
+}
+
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
+ Value *VL[] = { A, B };
+ return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+}
+
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ int UserCost, bool AllowReorder) {
+ if (VL.size() < 2)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
+
+ // Check that all of the parts are scalar instructions of the same type,
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
+ return false;
+
+ Instruction *I0 = cast<Instruction>(S.OpValue);
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
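+  // For example, with a 128-bit minimum vector register size and 32-bit
+  // elements, MinVF is 4; for a list of 6 such values, MaxVF is
+  // max(PowerOf2Floor(6), MinVF) = 4.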
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty)) {
+ // NOTE: the following will give user internal llvm type name, which may
+ // not be useful.
+ R.getORE()->emit([&]() {
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
+ });
+ return false;
+ }
+ }
+
+ bool Changed = false;
+ bool CandidateFound = false;
+ int MinCost = SLPCostThreshold;
+
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+ // No actual vectorization should happen, if number of parts is the same as
+ // provided vectorization factor (i.e. the scalar type is used for vector
+ // code during codegen).
+ auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
+
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (llvm::any_of(Ops, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here when
+ // there are exactly two operations.
+ assert(Ops.size() == 2);
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
+ int Cost = R.getTreeCost() - UserCost;
+ CandidateFound = true;
+ MinCost = std::min(MinCost, Cost);
+
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
+ }
+ }
+ }
+
+ if (!Changed && CandidateFound) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
+ });
+ } else if (!Changed) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
+ });
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
+ // Try to vectorize V.
+ if (tryToVectorizePair(Op0, Op1, R))
+ return true;
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ return true;
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ return true;
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ return true;
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ return true;
+ }
+ return false;
+}
+
+/// Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+/// vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+/// reduction. A pairwise reduction will generate a mask of
+/// <0,2,...> or <1,3,..> while a splitting reduction will generate
+/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
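+///
+/// For example, with VecLen = 8 and NumEltsToRdx = 4, a pairwise "left" mask
+/// is <0,2,4,6,undef,undef,undef,undef>, a pairwise "right" mask is
+/// <1,3,5,7,undef,undef,undef,undef>, and a splitting mask is
+/// <4,5,6,7,undef,undef,undef,undef>.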
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft,
+ IRBuilder<> &Builder) {
+ assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+ SmallVector<Constant *, 32> ShuffleMask(
+ VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+ if (IsPairwise)
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ else
+ // Move the upper half of the vector to the lower half.
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+ return ConstantVector::get(ShuffleMask);
+}
+
+namespace {
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaf.
+/// For example, this tree:
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ using ReductionOpsType = SmallVector<Value *, 16>;
+ using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+ ReductionOpsListType ReductionOps;
+ SmallVector<Value *, 32> ReducedVals;
+ // Use map vector to make stable output.
+ MapVector<Instruction *, Value *> ExtraArgs;
+
+ /// Kind of the reduction data.
+ enum ReductionKind {
+ RK_None, /// Not a reduction.
+ RK_Arithmetic, /// Binary reduction data.
+ RK_Min, /// Minimum reduction data.
+ RK_UMin, /// Unsigned minimum reduction data.
+ RK_Max, /// Maximum reduction data.
+ RK_UMax, /// Unsigned maximum reduction data.
+ };
+
+ /// Contains info about operation, like its opcode, left and right operands.
+ class OperationData {
+ /// Opcode of the instruction.
+ unsigned Opcode = 0;
+
+ /// Left operand of the reduction operation.
+ Value *LHS = nullptr;
+
+ /// Right operand of the reduction operation.
+ Value *RHS = nullptr;
+
+ /// Kind of the reduction operation.
+ ReductionKind Kind = RK_None;
+
+    /// True if a floating-point min/max reduction has no NaNs.
+ bool NoNaN = false;
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable() const {
+ return LHS && RHS &&
+ // We currently only support add/mul/logical && min/max reductions.
+ ((Kind == RK_Arithmetic &&
+ (Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
+ Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
+ Opcode == Instruction::And || Opcode == Instruction::Or ||
+ Opcode == Instruction::Xor)) ||
+ ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ (Kind == RK_Min || Kind == RK_Max)) ||
+ (Opcode == Instruction::ICmp &&
+ (Kind == RK_UMin || Kind == RK_UMax)));
+ }
+
+ /// Creates reduction operation with the current opcode.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ Value *Cmp = nullptr;
+ switch (Kind) {
+ case RK_Arithmetic:
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
+ Name);
+ case RK_Min:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
+ : Builder.CreateFCmpOLT(LHS, RHS);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ case RK_Max:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
+ : Builder.CreateFCmpOGT(LHS, RHS);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ case RK_UMin:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpULT(LHS, RHS);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpUGT(LHS, RHS);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+
+ public:
+ explicit OperationData() = default;
+
+ /// Construction for reduced values. They are identified by opcode only and
+ /// don't have associated LHS/RHS values.
+ explicit OperationData(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ Opcode = I->getOpcode();
+ }
+
+ /// Constructor for reduction operations with opcode and its left and
+ /// right operands.
+ OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
+ bool NoNaN = false)
+ : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
+ assert(Kind != RK_None && "One of the reduction operations is expected.");
+ }
+
+ explicit operator bool() const { return Opcode; }
+
+ /// Get the index of the first operand.
+ unsigned getFirstOperandIndex() const {
+ assert(!!*this && "The opcode is not set.");
+ switch (Kind) {
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 1;
+ case RK_Arithmetic:
+ case RK_None:
+ break;
+ }
+ return 0;
+ }
+
+ /// Total number of operands in the reduction operation.
+ unsigned getNumberOfOperands() const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return 2;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 3;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the operation has the same parent as \p P.
+ bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ if (!IsRedOp)
+ return I->getParent() == P;
+ switch (Kind) {
+ case RK_Arithmetic:
+ // Arithmetic reduction operation must be used once only.
+ return I->getParent() == P;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax: {
+        // The SelectInst must be used twice, while the condition op must
+        // have a single use only.
+ auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+ return I->getParent() == P && Cmp && Cmp->getParent() == P;
+ }
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+ /// Expected number of uses for reduction operations/reduced values.
+ bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->hasOneUse();
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return I->hasNUses(2) &&
+ (!IsReductionOp ||
+ cast<SelectInst>(I)->getCondition()->hasOneUse());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Initializes the list of reduction operations.
+ void initReductionOps(ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps.assign(1, ReductionOpsType());
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps.assign(2, ReductionOpsType());
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+ /// Add all reduction operations for the reduction instruction \p I.
+ void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps[0].emplace_back(I);
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+ ReductionOps[1].emplace_back(I);
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+
+ /// Checks if instruction is associative and can be vectorized.
+ bool isAssociative(Instruction *I) const {
+ assert(Kind != RK_None && *this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->isAssociative();
+ case RK_Min:
+ case RK_Max:
+ return Opcode == Instruction::ICmp ||
+ cast<Instruction>(I->getOperand(0))->isFast();
+ case RK_UMin:
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp &&
+ "Only integer compare operation is expected.");
+ return true;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable(Instruction *I) const {
+ return isVectorizable() && isAssociative(I);
+ }
+
+ /// Checks if two operation data are both a reduction op or both a reduced
+ /// value.
+ bool operator==(const OperationData &OD) {
+ assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
+ "One of the comparing operations is incorrect.");
+ return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
+ }
+ bool operator!=(const OperationData &OD) { return !(*this == OD); }
+ void clear() {
+ Opcode = 0;
+ LHS = nullptr;
+ RHS = nullptr;
+ Kind = RK_None;
+ NoNaN = false;
+ }
+
+ /// Get the opcode of the reduction operation.
+ unsigned getOpcode() const {
+ assert(isVectorizable() && "Expected vectorizable operation.");
+ return Opcode;
+ }
+
+ /// Get kind of reduction data.
+ ReductionKind getKind() const { return Kind; }
+ Value *getLHS() const { return LHS; }
+ Value *getRHS() const { return RHS; }
+ Type *getConditionType() const {
+ switch (Kind) {
+ case RK_Arithmetic:
+ return nullptr;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ return CmpInst::makeCmpResultType(LHS->getType());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p ReductionOps.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ const ReductionOpsListType &ReductionOps) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, ReductionOps[0]);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(SI->getCondition(), ReductionOps[0]);
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p I.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ Instruction *I) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op)) {
+ propagateIRFlags(SI->getCondition(),
+ cast<SelectInst>(I)->getCondition());
+ }
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+
+ TargetTransformInfo::ReductionFlags getFlags() const {
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ switch (Kind) {
+ case RK_Arithmetic:
+ break;
+ case RK_Min:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_Max:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_UMin:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_UMax:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ return Flags;
+ }
+ };
+
+ WeakTrackingVH ReductionRoot;
+
+ /// The operation data of the reduction operation.
+ OperationData ReductionData;
+
+ /// The operation data of the values we perform a reduction on.
+ OperationData ReducedValueData;
+
+ /// Should we model this reduction as a pairwise reduction tree or a tree that
+ /// splits the vector in halves and adds those halves.
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
+
+ static OperationData getOperationData(Value *V) {
+ if (!V)
+ return OperationData();
+
+ Value *LHS;
+ Value *RHS;
+ if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
+ return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
+ RK_Arithmetic);
+ }
+ if (auto *Select = dyn_cast<SelectInst>(V)) {
+ // Look for a min/max pattern.
+ if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+ } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+ } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+ } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+ } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it's very common to have
+ // pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
+ }
+ }
+ return OperationData(V);
+ }
+
+public:
+ HorizontalReduction() = default;
+
+ /// Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
+ "Thi phi needs to use the binary operator");
+
+ ReductionData = getOperationData(B);
+
+    // We could have an initial reduction that is not an add.
+ // r *= v1 + v2 + v3 + v4
+ // In such a case start looking for a tree rooted in the first '+'.
+ if (Phi) {
+ if (ReductionData.getLHS() == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<Instruction>(ReductionData.getRHS());
+ ReductionData = getOperationData(B);
+ } else if (ReductionData.getRHS() == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<Instruction>(ReductionData.getLHS());
+ ReductionData = getOperationData(B);
+ }
+ }
+
+ if (!ReductionData.isVectorizable(B))
+ return false;
+
+ Type *Ty = B->getType();
+ if (!isValidElementType(Ty))
+ return false;
+ if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ return false;
+
+ ReducedValueData.clear();
+ ReductionRoot = B;
+
+    // Post-order traverse the reduction tree starting at B. We only handle
+ // trees containing only binary operators.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
+ ReductionData.initReductionOps(ReductionOps);
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
+      unsigned EdgeToVisit = Stack.back().second++;
+ OperationData OpData = getOperationData(TreeN);
+ bool IsReducedValue = OpData != ReductionData;
+
+      // Postorder visit.
+      if (IsReducedValue || EdgeToVisit == OpData.getNumberOfOperands()) {
+ if (IsReducedValue)
+ ReducedVals.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionData.addReductionOps(TreeN, ReductionOps);
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVisit);
+ if (NextV != Phi) {
+ auto *I = dyn_cast<Instruction>(NextV);
+ OpData = getOperationData(I);
+        // Continue analysis if the next operand is a reduction operation or
+        // (possibly) a reduced value. If the reduced-value opcode is not yet
+        // set, the first operation encountered that differs from the
+        // reduction operation determines the reduced-value class.
+ if (I && (!ReducedValueData || OpData == ReducedValueData ||
+ OpData == ReductionData)) {
+ const bool IsReductionOperation = OpData == ReductionData;
+ // Only handle trees in the current basic block.
+ if (!ReductionData.hasSameParent(I, B->getParent(),
+ IsReductionOperation)) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+          // Each tree node needs to have the minimal number of users, except
+          // for the ultimate reduction.
+ if (!ReductionData.hasRequiredNumberOfUses(I,
+ OpData == ReductionData) &&
+ I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (IsReductionOperation) {
+ // We need to be able to reassociate the reduction operations.
+ if (!OpData.isAssociative(I)) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueData &&
+ ReducedValueData != OpData) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueData)
+ ReducedValueData = OpData;
+
+ Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
+ continue;
+ }
+ }
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
+ }
+ return true;
+ }
+
+ /// Attempt to vectorize the tree found by
+ /// matchAssociativeReduction.
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ if (ReducedVals.empty())
+ return false;
+
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. We can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < 4)
+ return false;
+
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
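+ // Worked example (illustrative): with NumReducedVals == 11 the loop below
+ // first vectorizes a chunk of 8 values; the remaining 3 round down to a
+ // width of 2, which fails the ReduxWidth > 2 check, so they are folded
+ // into the result one at a time by the scalar tail loop at the end.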
+
+ Value *VectorizedTree = nullptr;
+
+ // FIXME: Fast-math-flags should be set based on the instructions in the
+ // reduction (not all of 'fast' are required).
+ IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+ FastMathFlags Unsafe;
+ Unsafe.setFast();
+ Builder.setFastMathFlags(Unsafe);
+ unsigned i = 0;
+
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ // The same extra argument may be used several times, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs) {
+ assert(Pair.first && "DebugLoc must be set.");
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ }
+ // The reduction root is used as the insertion point for new instructions,
+ // so set it as externally used to prevent it from being deleted.
+ ExternallyUsedValues[ReductionRoot];
+ SmallVector<Value *, 16> IgnoreList;
+ for (auto &V : ReductionOps)
+ IgnoreList.append(V.begin(), V.end());
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == VL.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ break;
+ if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+ break;
+
+ V.computeMinimumValueSizes();
+
+ // Estimate cost.
+ int TreeCost = V.getTreeCost();
+ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int Cost = TreeCost + ReductionCost;
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost "
+ << ore::NV("Cost", Cost) << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
+ V.getORE()->emit([&]() {
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
+ });
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+
+ // Emit a reduction.
+ Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ if (VectorizedTree) {
+ Builder.SetCurrentDebugLocation(Loc);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, ReducedSubTree,
+ ReductionData.getKind());
+ VectorizedTree =
+ VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
+ } else
+ VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, I,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, Pair.first,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
+ }
+ }
+ // Update users.
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ // Mark all scalar reduction ops for deletion, they are replaced by the
+ // vector reductions.
+ V.eraseInstructions(IgnoreList);
+ }
+ return VectorizedTree != nullptr;
+ }
+
+ unsigned numReductionValues() const {
+ return ReducedVals.size();
+ }
+
+private:
+ /// Calculate the cost of a reduction.
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
+ Type *ScalarTy = FirstReducedVal->getType();
+ Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+ int PairwiseRdxCost;
+ int SplittingRdxCost;
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ PairwiseRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/true);
+ SplittingRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/false);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax: {
+ Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
+ ReductionData.getKind() == RK_UMax;
+ PairwiseRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/true, IsUnsigned);
+ SplittingRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/false, IsUnsigned);
+ break;
+ }
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+
+ IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+ int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+ int ScalarReduxCost = 0;
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ ScalarReduxCost =
+ TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ ScalarReduxCost =
+ TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+ ScalarReduxCost *= (ReduxWidth - 1);
+
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
+
+ return VecReduxCost - ScalarReduxCost;
+ }
+
+ /// Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, const TargetTransformInfo *TTI) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
+ if (!IsPairwiseReduction) {
+ // FIXME: The builder should use an FMF guard. It should not be hard-coded
+ // to 'fast'.
+ assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
+ ReductionData.getFlags(), ReductionOps.back());
+ }
+
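+ // Sketch of one pairwise step for ReduxWidth == 4 (the masks follow
+ // createRdxShuffleMask and are illustrative, not verbatim):
+ //   %l = shufflevector <4 x float> %v, undef, <0, 2, undef, undef>
+ //   %r = shufflevector <4 x float> %v, undef, <1, 3, undef, undef>
+ //   %v1 = fadd fast <4 x float> %l, %r
+ // A second step reduces lanes 0 and 1 of %v1, leaving the result in
+ // lane 0, where it is extracted below.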
+ Value *TmpVec = VectorizedValue;
+ for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+ Value *LeftMask =
+ createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+ Value *RightMask =
+ createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+ Value *LeftShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+ Value *RightShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
+ "rdx.shuf.r");
+ OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
+ RightShuf, ReductionData.getKind());
+ TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
+ }
+
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ }
+};
+
+} // end anonymous namespace
+
+/// Recognize construction of vectors like
+/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+/// starting from the last insertelement instruction.
+///
+/// \returns true if it matches.
+static bool findBuildVector(InsertElementInst *LastInsertElem,
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ UserCost = 0;
+ Value *V = nullptr;
+ do {
+ if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ LastInsertElem->getType(),
+ CI->getZExtValue());
+ }
+ BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
+ V = LastInsertElem->getOperand(0);
+ if (isa<UndefValue>(V))
+ break;
+ LastInsertElem = dyn_cast<InsertElementInst>(V);
+ if (!LastInsertElem || !LastInsertElem->hasOneUse())
+ return false;
+ } while (true);
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ return true;
+}
+
+/// Like findBuildVector, but looks for construction of an aggregate.
+///
+/// \return true if it matches.
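+///
+/// A matching chain looks like (illustrative, not from a real test):
+///   %a = insertvalue { float, float } undef, float %s0, 0
+///   %b = insertvalue { float, float } %a, float %s1, 1
+/// starting from the last insertvalue instruction (%b).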
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ do {
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ Value *V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
+ return false;
+ } while (true);
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ return true;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+/// Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
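+///
+/// For example (illustrative):
+///   loop:
+///     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
+///     ...
+///     %sum.next = fadd fast float %sum, %x
+/// the candidate returned for \p P = %sum is %sum.next, which flows in over
+/// the latch/backedge and is dominated by the phi's block.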
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+ // that. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If horizontal reduction is not found
+/// and root instruction is a binary operation, vectorization of the operands is
+/// attempted.
+/// \returns true if a horizontal reduction was matched and reduced or operands
+/// of one of the binary instruction were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible)
+/// or no vectorization of any binary operation feeding \a Root instruction was
+/// performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ if (!Root)
+ return false;
+
+ if (Root->getParent() != BB || isa<PHINode>(Root))
+ return false;
+ // Start the analysis from the Root instruction. If a horizontal reduction
+ // is found, try to vectorize it. If it is not a horizontal reduction, or
+ // vectorization is not possible or not effective, and the currently
+ // analyzed instruction is a binary operation, try to vectorize the
+ // operands, using a pre-order DFS traversal. If the operands were not
+ // vectorized, repeat the same procedure, considering each operand as a
+ // possible root of a horizontal reduction.
+ // Interrupt the process once the Root instruction itself has been
+ // vectorized or all sub-trees no deeper than RecursionMaxDepth have been
+ // analyzed/vectorized.
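+ // E.g. (illustrative): for a root such as
+ //   %s = fadd fast float %m1, %m2 ; %m1 = fmul ..., %m2 = fmul ...
+ // if no reduction is matched at %s, the two fmuls are pushed onto the
+ // stack and each is tried as an independent vectorization root.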
+ SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+ SmallPtrSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Instruction *Inst;
+ unsigned Level;
+ std::tie(Inst, Level) = Stack.pop_back_val();
+ auto *BI = dyn_cast<BinaryOperator>(Inst);
+ auto *SI = dyn_cast<SelectInst>(Inst);
+ if (BI || SI) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, Inst)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P && BI) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(Inst, R)) {
+ Res = true;
+ continue;
+ }
+
+ // Try to vectorize operands.
+ // Only continue the analysis for instructions from the same basic block,
+ // to save compile time.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ if (VisitedInstrs.insert(Op).second)
+ if (auto *I = dyn_cast<Instruction>(Op))
+ if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+ Stack.emplace_back(I, Level);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+ return tryToVectorize(I, R);
+ };
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+ BasicBlock *BB, BoUpSLP &R) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (!R.canMapToVector(IVI->getType(), DL))
+ return false;
+
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ // An aggregate value is unlikely to be processed in a vector register;
+ // its elements need to be extracted into scalar registers.
+ return tryToVectorizeList(BuildVectorOpds, R);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+ BasicBlock *BB, BoUpSLP &R) {
+ int UserCost;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
+ return false;
+
+ // Vectorize starting with the build vector operands ignoring the BuildVector
+ // instructions for the purpose of scheduling and user extraction.
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+ BoUpSLP &R) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+ return true;
+
+ bool OpsChanged = false;
+ for (int Idx = 0; Idx < 2; ++Idx) {
+ OpsChanged |=
+ vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+ }
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ bool OpsChanged = false;
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (auto *CI = dyn_cast<CmpInst>(I))
+ OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ }
+ Instructions.clear();
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ SmallVector<Value *, 4> Incoming;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (Instruction &I : *BB) {
+ PHINode *P = dyn_cast<PHINode>(&I);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+
+ // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
+ // The order in which the phi nodes appear in the program does not matter.
+ // So allow tryToVectorizeList to reorder them if it is beneficial. This
+ // is done when there are exactly two elements since tryToVectorizeList
+ // asserts that there are only two values when AllowReorder is true.
+ bool AllowReorder = NumElts == 2;
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+ /*UserCost=*/0, AllowReorder)) {
+ // Success: start over, because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ SmallVector<Instruction *, 8> PostProcessInstructions;
+ SmallDenseSet<Instruction *, 4> KeyNodes;
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip instructions marked for deletion.
+ if (R.isDeleted(&*it))
+ continue;
+ // We may go through BB multiple times, so skip the ones we have already
+ // checked.
+ if (!VisitedInstrs.insert(&*it).second) {
+ if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
+ if (P->getNumIncomingValues() != 2)
+ return Changed;
+
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ continue;
+ }
+
+ // We ran into an instruction without users, such as a terminator, a
+ // store, or a function call with an ignored return value. Ignore unused
+ // instructions (based on the instruction type, except for CallInst and
+ // InvokeInst).
+ if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+ isa<InvokeInst>(it))) {
+ KeyNodes.insert(&*it);
+ bool OpsChanged = false;
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ for (auto *V : it->operand_values()) {
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ }
+ }
+ // Start vectorization of the post-process list of instructions from the
+ // top-tree instructions, to try to vectorize as many instructions as
+ // possible.
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ if (OpsChanged) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
+ if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+ isa<InsertValueInst>(it))
+ PostProcessInstructions.push_back(&*it);
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // Process the GEP list in chunks suitable for the target's supported
+ // vector size. If a vector register can't hold 1 element, we are done.
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+ if (MaxVecRegSize < EltSize)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+ auto Len = std::min<unsigned>(BE - BI, MaxElts);
+ auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+
+ // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, they are marked as deleted, so remove
+ // them from the set of candidates.
+ Candidates.remove_if(
+ [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = GEPList[I];
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = GEPList[J];
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPI);
+ Candidates.remove(GEPJ);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPJ);
+ }
+ }
+ }
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+ assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+ // Try and vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
+
+ // Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
+ unsigned Len = std::min<unsigned>(CE - CI, 16);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
+ }
+ }
+ return Changed;
+}
+
+char SLPVectorizer::ID = 0;
+
+static const char lv_name[] = "SLP Vectorizer";
+
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
new file mode 100644
index 000000000000..0ca6a6b93cfd
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -0,0 +1,126 @@
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetTransformInfo;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+public:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
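+ /// E.g. (illustrative): for a block guarded by "if (c > 0)" inside the
+ /// loop, the entry mask is the predecessor's block mask AND-ed with the
+ /// widened compare, while the loop header's mask is all-true.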
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF
+ /// \p Range, \return true in the first returned value if so and false
+ /// otherwise. Build a new VPInterleaveGroup Recipe if \p I is the primary
+ /// member of an IG for \p Range.Start, and provide it as the second
+ /// returned value. Note that if \p I is an adjunct member of an IG for
+ /// \p Range.Start, the \return value is <true, nullptr>, as it is handled
+ /// by another recipe.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked. Such instructions are handled by a recipe that takes
+ /// an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
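+ /// E.g. (illustrative): a two-way phi
+ ///   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+ /// is widened to roughly "select(mask-of-edge-from-bb1, %a, %b)".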
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+ /// false otherwise. Range.End may be decreased to ensure same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, it adds it to \p VPBB.
+ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
+ VPBasicBlock *VPBB);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
new file mode 100644
index 000000000000..4b80d1fb20aa
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -0,0 +1,766 @@
+//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the LLVM vectorization plan. It represents a candidate for
+/// vectorization, allowing the vectorizer to plan and optimize how to
+/// vectorize a given loop before generating LLVM IR.
+/// The vectorizer uses vectorization plans to estimate the cost of potential
+/// candidates and, if one is deemed profitable, to execute the desired plan,
+/// generating vector LLVM-IR code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTreeConstruction.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+extern cl::opt<bool> EnableVPlanNativePath;
+
+#define DEBUG_TYPE "vplan"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
+ if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V))
+ Instr->print(OS);
+ else
+ V.printAsOperand(OS);
+ return OS;
+}
+
+/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
+ if (!Successors.empty() || !Parent)
+ return this;
+ assert(Parent->getExit() == this &&
+ "Block w/o successors not the exit of its parent.");
+ return Parent->getEnclosingBlockWithSuccessors();
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
+ if (!Predecessors.empty() || !Parent)
+ return this;
+ assert(Parent->getEntry() == this &&
+ "Block w/o predecessors not the entry of its parent.");
+ return Parent->getEnclosingBlockWithPredecessors();
+}
+
+void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
+ SmallVector<VPBlockBase *, 8> Blocks;
+ for (VPBlockBase *Block : depth_first(Entry))
+ Blocks.push_back(Block);
+
+ for (VPBlockBase *Block : Blocks)
+ delete Block;
+}
+
+BasicBlock *
+VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+ // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
+ // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
+ BasicBlock *PrevBB = CFG.PrevBB;
+ BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
+ PrevBB->getParent(), CFG.LastBB);
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+
+ // Hook up the new basic block to its predecessors.
+ for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
+ auto &PredVPSuccessors = PredVPBB->getSuccessors();
+ BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+
+ // In the outer loop vectorization scenario, the predecessor basic block
+ // may not have been visited yet (backedge). Mark the VPBasicBlock for
+ // fixup at the end of vectorization. We do not encounter this case in
+ // inner loop vectorization, as we start out by building a loop skeleton
+ // with the vector loop header and latch blocks. As a result, we never
+ // enter this function for the header block in the non-VPlan-native path.
+ if (!PredBB) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected null predecessor in non VPlan-native path");
+ CFG.VPBBsToFix.push_back(PredVPBB);
+ continue;
+ }
+
+ assert(PredBB && "Predecessor basic-block not found building successor.");
+ auto *PredBBTerminator = PredBB->getTerminator();
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ if (isa<UnreachableInst>(PredBBTerminator)) {
+ assert(PredVPSuccessors.size() == 1 &&
+ "Predecessor ending w/o branch must have single successor.");
+ PredBBTerminator->eraseFromParent();
+ BranchInst::Create(NewBB, PredBB);
+ } else {
+ assert(PredVPSuccessors.size() == 2 &&
+ "Predecessor ending with branch must have two successors.");
+ unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ assert(!PredBBTerminator->getSuccessor(idx) &&
+ "Trying to reset an existing successor block.");
+ PredBBTerminator->setSuccessor(idx, NewBB);
+ }
+ }
+ return NewBB;
+}
+
+void VPBasicBlock::execute(VPTransformState *State) {
+ bool Replica = State->Instance &&
+ !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+ VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
+ VPBlockBase *SingleHPred = nullptr;
+ BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+
+ // 1. Create an IR basic block, or reuse the last one if possible.
+ // The last IR basic block is reused, as an optimization, in three cases:
+ // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
+ // B. when the current VPBB has a single (hierarchical) predecessor which
+ // is PrevVPBB and the latter has a single (hierarchical) successor; and
+ // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+ // is the exit of this region from a previous instance, or the predecessor
+ // of this region.
+ if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
+ NewBB = createEmptyBasicBlock(State->CFG);
+ State->Builder.SetInsertPoint(NewBB);
+ // Temporarily terminate with unreachable until CFG is rewired.
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+ // Register NewBB in its loop. In innermost loops it's the same for all BBs.
+ Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
+ L->addBasicBlockToLoop(NewBB, *State->LI);
+ State->CFG.PrevBB = NewBB;
+ }
+
+ // 2. Fill the IR basic block with IR instructions.
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
+
+ State->CFG.VPBB2IRBB[this] = NewBB;
+ State->CFG.PrevVPBB = this;
+
+ for (VPRecipeBase &Recipe : Recipes)
+ Recipe.execute(*State);
+
+ VPValue *CBV;
+ if (EnableVPlanNativePath && (CBV = getCondBit())) {
+ Value *IRCBV = CBV->getUnderlyingValue();
+ assert(IRCBV && "Unexpected null underlying value for condition bit");
+
+ // Condition bit value in a VPBasicBlock is used as the branch selector. In
+ // the VPlan-native path case, since all branches are uniform we generate a
+ // branch instruction using the condition value from vector lane 0 and dummy
+ // successors. The successors are fixed later when the successor blocks are
+ // visited.
+ Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
+ NewCond = State->Builder.CreateExtractElement(NewCond,
+ State->Builder.getInt32(0));
+
+ // Replace the temporary unreachable terminator with the new conditional
+ // branch.
+ auto *CurrentTerminator = NewBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional "
+ "branch.");
+ auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+}
+
+void VPRegionBlock::execute(VPTransformState *State) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
+
+ if (!isReplicator()) {
+ // Visit the VPBlocks connected to "this", starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ if (EnableVPlanNativePath) {
+ // The inner loop vectorization path does not represent loop preheader
+ // and exit blocks as part of the VPlan. In the VPlan-native path, skip
+ // vectorizing loop preheader block. In future, we may replace this
+ // check with the check for loop preheader.
+ if (Block->getNumPredecessors() == 0)
+ continue;
+
+ // Skip vectorizing loop exit block. In future, we may replace this
+ // check with the check for loop exit.
+ if (Block->getNumSuccessors() == 0)
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ return;
+ }
+
+ assert(!State->Instance && "Replicating a Region with non-null instance.");
+
+ // Enter replicating mode.
+ State->Instance = {0, 0};
+
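+ // E.g. (illustrative): with UF == 2 and VF == 4 the region's blocks are
+ // executed once per (Part, Lane) pair, i.e. eight times in total.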
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
+ State->Instance->Part = Part;
+ for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) {
+ State->Instance->Lane = Lane;
+ // Visit the VPBlocks connected to \p this, starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ }
+ }
+
+ // Exit replicating mode.
+ State->Instance.reset();
+}
+
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+ InsertPos->getParent()->getRecipeList().splice(
+ std::next(InsertPos->getIterator()), getParent()->getRecipeList(),
+ getIterator());
+}
+
+void VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
+ IRBuilder<> &Builder = State.Builder;
+
+ if (Instruction::isBinaryOp(getOpcode())) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ State.set(this, V, Part);
+ return;
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *V = Builder.CreateNot(A);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ICmpULE: {
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateICmpULE(IV, TC);
+ State.set(this, V, Part);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Cond = State.get(getOperand(0), Part);
+ Value *Op1 = State.get(getOperand(1), Part);
+ Value *Op2 = State.get(getOperand(2), Part);
+ Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+ State.set(this, V, Part);
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode for instruction");
+ }
+}
+
+void VPInstruction::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ generateInstruction(State, Part);
+}
+
+void VPInstruction::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"EMIT ";
+ print(O);
+ O << "\\l\"";
+}
+
+void VPInstruction::print(raw_ostream &O) const {
+ printAsOperand(O);
+ O << " = ";
+
+ switch (getOpcode()) {
+ case VPInstruction::Not:
+ O << "not";
+ break;
+ case VPInstruction::ICmpULE:
+ O << "icmp ule";
+ break;
+ case VPInstruction::SLPLoad:
+ O << "combined load";
+ break;
+ case VPInstruction::SLPStore:
+ O << "combined store";
+ break;
+ default:
+ O << Instruction::getOpcodeName(getOpcode());
+ }
+
+ for (const VPValue *Operand : operands()) {
+ O << " ";
+ Operand->printAsOperand(O);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
+ // -1. Check if the backedge taken count is needed, and if so build it.
+ if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+ Value *TC = State->TripCount;
+ IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ "trip.count.minus.1");
+ Value2VPValue[TCMO] = BackedgeTakenCount;
+ }
+
+ // 0. Set the reverse mapping from VPValues to Values for code generation.
+ for (auto &Entry : Value2VPValue)
+ State->VPValue2Value[Entry.second] = Entry.first;
+
+ BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
+ BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
+ assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
+
+ // 1. Make room to generate basic-blocks inside loop body if needed.
+ BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
+ VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
+ Loop *L = State->LI->getLoopFor(VectorHeaderBB);
+ L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
+ // Remove the edge between Header and Latch to allow other connections.
+ // Temporarily terminate with unreachable until CFG is rewired.
+ // Note: this asserts the generated code's assumption that
+ // getFirstInsertionPt() can be dereferenced into an Instruction.
+ VectorHeaderBB->getTerminator()->eraseFromParent();
+ State->Builder.SetInsertPoint(VectorHeaderBB);
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+
+ // 2. Generate code in loop body.
+ State->CFG.PrevVPBB = nullptr;
+ State->CFG.PrevBB = VectorHeaderBB;
+ State->CFG.LastBB = VectorLatchBB;
+
+ for (VPBlockBase *Block : depth_first(Entry))
+ Block->execute(State);
+
+ // Setup branch terminator successors for VPBBs in VPBBsToFix based on
+ // VPBB's successors.
+ for (auto VPBB : State->CFG.VPBBsToFix) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected VPBBsToFix in non VPlan-native path");
+ BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
+ assert(BB && "Unexpected null basic block for VPBB");
+
+ unsigned Idx = 0;
+ auto *BBTerminator = BB->getTerminator();
+
+ for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
+ VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
+ BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
+ ++Idx;
+ }
+ }
+
+ // 3. Merge the temporary latch created with the last basic-block filled.
+ BasicBlock *LastBB = State->CFG.PrevBB;
+ // Connect LastBB to VectorLatchBB to facilitate their merge.
+ assert((EnableVPlanNativePath ||
+ isa<UnreachableInst>(LastBB->getTerminator())) &&
+ "Expected InnerLoop VPlan CFG to terminate with unreachable");
+ assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+ "Expected VPlan CFG to terminate with branch in NativePath");
+ LastBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(VectorLatchBB, LastBB);
+
+ // Merge LastBB with Latch.
+ bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
+ (void)Merged;
+ assert(Merged && "Could not merge last basic block with latch.");
+ VectorLatchBB = LastBB;
+
+ // We do not attempt to preserve DT for outer loop vectorization currently.
+ if (!EnableVPlanNativePath)
+ updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
+}
+
+void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB) {
+ BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
+ assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
+ DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB);
+ // The vector body may be more than a single basic-block by this point.
+ // Update the dominator tree information inside the vector body by propagating
+ // it from header to latch, expecting only triangular control-flow, if any.
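+ // An expected triangle looks like (illustrative):
+ //          BB
+ //         /  \
+ //        | InterimSucc
+ //         \  /
+ //      PostDomSucc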
+ BasicBlock *PostDomSucc = nullptr;
+ for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+ assert(Succs.size() <= 2 &&
+ "Basic block in vector loop has more than 2 successors.");
+ PostDomSucc = Succs[0];
+ if (Succs.size() == 1) {
+ assert(PostDomSucc->getSinglePredecessor() &&
+ "PostDom successor has more than one predecessor.");
+ DT->addNewBlock(PostDomSucc, BB);
+ continue;
+ }
+ BasicBlock *InterimSucc = Succs[1];
+ if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
+ PostDomSucc = Succs[1];
+ InterimSucc = Succs[0];
+ }
+ assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
+ "One successor of a basic block does not lead to the other.");
+ assert(InterimSucc->getSinglePredecessor() &&
+ "Interim successor has more than one predecessor.");
+ assert(PostDomSucc->hasNPredecessors(2) &&
+ "PostDom successor has more than two predecessors.");
+ DT->addNewBlock(InterimSucc, BB);
+ DT->addNewBlock(PostDomSucc, BB);
+ }
+}
+
+const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+ return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
+ Twine(getOrCreateBID(Block));
+}
+
+const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+ const std::string &Name = Block->getName();
+ if (!Name.empty())
+ return Name;
+ return "VPB" + Twine(getOrCreateBID(Block));
+}
+
+void VPlanPrinter::dump() {
+ Depth = 1;
+ bumpIndent(0);
+ OS << "digraph VPlan {\n";
+ OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
+ if (!Plan.getName().empty())
+ OS << "\\n" << DOT::EscapeString(Plan.getName());
+ if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
+ OS << ", where:";
+ if (Plan.BackedgeTakenCount)
+ OS << "\\n"
+ << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount";
+ for (auto Entry : Plan.Value2VPValue) {
+ OS << "\\n" << *Entry.second;
+ OS << DOT::EscapeString(" := ");
+ Entry.first->printAsOperand(OS, false);
+ }
+ }
+ OS << "\"]\n";
+ OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
+ OS << "edge [fontname=Courier, fontsize=30]\n";
+ OS << "compound=true\n";
+
+ for (VPBlockBase *Block : depth_first(Plan.getEntry()))
+ dumpBlock(Block);
+
+ OS << "}\n";
+}
+
+void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
+ if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
+ dumpBasicBlock(BasicBlock);
+ else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ dumpRegion(Region);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
+ bool Hidden, const Twine &Label) {
+ // Due to "dot" we print an edge between two regions as an edge between the
+ // exit basic block and the entry basic of the respective regions.
+ const VPBlockBase *Tail = From->getExitBasicBlock();
+ const VPBlockBase *Head = To->getEntryBasicBlock();
+ OS << Indent << getUID(Tail) << " -> " << getUID(Head);
+ OS << " [ label=\"" << Label << '\"';
+ if (Tail != From)
+ OS << " ltail=" << getUID(From);
+ if (Head != To)
+ OS << " lhead=" << getUID(To);
+ if (Hidden)
+ OS << "; splines=none";
+ OS << "]\n";
+}
+
+void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
+ auto &Successors = Block->getSuccessors();
+ if (Successors.size() == 1)
+ drawEdge(Block, Successors.front(), false, "");
+ else if (Successors.size() == 2) {
+ drawEdge(Block, Successors.front(), false, "T");
+ drawEdge(Block, Successors.back(), false, "F");
+ } else {
+ unsigned SuccessorNumber = 0;
+ for (auto *Successor : Successors)
+ drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
+ }
+}
+
+void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
+ OS << Indent << getUID(BasicBlock) << " [label =\n";
+ bumpIndent(1);
+ OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
+ bumpIndent(1);
+
+ // Dump the block predicate.
+ const VPValue *Pred = BasicBlock->getPredicate();
+ if (Pred) {
+ OS << " +\n" << Indent << " \"BlockPredicate: ";
+ if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
+ PredI->printAsOperand(OS);
+ OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
+ << ")\\l\"";
+ } else
+ Pred->printAsOperand(OS);
+ }
+
+ for (const VPRecipeBase &Recipe : *BasicBlock)
+ Recipe.print(OS, Indent);
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else {
+ CBV->printAsOperand(OS);
+ OS << "\"";
+ }
+ }
+
+ bumpIndent(-2);
+ OS << "\n" << Indent << "]\n";
+ dumpEdges(BasicBlock);
+}
+
+void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
+ OS << Indent << "subgraph " << getUID(Region) << " {\n";
+ bumpIndent(1);
+ OS << Indent << "fontname=Courier\n"
+ << Indent << "label=\""
+ << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
+ << DOT::EscapeString(Region->getName()) << "\"\n";
+ // Dump the blocks of the region.
+ assert(Region->getEntry() && "Region contains no inner blocks.");
+ for (const VPBlockBase *Block : depth_first(Region->getEntry()))
+ dumpBlock(Block);
+ bumpIndent(-1);
+ OS << Indent << "}\n";
+ dumpEdges(Region);
+}
+
+void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
+ std::string IngredientString;
+ raw_string_ostream RSO(IngredientString);
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ if (!Inst->getType()->isVoidTy()) {
+ Inst->printAsOperand(RSO, false);
+ RSO << " = ";
+ }
+ RSO << Inst->getOpcodeName() << " ";
+ unsigned E = Inst->getNumOperands();
+ if (E > 0) {
+ Inst->getOperand(0)->printAsOperand(RSO, false);
+ for (unsigned I = 1; I < E; ++I)
+ Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
+ }
+ } else // !Inst
+ V->printAsOperand(RSO, false);
+ RSO.flush();
+ O << DOT::EscapeString(IngredientString);
+}
+
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN\\l\"";
+ for (auto &Instr : make_range(Begin, End))
+ O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\"";
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O,
+ const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN-INDUCTION";
+ if (Trunc) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\"";
+ } else
+ O << " " << VPlanIngredient(IV) << "\\l\"";
+}
+
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
+}
+
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"BLEND ";
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (!User) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ Phi->getIncomingValue(0)->printAsOperand(O, false);
+ } else {
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
+ O << " ";
+ Phi->getIncomingValue(I)->printAsOperand(O, false);
+ O << "/";
+ User->getOperand(I)->printAsOperand(O);
+ }
+ }
+ O << "\\l\"";
+}
+
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
+ << VPlanIngredient(Ingredient);
+ if (AlsoPack)
+ O << " (S->V)";
+ O << "\\l\"";
+}
+
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
+ << "\\l\"";
+}
+
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
+ const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+ if (User) {
+ O << ", ";
+ User->getOperand(0)->printAsOperand(O);
+ }
+ O << "\\l\"";
+}
+
+template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
+
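+// Usage sketch (illustrative, not part of this patch; Old and New are assumed
+// pre-existing VPValues): every VPUser operand equal to Old is rewired to New:
+//   Old->replaceAllUsesWith(New);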
+void VPValue::replaceAllUsesWith(VPValue *New) {
+ for (VPUser *User : users())
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+ if (User->getOperand(I) == this)
+ User->setOperand(I, New);
+}
+
+void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
+ Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ visitBlock(Base, Old2New, IAI);
+ }
+}
+
+void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
+ for (VPRecipeBase &VPI : *VPBB) {
+ assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
+ auto *VPInst = cast<VPInstruction>(&VPI);
+ auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ auto *IG = IAI.getInterleaveGroup(Inst);
+ if (!IG)
+ continue;
+
+ auto NewIGIter = Old2New.find(IG);
+ if (NewIGIter == Old2New.end())
+ Old2New[IG] = new InterleaveGroup<VPInstruction>(
+ IG->getFactor(), IG->isReverse(), Align(IG->getAlignment()));
+
+ if (Inst == IG->getInsertPos())
+ Old2New[IG]->setInsertPos(VPInst);
+
+ InterleaveGroupMap[VPInst] = Old2New[IG];
+ InterleaveGroupMap[VPInst]->insertMember(
+ VPInst, IG->getIndex(Inst),
+ Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+ : IG->getFactor()));
+ }
+ } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ visitRegion(Region, Old2New, IAI);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
+ InterleavedAccessInfo &IAI) {
+ Old2NewTy Old2New;
+ visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
new file mode 100644
index 000000000000..44d8a198f27e
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -0,0 +1,1692 @@
+//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations of the Vectorization Plan base classes:
+/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
+/// VPBlockBase, together implementing a Hierarchical CFG;
+/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
+/// treated as proper graphs for generic algorithms;
+/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
+/// within VPBasicBlocks;
+/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
+/// instruction;
+/// 5. The VPlan class holding a candidate for vectorization;
+/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
+/// These are documented in docs/VectorizationPlan.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+#include "VPlanLoopInfo.h"
+#include "VPlanValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <map>
+#include <string>
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class BasicBlock;
+class DominatorTree;
+class InnerLoopVectorizer;
+template <class T> class InterleaveGroup;
+class LoopInfo;
+class raw_ostream;
+class Value;
+class VPBasicBlock;
+class VPRegionBlock;
+class VPlan;
+class VPlanSlp;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
+ const unsigned Start;
+
+  // Need not be a power of 2. If End <= Start, the range is empty.
+ unsigned End;
+};
+
+using VPlanPtr = std::unique_ptr<VPlan>;
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer, whereas the term "output IR" refers to code that is generated
+/// by the vectorizer.
+
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+ /// in [0..UF)
+ unsigned Part;
+
+ /// in [0..VF)
+ unsigned Lane;
+};
+
+/// This is a helper struct for maintaining vectorization state. It's used for
+/// mapping values from the original loop to their corresponding values in
+/// the new loop. Two mappings are maintained: one for vectorized values and
+/// one for scalarized values. Vectorized values are represented with UF
+/// vector values in the new loop, and scalarized values are represented with
+/// UF x VF scalar values in the new loop. UF and VF are the unroll and
+/// vectorization factors, respectively.
+///
+/// Entries can be added to either map with setVectorValue and setScalarValue,
+/// which assert that an entry was not already added before. If an entry is to
+/// replace an existing one, call resetVectorValue and resetScalarValue. This is
+/// currently needed to modify the mapped values during "fix-up" operations that
+/// occur once the first phase of widening is complete. These operations include
+/// type truncation and the second phase of recurrence widening.
+///
+/// Entries from either map can be retrieved using the getVectorValue and
+/// getScalarValue functions, which assert that the desired value exists.
+struct VectorizerValueMap {
+ friend struct VPTransformState;
+
+private:
+ /// The unroll factor. Each entry in the vector map contains UF vector values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
+ unsigned VF;
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ using VectorParts = SmallVector<Value *, 2>;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+
+public:
+ /// Construct an empty map with the given unroll and vectorization factors.
+ VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key and \p Instance.
+ bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
+ assert(Instance.Part < UF && "Queried Scalar Part is too large.");
+ assert(Instance.Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Instance.Part].size() == VF &&
+ "ScalarParts has wrong dimensions.");
+ return Entry[Instance.Part][Instance.Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key and
+ /// \p Instance.
+ Value *getScalarValue(Value *Key, const VPIteration &Instance) {
+ assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
+ /// value is not already set.
+ void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
+ assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
+ /// This function can be used to update values that have already been
+ /// scalarized. This is the case for "fix-up" operations including scalar phi
+ /// nodes for scalarized and predicated instructions.
+ void resetScalarValue(Value *Key, const VPIteration &Instance,
+ Value *Scalar) {
+ assert(hasScalarValue(Key, Instance) &&
+ "Scalar value not set for part and lane");
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+};
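+
+// Usage sketch (illustrative, not part of this patch; Key, Widened0, Widened1
+// and Scalar03 are assumed pre-existing Values): with UF = 2 and VF = 4, each
+// key owns two vector parts and 2 x 4 scalar lanes.
+//   VectorizerValueMap Map(/*UF=*/2, /*VF=*/4);
+//   Map.setVectorValue(Key, 0, Widened0);
+//   Map.setVectorValue(Key, 1, Widened1);
+//   Value *V = Map.getVectorValue(Key, 1);  // Returns Widened1.
+//   Map.setScalarValue(Key, {/*Part=*/0, /*Lane=*/3}, Scalar03);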
+
+/// This class is used to enable the VPlan to invoke a method of ILV. This is
+/// needed until the method is refactored out of ILV and becomes reusable.
+struct VPCallback {
+ virtual ~VPCallback() {}
+ virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
+};
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
+ VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT,
+ IRBuilder<> &Builder, VectorizerValueMap &ValueMap,
+ InnerLoopVectorizer *ILV, VPCallback &Callback)
+ : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder),
+ ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
+
+ /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
+ unsigned VF;
+ unsigned UF;
+
+  /// Hold the indices to generate specific scalar instructions. When not set,
+  /// all instances are to be generated, using either scalar or vector
+  /// instructions.
+ Optional<VPIteration> Instance;
+
+ struct DataState {
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in
+ /// the new unrolled loop, where UF is the unroll factor.
+ typedef SmallVector<Value *, 2> PerPartValuesTy;
+
+ DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
+ } Data;
+
+ /// Get the generated Value for a given VPValue and a given Part. Note that
+ /// as some Defs are still created by ILV and managed in its ValueMap, this
+ /// method will delegate the call to ILV in such cases in order to provide
+ /// callers a consistent API.
+ /// \see set.
+ Value *get(VPValue *Def, unsigned Part) {
+    // If Values have been set for this Def, return the one for \p Part.
+ if (Data.PerPartOutput.count(Def))
+ return Data.PerPartOutput[Def][Part];
+ // Def is managed by ILV: bring the Values from ValueMap.
+ return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+ }
+
+ /// Set the generated Value for a given VPValue and a given Part.
+ void set(VPValue *Def, Value *V, unsigned Part) {
+ if (!Data.PerPartOutput.count(Def)) {
+ DataState::PerPartValuesTy Entry(UF);
+ Data.PerPartOutput[Def] = Entry;
+ }
+ Data.PerPartOutput[Def][Part] = V;
+ }
+
+ /// Hold state information used when constructing the CFG of the output IR,
+ /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+ struct CFGState {
+ /// The previous VPBasicBlock visited. Initially set to null.
+ VPBasicBlock *PrevVPBB = nullptr;
+
+ /// The previous IR BasicBlock created or used. Initially set to the new
+ /// header BasicBlock.
+ BasicBlock *PrevBB = nullptr;
+
+ /// The last IR BasicBlock in the output IR. Set to the new latch
+ /// BasicBlock, used for placing the newly created BasicBlocks.
+ BasicBlock *LastBB = nullptr;
+
+ /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+ /// of replication, maps the BasicBlock of the last replica created.
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+ /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
+ /// up at the end of vector code generation.
+ SmallVector<VPBasicBlock *, 8> VPBBsToFix;
+
+ CFGState() = default;
+ } CFG;
+
+ /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+ LoopInfo *LI;
+
+ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
+ DominatorTree *DT;
+
+ /// Hold a reference to the IRBuilder used to generate output IR code.
+ IRBuilder<> &Builder;
+
+ /// Hold a reference to the Value state information used when generating the
+ /// Values of the output IR.
+ VectorizerValueMap &ValueMap;
+
+ /// Hold a reference to a mapping between VPValues in VPlan and original
+ /// Values they correspond to.
+ VPValue2ValueTy VPValue2Value;
+
+ /// Hold the trip count of the scalar loop.
+ Value *TripCount = nullptr;
+
+ /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+ InnerLoopVectorizer *ILV;
+
+ VPCallback &Callback;
+};
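+
+// Usage sketch (illustrative, not part of this patch; Def, A and B are assumed
+// pre-existing values): a recipe's execute() typically emits one IR value per
+// unroll part and records it in the state:
+//   for (unsigned Part = 0; Part < State.UF; ++Part) {
+//     Value *V = State.Builder.CreateAdd(A, B);  // Generate IR for this part.
+//     State.set(Def, V, Part);  // Later retrieved via State.get(Def, Part).
+//   }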
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+ friend class VPBlockUtils;
+
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// An optional name for the block.
+ std::string Name;
+
+ /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+ /// it is a topmost VPBlockBase.
+ VPRegionBlock *Parent = nullptr;
+
+ /// List of predecessor blocks.
+ SmallVector<VPBlockBase *, 1> Predecessors;
+
+ /// List of successor blocks.
+ SmallVector<VPBlockBase *, 1> Successors;
+
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
+ /// Current block predicate - null if the block does not need a predicate.
+ VPValue *Predicate = nullptr;
+
+ /// Add \p Successor as the last successor to this block.
+ void appendSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Cannot add nullptr successor!");
+ Successors.push_back(Successor);
+ }
+
+ /// Add \p Predecessor as the last predecessor to this block.
+ void appendPredecessor(VPBlockBase *Predecessor) {
+ assert(Predecessor && "Cannot add nullptr predecessor!");
+ Predecessors.push_back(Predecessor);
+ }
+
+ /// Remove \p Predecessor from the predecessors of this block.
+ void removePredecessor(VPBlockBase *Predecessor) {
+ auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor);
+    assert(Pos != Predecessors.end() && "Predecessor does not exist");
+ Predecessors.erase(Pos);
+ }
+
+ /// Remove \p Successor from the successors of this block.
+ void removeSuccessor(VPBlockBase *Successor) {
+ auto Pos = std::find(Successors.begin(), Successors.end(), Successor);
+    assert(Pos != Successors.end() && "Successor does not exist");
+ Successors.erase(Pos);
+ }
+
+protected:
+ VPBlockBase(const unsigned char SC, const std::string &N)
+ : SubclassID(SC), Name(N) {}
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPBlockBase
+ /// that are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPBlockBase objects. They are used for concrete
+ /// type identification.
+ using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
+
+ using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
+
+ virtual ~VPBlockBase() = default;
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPBlockID() const { return SubclassID; }
+
+ VPRegionBlock *getParent() { return Parent; }
+ const VPRegionBlock *getParent() const { return Parent; }
+
+ void setParent(VPRegionBlock *P) { Parent = P; }
+
+ /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const;
+ VPBasicBlock *getEntryBasicBlock();
+
+ /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getExitBasicBlock() const;
+ VPBasicBlock *getExitBasicBlock();
+
+ const VPBlocksTy &getSuccessors() const { return Successors; }
+ VPBlocksTy &getSuccessors() { return Successors; }
+
+ const VPBlocksTy &getPredecessors() const { return Predecessors; }
+ VPBlocksTy &getPredecessors() { return Predecessors; }
+
+ /// \return the successor of this VPBlockBase if it has a single successor.
+ /// Otherwise return a null pointer.
+ VPBlockBase *getSingleSuccessor() const {
+ return (Successors.size() == 1 ? *Successors.begin() : nullptr);
+ }
+
+ /// \return the predecessor of this VPBlockBase if it has a single
+ /// predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSinglePredecessor() const {
+ return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
+ }
+
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
+ /// An Enclosing Block of a block B is any block containing B, including B
+ /// itself. \return the closest enclosing block starting from "this", which
+ /// has successors. \return the root enclosing block if all enclosing blocks
+ /// have no successors.
+ VPBlockBase *getEnclosingBlockWithSuccessors();
+
+ /// \return the closest enclosing block starting from "this", which has
+ /// predecessors. \return the root enclosing block if all enclosing blocks
+ /// have no predecessors.
+ VPBlockBase *getEnclosingBlockWithPredecessors();
+
+ /// \return the successors either attached directly to this VPBlockBase or, if
+ /// this VPBlockBase is the exit block of a VPRegionBlock and has no
+ /// successors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has successors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) successors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalSuccessors() {
+ return getEnclosingBlockWithSuccessors()->getSuccessors();
+ }
+
+ /// \return the hierarchical successor of this VPBlockBase if it has a single
+ /// hierarchical successor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalSuccessor() {
+ return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
+ }
+
+ /// \return the predecessors either attached directly to this VPBlockBase or,
+ /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
+ /// predecessors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has predecessors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) predecessors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalPredecessors() {
+ return getEnclosingBlockWithPredecessors()->getPredecessors();
+ }
+
+ /// \return the hierarchical predecessor of this VPBlockBase if it has a
+ /// single hierarchical predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalPredecessor() {
+ return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
+ }
+
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ VPValue *getPredicate() { return Predicate; }
+
+ const VPValue *getPredicate() const { return Predicate; }
+
+ void setPredicate(VPValue *Pred) { Predicate = Pred; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
+ assert(Successors.empty() && "Setting one successor when others exist.");
+ appendSuccessor(Successor);
+ }
+
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
+ assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
+ appendSuccessor(IfTrue);
+ appendSuccessor(IfFalse);
+ }
+
+ /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBasicBlock in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
+ }
+
+  /// Remove all the predecessors of this block.
+ void clearPredecessors() { Predecessors.clear(); }
+
+  /// Remove all the successors of this block and set its condition bit to
+  /// null.
+ void clearSuccessors() {
+ Successors.clear();
+ CondBit = nullptr;
+ }
+
+  /// The method which generates the output IR that corresponds to this
+ /// VPBlockBase, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState *State) = 0;
+
+ /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+ static void deleteCFG(VPBlockBase *Entry);
+
+ void printAsOperand(raw_ostream &OS, bool PrintType) const {
+ OS << getName();
+ }
+
+ void print(raw_ostream &OS) const {
+ // TODO: Only printing VPBB name for now since we only have dot printing
+ // support for VPInstructions/Recipes.
+ printAsOperand(OS, false);
+ }
+
+ /// Return true if it is legal to hoist instructions into this block.
+ bool isLegalToHoistInto() {
+ // There are currently no constraints that prevent an instruction to be
+ // hoisted into a VPBlockBase.
+ return true;
+ }
+};
+
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+/// instructions.
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
+ friend VPBasicBlock;
+
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// Each VPRecipe belongs to a single VPBasicBlock.
+ VPBasicBlock *Parent = nullptr;
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
+ /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPRecipeBase objects. They are used for concrete
+ /// type identification.
+ using VPRecipeTy = enum {
+ VPBlendSC,
+ VPBranchOnMaskSC,
+ VPInstructionSC,
+ VPInterleaveSC,
+ VPPredInstPHISC,
+ VPReplicateSC,
+ VPWidenIntOrFpInductionSC,
+ VPWidenMemoryInstructionSC,
+ VPWidenPHISC,
+ VPWidenSC,
+ };
+
+ VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
+ virtual ~VPRecipeBase() = default;
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPRecipeID() const { return SubclassID; }
+
+ /// \return the VPBasicBlock which this VPRecipe belongs to.
+ VPBasicBlock *getParent() { return Parent; }
+ const VPBasicBlock *getParent() const { return Parent; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRecipe, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState &State) = 0;
+
+ /// Each recipe prints itself.
+ virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// Unlink this recipe from its current VPBasicBlock and insert it into
+ /// the VPBasicBlock that MovePos lives in, right after MovePos.
+ void moveAfter(VPRecipeBase *MovePos);
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
+};
+
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// While, as with any Recipe, it may generate a sequence of IR instructions
+/// when executed, these instructions always form a single-def expression, as
+/// the VPInstruction is itself a single def-use vertex.
+class VPInstruction : public VPUser, public VPRecipeBase {
+ friend class VPlanHCFGTransforms;
+ friend class VPlanSlp;
+
+public:
+  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+ enum {
+ Not = Instruction::OtherOpsEnd + 1,
+ ICmpULE,
+ SLPLoad,
+ SLPStore,
+ };
+
+private:
+ typedef unsigned char OpcodeTy;
+ OpcodeTy Opcode;
+
+ /// Utility method serving execute(): generates a single instance of the
+ /// modeled instruction.
+ void generateInstruction(VPTransformState &State, unsigned Part);
+
+protected:
+ Instruction *getUnderlyingInstr() {
+ return cast_or_null<Instruction>(getUnderlyingValue());
+ }
+
+ void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
+public:
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+ : VPUser(VPValue::VPInstructionSC, Operands),
+ VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPInstructionSC;
+ }
+
+ VPInstruction *clone() const {
+ SmallVector<VPValue *, 2> Operands(operands());
+ return new VPInstruction(Opcode, Operands);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
+ }
+
+ unsigned getOpcode() const { return Opcode; }
+
+ /// Generate the instruction.
+ /// TODO: We currently execute only per-part unless a specific instance is
+ /// provided.
+ void execute(VPTransformState &State) override;
+
+ /// Print the Recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+
+ /// Print the VPInstruction.
+ void print(raw_ostream &O) const;
+
+ /// Return true if this instruction may modify memory.
+ bool mayWriteToMemory() const {
+ // TODO: we can use attributes of the called function to rule out memory
+ // modifications.
+ return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+ Opcode == Instruction::Invoke || Opcode == SLPStore;
+ }
+};
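+
+// Construction sketch (illustrative, not part of this patch; A and B are
+// assumed pre-existing VPValues): VPInstruction reuses LLVM IR opcodes and
+// adds VPlan-specific ones such as ICmpULE:
+//   VPInstruction *Add = new VPInstruction(Instruction::Add, {A, B});
+//   VPInstruction *Cmp = new VPInstruction(VPInstruction::ICmpULE, {Add, B});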
+
+/// VPWidenRecipe is a recipe for producing a copy of vector type for each
+/// Instruction in its ingredients independently, in order. This recipe covers
+/// most of the traditional vectorization cases where each ingredient transforms
+/// into a vectorized version of itself.
+class VPWidenRecipe : public VPRecipeBase {
+private:
+ /// Hold the ingredients by pointing to their original BasicBlock location.
+ BasicBlock::iterator Begin;
+ BasicBlock::iterator End;
+
+public:
+  VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
+    // Cover the single initial ingredient, i.e., the half-open range
+    // [I, next(I)).
+    End = I->getIterator();
+    Begin = End++;
+  }
+
+ ~VPWidenRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenSC;
+ }
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override;
+
+ /// Augment the recipe to include Instr, if it lies at its End.
+ bool appendInstruction(Instruction *Instr) {
+ if (End != Instr->getIterator())
+ return false;
+ End++;
+ return true;
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
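+
+// Usage sketch (illustrative, not part of this patch; I and J are assumed
+// adjacent Instructions in one BasicBlock): consecutive widenable ingredients
+// fold into a single recipe:
+//   VPWidenRecipe *R = new VPWidenRecipe(I);  // Covers [I, next(I)).
+//   bool Folded = R->appendInstruction(J);    // True iff J directly follows I.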
+
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector and scalar values.
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+private:
+ PHINode *IV;
+ TruncInst *Trunc;
+
+public:
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {}
+ ~VPWidenIntOrFpInductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
+ }
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for handling all phi nodes except for integer and FP inductions.
+class VPWidenPHIRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+public:
+ VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
+class VPBlendRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+ /// The blend operation is a User of a mask, if not null.
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks)
+ : VPRecipeBase(VPBlendSC), Phi(Phi) {
+ assert((Phi->getNumIncomingValues() == 1 ||
+ Phi->getNumIncomingValues() == Masks.size()) &&
+ "Expected the same number of incoming values and masks");
+ if (!Masks.empty())
+ User.reset(new VPUser(Masks));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
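+
+// Construction sketch (illustrative, not part of this patch; Phi is an assumed
+// PHINode with two incoming values and M0, M1 the VPValue masks guarding
+// them):
+//   VPBlendRecipe *Blend = new VPBlendRecipe(Phi, {M0, M1});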
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of loads
+/// or stores into one wide load/store and shuffles.
+class VPInterleaveRecipe : public VPRecipeBase {
+private:
+ const InterleaveGroup<Instruction> *IG;
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask)
+ : VPRecipeBase(VPInterleaveSC), IG(IG) {
+    if (Mask) // Create a VPUser to register as a user of the mask.
+ User.reset(new VPUser({Mask}));
+ }
+ ~VPInterleaveRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
+ }
+
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+};
+
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to
+/// be uniform, only one copy, for lane zero, will be generated.
+class VPReplicateRecipe : public VPRecipeBase {
+private:
+ /// The instruction being replicated.
+ Instruction *Ingredient;
+
+ /// Indicator if only a single replica per lane is needed.
+ bool IsUniform;
+
+ /// Indicator if the replicas are also predicated.
+ bool IsPredicated;
+
+ /// Indicator if the scalar values should also be packed into a vector.
+ bool AlsoPack;
+
+public:
+ VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
+ : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
+ IsPredicated(IsPredicated) {
+ // Retain the previous behavior of predicateInstructions(), where an
+ // insert-element of a predicated instruction got hoisted into the
+ // predicated basic block iff it was its only user. This is achieved by
+ // having predicated instructions also pack their values into a vector by
+ // default unless they have a replicated user which uses their scalar value.
+ AlsoPack = IsPredicated && !I->use_empty();
+ }
+
+ ~VPReplicateRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
+ }
+
+ /// Generate replicas of the desired Ingredient. Replicas will be generated
+ /// for all parts and lanes unless a specific part and lane are specified in
+ /// the \p State.
+ void execute(VPTransformState &State) override;
+
+ void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for generating conditional branches on the bits of a mask.
+class VPBranchOnMaskRecipe : public VPRecipeBase {
+private:
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+ if (BlockInMask) // nullptr means all-one mask.
+ User.reset(new VPUser({BlockInMask}));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
+ }
+
+ /// Generate the extraction of the appropriate bit from the block mask and the
+ /// conditional branch.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+ if (User)
+ O << *User->getOperand(0);
+ else
+ O << " All-One";
+ O << "\\l\"";
+ }
+};
+
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
+class VPPredInstPHIRecipe : public VPRecipeBase {
+private:
+ Instruction *PredInst;
+
+public:
+  /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
+ /// nodes after merging back from a Branch-on-Mask.
+ VPPredInstPHIRecipe(Instruction *PredInst)
+ : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+ ~VPPredInstPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+ }
+
+ /// Generates phi nodes for live-outs as needed to retain SSA form.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A Recipe for widening load/store operations.
+/// TODO: We currently execute only per-part unless a specific instance is
+/// provided.
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+private:
+ Instruction &Instr;
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {
+    if (Mask) // Create a VPUser to register as a user of the mask.
+ User.reset(new VPUser({Mask}));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+ }
+
+ /// Generate the wide load/store.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipe's, each representing a sequence
+/// of output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+ using RecipeListTy = iplist<VPRecipeBase>;
+
+private:
+ /// The VPRecipes held in the order of output instructions to generate.
+ RecipeListTy Recipes;
+
+public:
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+ : VPBlockBase(VPBasicBlockSC, Name.str()) {
+ if (Recipe)
+ appendRecipe(Recipe);
+ }
+
+ ~VPBasicBlock() override { Recipes.clear(); }
+
+ /// Instruction iterators...
+ using iterator = RecipeListTy::iterator;
+ using const_iterator = RecipeListTy::const_iterator;
+ using reverse_iterator = RecipeListTy::reverse_iterator;
+ using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
+
+ //===--------------------------------------------------------------------===//
+ /// Recipe iterator methods
+ ///
+ inline iterator begin() { return Recipes.begin(); }
+ inline const_iterator begin() const { return Recipes.begin(); }
+ inline iterator end() { return Recipes.end(); }
+ inline const_iterator end() const { return Recipes.end(); }
+
+ inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+ inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+ inline reverse_iterator rend() { return Recipes.rend(); }
+ inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+ inline size_t size() const { return Recipes.size(); }
+ inline bool empty() const { return Recipes.empty(); }
+ inline const VPRecipeBase &front() const { return Recipes.front(); }
+ inline VPRecipeBase &front() { return Recipes.front(); }
+ inline const VPRecipeBase &back() const { return Recipes.back(); }
+ inline VPRecipeBase &back() { return Recipes.back(); }
+
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
+ static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+ return &VPBasicBlock::Recipes;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+ }
+
+ void insert(VPRecipeBase *Recipe, iterator InsertPt) {
+ assert(Recipe && "No recipe to append.");
+ assert(!Recipe->Parent && "Recipe already in VPlan");
+ Recipe->Parent = this;
+ Recipes.insert(InsertPt, Recipe);
+ }
+
+ /// Augment the existing recipes of a VPBasicBlock with an additional
+ /// \p Recipe as the last recipe.
+ void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPBasicBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+
+private:
+ /// Create an IR BasicBlock to hold the output instructions generated by this
+ /// VPBasicBlock, and return it. Update the CFGState accordingly.
+ BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
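+
+// Usage sketch (illustrative, not part of this patch; Phi, A and B are assumed
+// pre-existing inputs): recipes are appended in the order their output IR is
+// to be generated:
+//   VPBasicBlock *VPBB = new VPBasicBlock("body");
+//   VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+//   VPBB->appendRecipe(new VPInstruction(Instruction::Add, {A, B}));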
+
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VF's. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+private:
+ /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Entry;
+
+ /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Exit;
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool IsReplicator;
+
+public:
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ IsReplicator(IsReplicator) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exit->setParent(this);
+ }
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
+
+ ~VPRegionBlock() override {
+ if (Entry)
+ deleteCFG(Entry);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+ }
+
+ const VPBlockBase *getEntry() const { return Entry; }
+ VPBlockBase *getEntry() { return Entry; }
+
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
+ // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
+ // specific interface of llvm::Function, instead of using
+  // GraphTraits::getEntryNode. We should add a new template parameter to
+ // DominatorTreeBase representing the Graph type.
+ VPBlockBase &front() const { return *Entry; }
+
+ const VPBlockBase *getExit() const { return Exit; }
+ VPBlockBase *getExit() { return Exit; }
+
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool isReplicator() const { return IsReplicator; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRegionBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+};
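+
+// Construction sketch (illustrative, not part of this patch; Entry and Exit
+// are assumed disconnected VPBlockBases): a replicating region has its output
+// IR emitted once per lane and part:
+//   auto *R = new VPRegionBlock(Entry, Exit, "pred.region",
+//                               /*IsReplicator=*/true);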
+
+/// VPlan models a candidate for vectorization, encoding various decisions
+/// taken to produce efficient output IR, including which branches,
+/// basic-blocks and output IR instructions to generate, and their cost. VPlan
+/// holds a Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an
+/// Entry VPBlock.
+class VPlan {
+ friend class VPlanPrinter;
+
+private:
+ /// Hold the single entry to the Hierarchical CFG of the VPlan.
+ VPBlockBase *Entry;
+
+ /// Holds the VFs applicable to this VPlan.
+ SmallSet<unsigned, 2> VFs;
+
+ /// Holds the name of the VPlan, for printing.
+ std::string Name;
+
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+  // VPlan. External definitions must be immutable and hold a pointer to their
+  // underlying IR that will be used to implement their structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
+ /// Represents the backedge taken count of the original loop, for folding
+ /// the tail.
+ VPValue *BackedgeTakenCount = nullptr;
+
+ /// Holds a mapping between Values and their corresponding VPValue inside
+ /// VPlan.
+ Value2VPValueTy Value2VPValue;
+
+ /// Holds the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo VPLInfo;
+
+  /// Holds the condition bit values built during VPInstruction to VPRecipe
+  /// transformation.
+ SmallVector<VPValue *, 4> VPCBVs;
+
+public:
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+
+ ~VPlan() {
+ if (Entry)
+ VPBlockBase::deleteCFG(Entry);
+ for (auto &MapEntry : Value2VPValue)
+ if (MapEntry.second != BackedgeTakenCount)
+ delete MapEntry.second;
+ if (BackedgeTakenCount)
+ delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
+ for (VPValue *CBV : VPCBVs)
+ delete CBV;
+ }
+
+ /// Generate the IR code for this VPlan.
+ void execute(struct VPTransformState *State);
+
+ VPBlockBase *getEntry() { return Entry; }
+ const VPBlockBase *getEntry() const { return Entry; }
+
+ VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+
+ /// The backedge taken count of the original loop.
+ VPValue *getOrCreateBackedgeTakenCount() {
+ if (!BackedgeTakenCount)
+ BackedgeTakenCount = new VPValue();
+ return BackedgeTakenCount;
+ }
+
+ void addVF(unsigned VF) { VFs.insert(VF); }
+
+ bool hasVF(unsigned VF) { return VFs.count(VF); }
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
+ /// Add \p CBV to the vector of condition bit values.
+ void addCBV(VPValue *CBV) {
+ VPCBVs.push_back(CBV);
+ }
+
+ void addVPValue(Value *V) {
+ assert(V && "Trying to add a null Value to VPlan");
+ assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+ Value2VPValue[V] = new VPValue();
+ }
+
+ VPValue *getVPValue(Value *V) {
+ assert(V && "Trying to get the VPValue of a null Value");
+ assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+ return Value2VPValue[V];
+ }
+
+ /// Return the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
+ const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
+
+private:
+ /// Add to the given dominator tree the header block and every new basic block
+ /// that was created between it and the latch block, inclusive.
+ static void updateDominatorTree(DominatorTree *DT,
+ BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB);
+};
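+
+// Usage sketch (illustrative, not part of this patch; Entry is an assumed
+// VPBlockBase and V an input IR Value): a plan tracks its candidate VFs and
+// the VPValues modeling input IR:
+//   VPlan Plan(Entry);
+//   Plan.addVF(4);
+//   Plan.addVPValue(V);
+//   VPValue *VPV = Plan.getVPValue(V);  // Asserts that V was added.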
+
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+ friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan);
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const struct VPlanIngredient &I);
+
+private:
+ raw_ostream &OS;
+ VPlan &Plan;
+ unsigned Depth;
+ unsigned TabWidth = 2;
+ std::string Indent;
+ unsigned BID = 0;
+ SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+ VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {}
+
+ /// Handle indentation.
+ void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+ /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block);
+
+ /// Print the information related to the CFG edges going out of a given
+ /// \p Block, followed by printing the successor blocks themselves.
+ void dumpEdges(const VPBlockBase *Block);
+
+ /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+ /// its successor blocks.
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+ /// Print a given \p Region of the Plan.
+ void dumpRegion(const VPRegionBlock *Region);
+
+ unsigned getOrCreateBID(const VPBlockBase *Block) {
+ return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+ }
+
+ const Twine getOrCreateName(const VPBlockBase *Block);
+
+ const Twine getUID(const VPBlockBase *Block);
+
+ /// Print the information related to a CFG edge between two VPBlockBases.
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+ const Twine &Label);
+
+ void dump();
+
+ static void printAsIngredient(raw_ostream &O, Value *V);
+};
+
+struct VPlanIngredient {
+ Value *V;
+
+ VPlanIngredient(Value *V) : V(V) {}
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+ VPlanPrinter::printAsIngredient(OS, I.V);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) {
+ VPlanPrinter Printer(OS, Plan);
+ Printer.dump();
+ return OS;
+}
+
+//===----------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
+//===----------------------------------------------------------------------===//
+
+// The following set of template specializations implement GraphTraits to treat
+// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
+// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
+// VPBlockBase is a VPRegionBlock, this specialization provides access to its
+// successors/predecessors but not to the blocks inside the region.
+
+template <> struct GraphTraits<VPBlockBase *> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+ using NodeRef = const VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+// Inverse order specialization for VPBlockBases. Predecessors are used instead
+// of successors for the inverse traversal.
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
+
+// The following set of template specializations implement GraphTraits to
+// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
+// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
+// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
+// there won't be automatic recursion into other VPBlockBases that turn out to
+// be VPRegionBlocks.
+
+template <>
+struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<const VPRegionBlock *>
+ : public GraphTraits<const VPBlockBase *> {
+ using GraphRef = const VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<Inverse<VPRegionBlock *>>
+ : public GraphTraits<Inverse<VPBlockBase *>> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(Inverse<GraphRef> N) {
+ return N.Graph->getExit();
+ }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getExit());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
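+
+// Traversal sketch (illustrative, not part of this patch): with these traits,
+// generic graph algorithms apply directly, e.g. a depth-first walk over the
+// blocks of a region, as dumpRegion does:
+//   for (const VPBlockBase *Block : depth_first(Region->getEntry()))
+//     Block->print(errs());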
+
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
+ /// has more than one successor, its conditional bit is propagated to \p
+ /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+    // TODO: move successors from BlockPtr to NewBlock when this functionality
+    // is necessary. For now, setOneSuccessor will assert if BlockPtr already
+    // has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+  /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases can be already connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+           "Can't connect two blocks with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+
+ /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
+ static bool isBackEdge(const VPBlockBase *FromBlock,
+ const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
+ assert(FromBlock->getParent() == ToBlock->getParent() &&
+ FromBlock->getParent() && "Must be in same region");
+ const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
+ const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
+ if (!FromLoop || !ToLoop || FromLoop != ToLoop)
+ return false;
+
+ // A back-edge is a branch from the loop latch to its header.
+ return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
+ }
+
+  /// Returns true if \p Block is a loop latch.
+ static bool blockIsLoopLatch(const VPBlockBase *Block,
+ const VPLoopInfo *VPLInfo) {
+ if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
+ return ParentVPL->isLoopLatch(Block);
+
+ return false;
+ }
+
+  /// Count and return the number of successors of \p PredBlock excluding any
+ /// backedges.
+ static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
+ VPLoopInfo *VPLI) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
+ if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
+ Count++;
+ }
+ return Count;
+ }
+};
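+
+// A composition sketch for the utilities above (hypothetical blocks, shown
+// for illustration only): build an if-then-else diamond below an existing
+// block `Entry`, using an already-created condition VPValue `Cond`:
+//
+//   VPBasicBlock *Then = new VPBasicBlock("then");
+//   VPBasicBlock *Else = new VPBasicBlock("else");
+//   VPBasicBlock *Merge = new VPBasicBlock("merge");
+//   Merge->setParent(Entry->getParent());
+//   VPBlockUtils::insertTwoBlocksAfter(Then, Else, Cond, Entry);
+//   VPBlockUtils::connectBlocks(Then, Merge);
+//   VPBlockUtils::connectBlocks(Else, Merge);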
+
+class VPInterleavedAccessInfo {
+private:
+ DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
+ InterleaveGroupMap;
+
+ /// Type for mapping of instruction based interleave groups to VPInstruction
+  /// interleave groups.
+ using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
+ InterleaveGroup<VPInstruction> *>;
+
+  /// Recursively traverse \p Region and populate VPlan based interleave groups
+  /// based on \p IAI.
+ void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+ /// Recursively traverse \p Block and populate VPlan based interleave groups
+ /// based on \p IAI.
+ void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+
+public:
+ VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
+
+ ~VPInterleavedAccessInfo() {
+ SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
+ // Avoid releasing a pointer twice.
+ for (auto &I : InterleaveGroupMap)
+ DelSet.insert(I.second);
+ for (auto *Ptr : DelSet)
+ delete Ptr;
+ }
+
+ /// Get the interleave group that \p Instr belongs to.
+ ///
+  /// \returns nullptr if \p Instr does not belong to any interleave group.
+ InterleaveGroup<VPInstruction> *
+ getInterleaveGroup(VPInstruction *Instr) const {
+    auto It = InterleaveGroupMap.find(Instr);
+    return It == InterleaveGroupMap.end() ? nullptr : It->second;
+ }
+};
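+
+// A query sketch (illustrative): once constructed from a VPlan and the legacy
+// InterleavedAccessInfo, interleave groups can be looked up per VPInstruction.
+// `Plan`, `IAI` and `VPInst` are assumed to be in scope:
+//
+//   VPInterleavedAccessInfo VPIAI(*Plan, IAI);
+//   if (auto *Group = VPIAI.getInterleaveGroup(VPInst))
+//     dbgs() << "Member index: " << Group->getIndex(VPInst) << "\n";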
+
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+private:
+ enum class OpMode { Failed, Load, Opcode };
+
+ /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+ /// DenseMap keys.
+ struct BundleDenseMapInfo {
+ static SmallVector<VPValue *, 4> getEmptyKey() {
+ return {reinterpret_cast<VPValue *>(-1)};
+ }
+
+ static SmallVector<VPValue *, 4> getTombstoneKey() {
+ return {reinterpret_cast<VPValue *>(-2)};
+ }
+
+ static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+ const SmallVector<VPValue *, 4> &RHS) {
+ return LHS == RHS;
+ }
+ };
+
+ /// Mapping of values in the original VPlan to a combined VPInstruction.
+ DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+ BundleToCombined;
+
+ VPInterleavedAccessInfo &IAI;
+
+ /// Basic block to operate on. For now, only instructions in a single BB are
+ /// considered.
+ const VPBasicBlock &BB;
+
+ /// Indicates whether we managed to combine all visited instructions or not.
+ bool CompletelySLP = true;
+
+ /// Width of the widest combined bundle in bits.
+ unsigned WidestBundleBits = 0;
+
+ using MultiNodeOpTy =
+ typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+ // Input operand bundles for the current multi node. Each multi node operand
+ // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we have completed building the
+  // multi node.
+ SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+ /// Indicates whether we are building a multi node currently.
+ bool MultiNodeActive = false;
+
+ /// Check if we can vectorize Operands together.
+ bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+ /// Add combined instruction \p New for the bundle \p Operands.
+ void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+ /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+ VPInstruction *markFailed();
+
+ /// Reorder operands in the multi node to maximize sequential memory access
+ /// and commutative operations.
+ SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+ /// Choose the best candidate to use for the lane after \p Last. The set of
+ /// candidates to choose from are values with an opcode matching \p Last's
+ /// or loads consecutive to \p Last.
+ std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI);
+
+ /// Print bundle \p Values to dbgs().
+ void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+ VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
+ ~VPlanSlp() {
+ for (auto &KV : BundleToCombined)
+ delete KV.second;
+ }
+
+ /// Tries to build an SLP tree rooted at \p Operands and returns a
+ /// VPInstruction combining \p Operands, if they can be combined.
+ VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+ /// Return the width of the widest combined bundle in bits.
+ unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions can be combined.
+ bool isCompletelySLP() const { return CompletelySLP; }
+};
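+
+// A driver sketch (illustrative, assuming `Store0` and `Store1` form a seed
+// bundle of store VPInstructions in `VPBB`, with `Plan` and `IAI` in scope):
+//
+//   VPInterleavedAccessInfo VPIAI(*Plan, IAI);
+//   VPlanSlp Slp(VPIAI, *VPBB);
+//   SmallVector<VPValue *, 4> Bundle = {Store0, Store1};
+//   VPInstruction *Root = Slp.buildGraph(Bundle);
+//   if (Root && Slp.isCompletelySLP())
+//     /* widen, guided by Slp.getWidestBundleBits() */;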
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
new file mode 100644
index 000000000000..19f5d2c00c60
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -0,0 +1,40 @@
+//===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements dominator tree analysis for a single level of a VPlan's
+/// H-CFG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+
+#include "VPlan.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/IR/Dominators.h"
+
+namespace llvm {
+
+/// Instantiation of the standard LLVM dominator tree utility for
+/// VPBlockBases.
+using VPDominatorTree = DomTreeBase<VPBlockBase>;
+
+using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
+
+/// Template specializations of GraphTraits for VPDomTreeNode.
+template <>
+struct GraphTraits<VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<VPDomTreeNode, VPDomTreeNode::iterator> {};
+
+template <>
+struct GraphTraits<const VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<const VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
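+
+// A usage sketch (illustrative): the tree is recalculated over a region's
+// plain CFG and queried like its IR counterpart. `TopRegion`, `BlockA` and
+// `BlockB` are assumed to be in scope:
+//
+//   VPDominatorTree VPDT;
+//   VPDT.recalculate(*TopRegion);
+//   bool Dominates = VPDT.dominates(BlockA, BlockB);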
+} // namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
new file mode 100644
index 000000000000..df96f67288f1
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -0,0 +1,354 @@
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+///
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+  // construction because subsequent VPlan-to-VPlan transformations may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+  // Hold phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+#ifndef NDEBUG
+ bool isExternalDef(Value *Val);
+#endif
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+#ifndef NDEBUG
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) outermost loop PH and, c)
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+#endif
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstruction's. For
+// the latter, please, look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+  // means that the operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+      // Phi node's operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+      // Build VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2. Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+  // predecessors must be set in the same order as they are in the incoming IR.
+  // Otherwise, there might be problems with existing phi nodes and algorithms
+  // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+  // The header's predecessors (including the preheader) will be set during the
+  // loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+    // exist already. VPInstructions will be created when the successor is
+    // visited during the RPO traversal.
+ Instruction *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process outermost loop exit. We created an empty VPBB for the loop
+ // single exit BB during the RPO traversal of the loop body but Instructions
+  // weren't visited because it's not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+  // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
+ // VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
+
+VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ return PCFGBuilder.buildPlainCFG();
+}
+
+// Public interface to build an H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG() {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ VPRegionBlock *TopRegion = buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+
+ // Compute plain CFG dom tree for VPLInfo.
+ VPDomTree.recalculate(*TopRegion);
+ LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
+ VPDomTree.print(dbgs()));
+
+ // Compute VPLInfo and keep it in Plan.
+ VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
+ VPLInfo.analyze(VPDomTree);
+ LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
+ VPLInfo.print(dbgs()));
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
new file mode 100644
index 000000000000..238ee7e6347c
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -0,0 +1,71 @@
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+class VPlanTestBase;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+ friend VPlanTestBase;
+
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // The VPlan that will contain the H-CFG we are building.
+ VPlan &Plan;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+ // Dominator analysis for VPlan plain CFG to be used in the
+ // construction of the H-CFG. This analysis is no longer valid once regions
+ // are introduced.
+ VPDominatorTree VPDomTree;
+
+ /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
+ /// enclosing the plain CFG.
+ VPRegionBlock *buildPlainCFG();
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ /// Build H-CFG for TheLoop and update Plan accordingly.
+ void buildHierarchicalCFG();
+};
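+
+// A usage sketch (illustrative): the builder is driven from the planning code
+// roughly as follows, where `OrigLoop`, `LI` and `Plan` are names assumed for
+// the enclosing planner's loop, LoopInfo and VPlan:
+//
+//   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+//   HCFGBuilder.buildHierarchicalCFG();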
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
new file mode 100644
index 000000000000..b22d3190d654
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -0,0 +1,84 @@
+//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+ auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+
+ // Condition bit VPValues get deleted during transformation to VPRecipes.
+  // Create new VPValues and save them away as condition bits. These will be
+  // deleted after finalizing the vector IR basic blocks.
+ for (VPBlockBase *Base : RPOT) {
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ if (auto *CondBit = VPBB->getCondBit()) {
+ auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
+ VPBB->setCondBit(NCondBit);
+ Plan->addCBV(NCondBit);
+ }
+ }
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ VPRecipeBase *LastRecipe = nullptr;
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else {
+ // If the last recipe is a VPWidenRecipe, add Inst to it instead of
+ // creating a new recipe.
+ if (VPWidenRecipe *WidenRecipe =
+ dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
+ WidenRecipe->appendInstruction(Inst);
+ Ingredient->eraseFromParent();
+ continue;
+ }
+ NewRecipe = new VPWidenRecipe(Inst);
+ }
+
+ NewRecipe->insertBefore(Ingredient);
+ LastRecipe = NewRecipe;
+ Ingredient->eraseFromParent();
+ }
+ }
+}
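+
+// Caller sketch (illustrative): the planner is expected to run this
+// transformation after building the H-CFG, roughly as below. `Legal` (the
+// LoopVectorizationLegality instance) and `DeadInstructions` are assumptions
+// about the surrounding code.
+//
+//   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+//       Plan, Legal->getInductionVars(), DeadInstructions);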
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
new file mode 100644
index 000000000000..79a23c33184f
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
@@ -0,0 +1,35 @@
+//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+class VPlanHCFGTransforms {
+
+public:
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h b/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h
new file mode 100644
index 000000000000..5208f2d58e2b
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h
@@ -0,0 +1,44 @@
+//===-- VPlanLoopInfo.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
+/// specialization of LoopInfoBase for VPBlockBase. VPLoop is a specialization
+/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
+/// information can be found in VectorizationPlanner.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+
+#include "llvm/Analysis/LoopInfoImpl.h"
+
+namespace llvm {
+class VPBlockBase;
+
+/// Hold analysis information for every loop detected by VPLoopInfo. It is an
+/// instantiation of LoopBase.
+class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
+private:
+ friend class LoopInfoBase<VPBlockBase, VPLoop>;
+ explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
+};
+
+/// VPLoopInfo provides analysis of natural loops for VPBlockBase-based
+/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
+// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
+// is the same as the incoming IR CFG. If it's more efficient than running the
+// whole loop detection algorithm, we may want to create a mechanism to
+// translate LoopInfo into VPLoopInfo. However, that would require significant
+// changes in LoopInfoBase class.
+using VPLoopInfo = LoopInfoBase<VPBlockBase, VPLoop>;
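+
+// A usage sketch (illustrative): loop info is computed from a VPlan dominator
+// tree and queried per block. `VPDomTree` and `Block` are assumed in scope:
+//
+//   VPLoopInfo VPLInfo;
+//   VPLInfo.analyze(VPDomTree);
+//   VPLoop *L = VPLInfo.getLoopFor(Block);
+//   bool IsLatch = L && L->isLoopLatch(Block);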
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
new file mode 100644
index 000000000000..7a80f3ff80a5
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -0,0 +1,248 @@
+//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanPredicator.h"
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "VPlanPredicator"
+
+using namespace llvm;
+
+// Generate VPInstructions at the beginning of CurrBB that calculate the
+// predicate being propagated from PredBB to CurrBB depending on the edge type
+// between them. For example if:
+// i. PredBB is controlled by predicate %BP, and
+// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
+// bit value %CBV then this function will generate the following two
+// VPInstructions at the start of CurrBB:
+// %IntermediateVal = not %CBV
+// %FinalVal = and %BP %IntermediateVal
+// It returns %FinalVal.
+VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
+ VPBasicBlock *CurrBB) {
+ VPValue *CBV = PredBB->getCondBit();
+
+ // Set the intermediate value - this is either 'CBV', or 'not CBV'
+ // depending on the edge type.
+ EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
+ VPValue *IntermediateVal = nullptr;
+ switch (ET) {
+ case EdgeType::TRUE_EDGE:
+ // CurrBB is the true successor of PredBB - nothing to do here.
+ IntermediateVal = CBV;
+ break;
+
+ case EdgeType::FALSE_EDGE:
+ // CurrBB is the False successor of PredBB - compute not of CBV.
+ IntermediateVal = Builder.createNot(CBV);
+ break;
+ }
+
+ // Now AND intermediate value with PredBB's block predicate if it has one.
+ VPValue *BP = PredBB->getPredicate();
+ if (BP)
+ return Builder.createAnd(BP, IntermediateVal);
+ else
+ return IntermediateVal;
+}
+
+// Generate a tree of ORs for all the incoming predicates in Worklist.
+// Note: This function destroys the original Worklist.
+//
+// P1 P2 P3 P4 P5
+// \ / \ / /
+// OR1 OR2 /
+// \ | /
+// \ +/-+
+// \ / |
+// OR3 |
+// \ |
+// OR4 <- Returns this
+// |
+//
+// The algorithm uses a worklist of predicates as its main data structure.
+// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
+// (in this example OR1), and push it back. In this example the worklist
+// contains {P3, P4, P5, OR1}.
+// The process iterates until we have only one element in the Worklist (OR4).
+// The last element is the root predicate which is returned.
+VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
+ if (Worklist.empty())
+ return nullptr;
+
+ // The worklist initially contains all the leaf nodes. Initialize the tree
+ // using them.
+ while (Worklist.size() >= 2) {
+ // Pop a pair of values from the front.
+ VPValue *LHS = Worklist.front();
+ Worklist.pop_front();
+ VPValue *RHS = Worklist.front();
+ Worklist.pop_front();
+
+ // Create an OR of these values.
+ VPValue *Or = Builder.createOr(LHS, RHS);
+
+ // Push OR to the back of the worklist.
+ Worklist.push_back(Or);
+ }
+
+ assert(Worklist.size() == 1 && "Expected 1 item in worklist");
+
+ // The root is the last node in the worklist.
+ VPValue *Root = Worklist.front();
+
+ // This root needs to replace the existing block predicate. This is done in
+ // the caller function.
+ return Root;
+}
+
+// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
+VPlanPredicator::EdgeType
+VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
+ VPBlockBase *ToBlock) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
+ if (SuccBlock == ToBlock) {
+ assert(Count < 2 && "Switch not supported currently");
+ return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
+ }
+ Count++;
+ }
+
+ llvm_unreachable("Broken getEdgeTypeBetween");
+}
+
+// Generate all predicates needed for CurrBlock by going through its immediate
+// predecessor blocks.
+void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region) {
+ // Blocks that dominate region exit inherit the predicate from the region.
+ // Return after setting the predicate.
+ if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
+ VPValue *RegionBP = Region->getPredicate();
+ CurrBlock->setPredicate(RegionBP);
+ return;
+ }
+
+ // Collect all incoming predicates in a worklist.
+ std::list<VPValue *> IncomingPredicates;
+
+ // Set the builder's insertion point to the top of the current BB
+ VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
+ Builder.setInsertPoint(CurrBB, CurrBB->begin());
+
+ // For each predecessor, generate the VPInstructions required for
+  // computing "BP AND (not) CBV" at the top of CurrBB.
+ // Collect the outcome of this calculation for all predecessors
+ // into IncomingPredicates.
+ for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
+ // Skip back-edges
+ if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
+ continue;
+
+ VPValue *IncomingPredicate = nullptr;
+ unsigned NumPredSuccsNoBE =
+ VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
+
+    // If there is an unconditional branch to CurrBB, then we don't create
+ // edge predicates. We use the predecessor's block predicate instead.
+ if (NumPredSuccsNoBE == 1)
+ IncomingPredicate = PredBlock->getPredicate();
+ else if (NumPredSuccsNoBE == 2) {
+ // Emit recipes into CurrBlock if required
+ assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
+ IncomingPredicate =
+ getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
+ } else
+ llvm_unreachable("FIXME: switch statement ?");
+
+ if (IncomingPredicate)
+ IncomingPredicates.push_back(IncomingPredicate);
+ }
+
+ // Logically OR all incoming predicates by building the Predicate Tree.
+ VPValue *Predicate = genPredicateTree(IncomingPredicates);
+
+ // Now update the block's predicate with the new one.
+ CurrBlock->setPredicate(Predicate);
+}
+
+// Generate all predicates needed for Region.
+void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
+ VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
+
+ // Generate edge predicates and append them to the block predicate. RPO is
+ // necessary since the predecessor blocks' block predicate needs to be set
+ // before the current block's block predicate can be computed.
+ for (VPBlockBase *Block : make_range(RPOT.begin(), RPOT.end())) {
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
+ createOrPropagatePredicates(Block, Region);
+ }
+}
+
+// Linearize the CFG within Region.
+// TODO: Predication and linearization need RPOT for every region.
+// This traversal is expensive. Since predication is not adding new
+// blocks, we should be able to compute RPOT once in predication and
+// reuse it here. This becomes even more important once we have nested
+// regions.
+void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ VPBlockBase *PrevBlock = nullptr;
+
+ for (VPBlockBase *CurrBlock : make_range(RPOT.begin(), RPOT.end())) {
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
+
+ // Linearize control flow by adding an unconditional edge between PrevBlock
+ // and CurrBlock skipping loop headers and latches to keep intact loop
+ // header predecessors and loop latch successors.
+ if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
+ !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
+
+ LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
+ << CurrBlock->getName() << "\n");
+
+ PrevBlock->clearSuccessors();
+ CurrBlock->clearPredecessors();
+ VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
+ }
+
+ PrevBlock = CurrBlock;
+ }
+}
+
+// Entry point. The driver function for the predicator.
+void VPlanPredicator::predicate(void) {
+ // Predicate the blocks within Region.
+ predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+
+  // Linearize the blocks within Region.
+ linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+}
+
+VPlanPredicator::VPlanPredicator(VPlan &Plan)
+ : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
+ // FIXME: Predicator is currently computing the dominator information for the
+ // top region. Once we start storing dominator information in a VPRegionBlock,
+ // we can avoid this recalculation.
+ VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
+}
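+
+// Usage sketch (illustrative): predication runs on a fully built H-CFG, e.g.:
+//
+//   VPlanPredicator VPP(*Plan);
+//   VPP.predicate();
+//
+// where `Plan` is assumed to be a VPlanPtr produced by the H-CFG builder.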
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
new file mode 100644
index 000000000000..692afd2978d5
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -0,0 +1,74 @@
+//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+
+namespace llvm {
+
+class VPlanPredicator {
+private:
+ enum class EdgeType {
+ TRUE_EDGE,
+ FALSE_EDGE,
+ };
+
+ // VPlan being predicated.
+ VPlan &Plan;
+
+ // VPLoopInfo for Plan's HCFG.
+ VPLoopInfo *VPLI;
+
+ // Dominator tree for Plan's HCFG.
+ VPDominatorTree VPDomTree;
+
+ // VPlan builder used to generate VPInstructions for block predicates.
+ VPBuilder Builder;
+
+ /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
+ /// \p ToBlock is either the unconditional successor or the conditional true
+ /// successor of \p FromBlock and FALSE_EDGE otherwise.
+ EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
+
+ /// Create and return VPValue corresponding to the predicate for the edge from
+ /// \p PredBB to \p CurrentBlock.
+ VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
+
+ /// Generate and return the result of ORing all the predicate VPValues in \p
+ /// Worklist.
+ VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
+
+ /// Create or propagate predicate for \p CurrBlock in region \p Region using
+ /// predicate(s) of its predecessor(s)
+ void createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region);
+
+ /// Predicate the CFG within \p Region.
+ void predicateRegionRec(VPRegionBlock *Region);
+
+ /// Linearize the CFG within \p Region.
+ void linearizeRegionRec(VPRegionBlock *Region);
+
+public:
+ VPlanPredicator(VPlan &Plan);
+
+ /// Predicate Plan's HCFG.
+ void predicate(void);
+};
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
new file mode 100644
index 000000000000..9019ed15ec5f
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,470 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+/// Look-ahead SLP: auto-vectorization in the presence of commutative
+/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+/// Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+ // FIXME: Currently this is used to signal we hit instructions we cannot
+ // trivially SLP'ize.
+ CompletelySLP = false;
+ return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+ if (all_of(Operands, [](VPValue *V) {
+ return cast<VPInstruction>(V)->getUnderlyingInstr();
+ })) {
+ unsigned BundleSize = 0;
+ for (VPValue *V : Operands) {
+ Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+ assert(!T->isVectorTy() && "Only scalar types supported for now");
+ BundleSize += T->getScalarSizeInBits();
+ }
+ WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+ }
+
+ auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+ assert(Res.second &&
+ "Already created a combined instruction for the operand bundle");
+ (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+ // Currently we only support VPInstructions.
+ if (!all_of(Operands, [](VPValue *Op) {
+ return Op && isa<VPInstruction>(Op) &&
+ cast<VPInstruction>(Op)->getUnderlyingInstr();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+ return false;
+ }
+
+ // Check if opcodes and type width agree for all instructions in the bundle.
+ // FIXME: Differing widths/opcodes can be handled by inserting additional
+ // instructions.
+ // FIXME: Deal with non-primitive types.
+ const Instruction *OriginalInstr =
+ cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+ unsigned Opcode = OriginalInstr->getOpcode();
+ unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+ if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+ const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+ return I->getOpcode() == Opcode &&
+ I->getType()->getPrimitiveSizeInBits() == Width;
+ })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes or widths do not agree\n");
+ return false;
+ }
+
+ // For now, all operands must be defined in the same BB.
+ if (any_of(Operands, [this](VPValue *Op) {
+ return cast<VPInstruction>(Op)->getParent() != &this->BB;
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+ return false;
+ }
+
+ if (any_of(Operands,
+ [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+ return false;
+ }
+
+ // For loads, check that there are no instructions writing to memory in
+ // between them.
+ // TODO: we only have to forbid instructions writing to memory that could
+ // interfere with any of the loads in the bundle
+ if (Opcode == Instruction::Load) {
+ unsigned LoadsSeen = 0;
+ VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+ for (auto &I : *Parent) {
+ auto *VPI = cast<VPInstruction>(&I);
+ if (VPI->getOpcode() == Instruction::Load &&
+          is_contained(Operands, VPI))
+ LoadsSeen++;
+
+ if (LoadsSeen == Operands.size())
+ break;
+ if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+ LLVM_DEBUG(
+ dbgs() << "VPSLP: instruction modifying memory between loads\n");
+ return false;
+ }
+ }
+
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+ return false;
+ }
+ }
+
+ if (Opcode == Instruction::Store)
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+ unsigned OperandIndex) {
+ SmallVector<VPValue *, 4> Operands;
+ for (VPValue *V : Values) {
+ auto *U = cast<VPUser>(V);
+ Operands.push_back(U->getOperand(OperandIndex));
+ }
+ return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+ return Instruction::isCommutative(
+ cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+ SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+ auto *VPI = cast<VPInstruction>(Values[0]);
+
+ switch (VPI->getOpcode()) {
+ case Instruction::Load:
+ llvm_unreachable("Loads terminate a tree, no need to get operands");
+ case Instruction::Store:
+ Result.push_back(getOperands(Values, 0));
+ break;
+ default:
+ for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+ Result.push_back(getOperands(Values, I));
+ break;
+ }
+
+ return Result;
+}
+
+/// Returns the common opcode of \p Values, or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+ unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+ if (any_of(Values, [Opcode](VPValue *V) {
+ return cast<VPInstruction>(V)->getOpcode() != Opcode;
+ }))
+ return None;
+ return {Opcode};
+}
+
+/// Returns true if \p A and \p B have identical opcodes and, if they are loads
+/// or stores, also access sequential memory.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+ VPInterleavedAccessInfo &IAI) {
+ if (A->getOpcode() != B->getOpcode())
+ return false;
+
+ if (A->getOpcode() != Instruction::Load &&
+ A->getOpcode() != Instruction::Store)
+ return true;
+ auto *GA = IAI.getInterleaveGroup(A);
+ auto *GB = IAI.getInterleaveGroup(B);
+
+ return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+ VPInterleavedAccessInfo &IAI) {
+ if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+ return 0;
+
+ if (MaxLevel == 0)
+ return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+ cast<VPInstruction>(V2), IAI);
+
+ unsigned Score = 0;
+ for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+ for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+ Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+ cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+ return Score;
+}
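+
+// Worked example (hypothetical values, for intuition): let A = add a0, l0 and
+// B = add a1, l1, where l0 and l1 are consecutive loads of the same interleave
+// group and a0/a1 are non-memory values. getLAScore(A, B, 1, IAI) compares all
+// 2x2 operand pairs at level 0: (l0, l1) scores 1 (consecutive loads),
+// (a0, a1) scores 1 if they share an opcode, and the mixed pairs score 0, for
+// a total score of 2.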
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI) {
+ assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
+ "Currently we only handle load and commutative opcodes");
+ LLVM_DEBUG(dbgs() << " getBest\n");
+
+ SmallVector<VPValue *, 4> BestCandidates;
+ LLVM_DEBUG(dbgs() << " Candidates for "
+ << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+ for (auto *Candidate : Candidates) {
+ auto *LastI = cast<VPInstruction>(Last);
+ auto *CandidateI = cast<VPInstruction>(Candidate);
+ if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+ LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+ << " ");
+ BestCandidates.push_back(Candidate);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (BestCandidates.empty())
+ return {OpMode::Failed, nullptr};
+
+ if (BestCandidates.size() == 1)
+ return {Mode, BestCandidates[0]};
+
+ VPValue *Best = nullptr;
+ unsigned BestScore = 0;
+ for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+ unsigned PrevScore = ~0u;
+ bool AllSame = true;
+
+ // FIXME: Avoid visiting the same operands multiple times.
+ for (auto *Candidate : BestCandidates) {
+ unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+ if (PrevScore == ~0u)
+ PrevScore = Score;
+ if (PrevScore != Score)
+ AllSame = false;
+ PrevScore = Score;
+
+ if (Score > BestScore) {
+ BestScore = Score;
+ Best = Candidate;
+ }
+ }
+ if (!AllSame)
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "Found best "
+ << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+ << "\n");
+ Candidates.erase(Best);
+
+ return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+ SmallVector<MultiNodeOpTy, 4> FinalOrder;
+ SmallVector<OpMode, 4> Mode;
+ FinalOrder.reserve(MultiNodeOps.size());
+ Mode.reserve(MultiNodeOps.size());
+
+ LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+ for (auto &Operands : MultiNodeOps) {
+ FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+ if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+ Instruction::Load)
+ Mode.push_back(OpMode::Load);
+ else
+ Mode.push_back(OpMode::Opcode);
+ }
+
+ for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+ LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
+ SmallPtrSet<VPValue *, 4> Candidates;
+ LLVM_DEBUG(dbgs() << " Candidates ");
+ for (const auto &Ops : MultiNodeOps) {
+ LLVM_DEBUG(
+ dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+ << " ");
+ Candidates.insert(Ops.second[Lane]);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+ LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
+ if (Mode[Op] == OpMode::Failed)
+ continue;
+
+ VPValue *Last = FinalOrder[Op].second[Lane - 1];
+ std::pair<OpMode, VPValue *> Res =
+ getBest(Mode[Op], Last, Candidates, IAI);
+ if (Res.second)
+ FinalOrder[Op].second.push_back(Res.second);
+ else
+ // TODO: handle this case
+ FinalOrder[Op].second.push_back(markFailed());
+ }
+ }
+
+ return FinalOrder;
+}
+
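+// A small hypothetical picture of the reordering: for a multinode with two
+// operand rows and two lanes,
+//
+//   row0: { load A[0], ? }    row1: { load B[0], ? }
+//   candidates for lane 1: { load A[1], load B[1] }
+//
+// lane 0 fixes the first column; for lane 1, getBest assigns each row the
+// candidate that is consecutive with its lane-0 value (load A[1] to row0,
+// load B[1] to row1), regardless of the order the operands appeared in.
+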
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+ dbgs() << " Ops: ";
+ for (auto Op : Values) {
+ if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
+ if (auto *Instr = VPInstr->getUnderlyingInstr()) {
+ dbgs() << *Instr << " | ";
+ continue;
+ }
+ dbgs() << " nullptr | ";
+ }
+ dbgs() << "\n";
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+ assert(!Values.empty() && "Need some operands!");
+
+ // If we already visited this instruction bundle, re-use the existing node.
+ auto I = BundleToCombined.find(to_vector<4>(Values));
+ if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+ // Check that the resulting graph is a tree. If we re-use a node, this means
+ // its values have multiple users. We only allow this if all users of each
+ // value are the same instruction.
+ for (auto *V : Values) {
+ auto UI = V->user_begin();
+ auto *FirstUser = *UI++;
+ while (UI != V->user_end()) {
+ assert(*UI == FirstUser && "Currently we only support SLP trees.");
+ UI++;
+ }
+ }
+#endif
+ return I->second;
+ }
+
+ // Dump inputs
+ LLVM_DEBUG({
+ dbgs() << "buildGraph: ";
+ dumpBundle(Values);
+ });
+
+ if (!areVectorizable(Values))
+ return markFailed();
+
+ assert(getOpcode(Values) && "Opcodes for all values must match");
+ unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+ SmallVector<VPValue *, 4> CombinedOperands;
+ if (areCommutative(Values)) {
+ bool MultiNodeRoot = !MultiNodeActive;
+ MultiNodeActive = true;
+ for (auto &Operands : getOperands(Values)) {
+ LLVM_DEBUG({
+ dbgs() << " Visiting Commutative";
+ dumpBundle(Operands);
+ });
+
+ auto OperandsOpcode = getOpcode(Operands);
+ if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+ LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
+ CombinedOperands.push_back(buildGraph(Operands));
+ } else {
+ LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
+ // Create a dummy VPInstruction, which we will replace later with the
+ // re-ordered operand.
+ VPInstruction *Op = new VPInstruction(0, {});
+ CombinedOperands.push_back(Op);
+ MultiNodeOps.emplace_back(Op, Operands);
+ }
+ }
+
+ if (MultiNodeRoot) {
+ LLVM_DEBUG(dbgs() << "Reorder \n");
+ MultiNodeActive = false;
+
+ auto FinalOrder = reorderMultiNodeOps();
+
+ MultiNodeOps.clear();
+ for (auto &Ops : FinalOrder) {
+ VPInstruction *NewOp = buildGraph(Ops.second);
+ Ops.first->replaceAllUsesWith(NewOp);
+ for (unsigned I = 0, E = CombinedOperands.size(); I != E; ++I)
+ if (CombinedOperands[I] == Ops.first)
+ CombinedOperands[I] = NewOp;
+ delete Ops.first;
+ Ops.first = NewOp;
+ }
+ LLVM_DEBUG(dbgs() << "Found final order\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " NonCommuntative\n");
+ if (ValuesOpcode == Instruction::Load)
+ for (VPValue *V : Values)
+ CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+ else
+ for (auto &Operands : getOperands(Values))
+ CombinedOperands.push_back(buildGraph(Operands));
+ }
+
+ unsigned Opcode;
+ switch (ValuesOpcode) {
+ case Instruction::Load:
+ Opcode = VPInstruction::SLPLoad;
+ break;
+ case Instruction::Store:
+ Opcode = VPInstruction::SLPStore;
+ break;
+ default:
+ Opcode = ValuesOpcode;
+ break;
+ }
+
+ if (!CompletelySLP)
+ return markFailed();
+
+ assert(!CombinedOperands.empty() && "Need some operands");
+ auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+ VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+ LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+ cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+ addCombined(Values, VPI);
+ return VPI;
+}
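+
+// End-to-end, a hypothetical bundle { store X[0], store X[1] } whose stored
+// values are adds of consecutive loads is combined bottom-up into a single
+// SLPStore fed by a combined add fed by SLPLoads, i.e. one VPInstruction per
+// bundle, intended to be widened into vector memory operations when the
+// VPlan executes.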
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
new file mode 100644
index 000000000000..7b6c228c229e
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -0,0 +1,186 @@
+//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the entities induced by Vectorization
+/// Plans, e.g. the instructions the VPlan intends to generate if executed.
+/// VPlan models the following entities:
+/// VPValue
+/// |-- VPUser
+/// | |-- VPInstruction
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+// Forward declarations.
+class VPUser;
+
+// This is the base class of the VPlan Def/Use graph, used for modeling the data
+// flow into, within and out of the VPlan. VPValues can stand for live-ins
+// coming from the input IR, instructions which VPlan will generate if executed
+// and live-outs which the VPlan will need to fix accordingly.
+class VPValue {
+ friend class VPBuilder;
+ friend class VPlanHCFGTransforms;
+ friend class VPBasicBlock;
+ friend class VPInterleavedAccessInfo;
+
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ SmallVector<VPUser *, 1> Users;
+
+protected:
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
+ VPValue(const unsigned char SC, Value *UV = nullptr)
+ : SubclassID(SC), UnderlyingVal(UV) {}
+
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ /// Return the underlying Value attached to this VPValue.
+ Value *getUnderlyingValue() { return UnderlyingVal; }
+
+ /// Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPValue that
+ /// are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPValue objects. They are used for concrete
+ /// type identification.
+ enum { VPValueSC, VPUserSC, VPInstructionSC };
+
+ VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {}
+ VPValue(const VPValue &) = delete;
+ VPValue &operator=(const VPValue &) = delete;
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPValueID() const { return SubclassID; }
+
+ void printAsOperand(raw_ostream &OS) const {
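+ // Print a name derived from the (truncated) object address; the name is
+ // only stable within a single dump, which is all the debug output needs.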
+ OS << "%vp" << (unsigned short)(unsigned long long)this;
+ }
+
+ unsigned getNumUsers() const { return Users.size(); }
+ void addUser(VPUser &User) { Users.push_back(&User); }
+
+ typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
+ typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
+ typedef iterator_range<user_iterator> user_range;
+ typedef iterator_range<const_user_iterator> const_user_range;
+
+ user_iterator user_begin() { return Users.begin(); }
+ const_user_iterator user_begin() const { return Users.begin(); }
+ user_iterator user_end() { return Users.end(); }
+ const_user_iterator user_end() const { return Users.end(); }
+ user_range users() { return user_range(user_begin(), user_end()); }
+ const_user_range users() const {
+ return const_user_range(user_begin(), user_end());
+ }
+
+ /// Returns true if the value has more than one unique user.
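+ /// For example, a value whose only uses are two operands of the same
+ /// VPUser has exactly one unique user.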
+ bool hasMoreThanOneUniqueUser() {
+ if (getNumUsers() == 0)
+ return false;
+
+ // Check if all users match the first user.
+ auto Current = std::next(user_begin());
+ while (Current != user_end() && *user_begin() == *Current)
+ Current++;
+ return Current != user_end();
+ }
+
+ void replaceAllUsesWith(VPValue *New);
+};
+
+typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
+typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
+
+raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
+
+/// This class augments VPValue with operands which provide the inverse def-use
+/// edges from VPValue's users to their defs.
+class VPUser : public VPValue {
+private:
+ SmallVector<VPValue *, 2> Operands;
+
+protected:
+ VPUser(const unsigned char SC) : VPValue(SC) {}
+ VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+public:
+ VPUser() : VPValue(VPValue::VPUserSC) {}
+ VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
+ VPUser(std::initializer_list<VPValue *> Operands)
+ : VPUser(ArrayRef<VPValue *>(Operands)) {}
+ VPUser(const VPUser &) = delete;
+ VPUser &operator=(const VPUser &) = delete;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() >= VPUserSC &&
+ V->getVPValueID() <= VPInstructionSC;
+ }
+
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
+ unsigned getNumOperands() const { return Operands.size(); }
+ inline VPValue *getOperand(unsigned N) const {
+ assert(N < Operands.size() && "Operand index out of bounds");
+ return Operands[N];
+ }
+
+ void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
+ typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
+ typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
+ typedef iterator_range<operand_iterator> operand_range;
+ typedef iterator_range<const_operand_iterator> const_operand_range;
+
+ operand_iterator op_begin() { return Operands.begin(); }
+ const_operand_iterator op_begin() const { return Operands.begin(); }
+ operand_iterator op_end() { return Operands.end(); }
+ const_operand_iterator op_end() const { return Operands.end(); }
+ operand_range operands() { return operand_range(op_begin(), op_end()); }
+ const_operand_range operands() const {
+ return const_operand_range(op_begin(), op_end());
+ }
+};
+
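+// A minimal usage sketch (illustrative only):
+//
+//   VPValue A, B;
+//   VPUser U({&A, &B}); // addOperand registers U as a user of A and of B.
+//   assert(U.getNumOperands() == 2);
+//   assert(A.getNumUsers() == 1 && !A.hasMoreThanOneUniqueUser());
+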
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
new file mode 100644
index 000000000000..394b1b93113b
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -0,0 +1,132 @@
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
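+
+// The verifier is off by default; it can be enabled from the command line
+// (e.g. `opt ... -vplan-verify-hcfg`), the flag being Hidden but registered.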
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
+ assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
+ SuccPreds.end() &&
+ "Missing predecessor link.");
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
+ assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
+ PredSuccs.end() &&
+ "Missing successor link.");
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
new file mode 100644
index 000000000000..7d2b26252172
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -0,0 +1,43 @@
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+/// Class with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+class VPlanVerifier {
+public:
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 000000000000..6a4f9169c2af
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,42 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+/// initializeVectorization - Initialize all passes linked into the Vectorize
+/// library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+ initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
+ initializeLoadStoreVectorizerLegacyPassPass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+ initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}
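+
+// A minimal usage sketch of the C bindings above (illustrative; Mod is
+// assumed to be an existing LLVMModuleRef):
+//
+//   LLVMPassManagerRef PM = LLVMCreatePassManager();
+//   LLVMAddLoopVectorizePass(PM);
+//   LLVMAddSLPVectorizePass(PM);
+//   LLVMRunPassManager(PM, Mod);
+//   LLVMDisposePassManager(PM);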