Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--   lib/Transforms/Vectorize/CMakeLists.txt               1
-rw-r--r--   lib/Transforms/Vectorize/LoadStoreVectorizer.cpp     84
-rw-r--r--   lib/Transforms/Vectorize/LoopVectorize.cpp         2577
-rw-r--r--   lib/Transforms/Vectorize/SLPVectorizer.cpp         2302
-rw-r--r--   lib/Transforms/Vectorize/VPlan.cpp                  557
-rw-r--r--   lib/Transforms/Vectorize/VPlan.h                   1194
-rw-r--r--   lib/Transforms/Vectorize/VPlanBuilder.h              61
-rw-r--r--   lib/Transforms/Vectorize/VPlanValue.h               146
-rw-r--r--   lib/Transforms/Vectorize/Vectorize.cpp                1
9 files changed, 5312 insertions, 1611 deletions
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 1aea73cd4a32..7622ed6d194f 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMVectorize
LoopVectorize.cpp
SLPVectorizer.cpp
Vectorize.cpp
+ VPlan.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9cf66382b581..dc83b6d4d292 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,4 +1,4 @@
-//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,47 +6,67 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "load-store-vectorizer"
+
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
-namespace {
-
// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;
-typedef SmallVector<Instruction *, 8> InstrList;
-typedef MapVector<Value *, InstrList> InstrListMap;
+
+namespace {
+
+using InstrList = SmallVector<Instruction *, 8>;
+using InstrListMap = MapVector<Value *, InstrList>;
class Vectorizer {
Function &F;
@@ -163,7 +183,10 @@ public:
AU.setPreservesCFG();
}
};
-}
+
+} // end anonymous namespace
+
+char LoadStoreVectorizer::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
"Vectorize load and Store instructions", false, false)
@@ -175,8 +198,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
"Vectorize load and store instructions", false, false)
-char LoadStoreVectorizer::ID = 0;
-
Pass *llvm::createLoadStoreVectorizerPass() {
return new LoadStoreVectorizer();
}
@@ -480,6 +501,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
MemoryInstrs.push_back(&I);
else
ChainInstrs.push_back(&I);
+ } else if (isa<IntrinsicInst>(&I) &&
+ cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+ Intrinsic::sideeffect) {
+ // Ignore llvm.sideeffect calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
break;
@@ -593,7 +618,14 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
// Skip weird non-byte sizes. They probably aren't worth the effort of
// handling correctly.
unsigned TySize = DL.getTypeSizeInBits(Ty);
- if (TySize < 8)
+ if ((TySize % 8) != 0)
+ continue;
+
+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+ // functions are currently using an integer type for the vectorized
+ // load/store, and do not support casting between the integer type and a
+ // vector of pointers (e.g. i64 to <2 x i16*>)
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
continue;
Value *Ptr = LI->getPointerOperand();
@@ -605,7 +637,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
+ if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
@@ -614,7 +646,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
// Save the load locations.
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
LoadRefs[ObjPtr].push_back(LI);
-
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
@@ -627,19 +658,28 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
+ // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+ // functions are currently using an integer type for the vectorized
+ // load/store, and do not support casting between the integer type and a
+ // vector of pointers (e.g. i64 to <2 x i16*>)
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
// Skip weird non-byte sizes. They probably aren't worth the effort of
// handling correctly.
unsigned TySize = DL.getTypeSizeInBits(Ty);
- if (TySize < 8)
+ if ((TySize % 8) != 0)
continue;
Value *Ptr = SI->getPointerOperand();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ // No point in looking at these if they're too big to vectorize.
if (TySize > VecRegSize / 2)
continue;
- if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
+ if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
@@ -680,8 +720,8 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
- // Do a quadratic search on all of the given stores and find all of the pairs
- // of stores that follow each other.
+ // Do a quadratic search on all of the given loads/stores and find all of the
+ // pairs of loads/stores that follow each other.
for (int i = 0, e = Instrs.size(); i < e; ++i) {
ConsecutiveChain[i] = -1;
for (int j = e - 1; j >= 0; --j) {
@@ -748,7 +788,7 @@ bool Vectorizer::vectorizeStoreChain(
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
StoreInst *S0 = cast<StoreInst>(Chain[0]);
- // If the vector has an int element, default to int for the whole load.
+ // If the vector has an int element, default to int for the whole store.
Type *StoreTy;
for (Instruction *I : Chain) {
StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
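The two collectInstructions hunks above replace the old guard TySize < 8 with (TySize % 8) != 0. A small standalone sketch (hypothetical demo, not part of the patch) showing which bit widths each guard rejects:

#include <cstdio>

// Hypothetical demo: the old guard only rejected sub-byte types (i1..i7),
// while odd widths such as i9 or i20 also lack a byte-sized representation
// and are now skipped as well.
static bool oldGuardSkips(unsigned TySizeInBits) { return TySizeInBits < 8; }
static bool newGuardSkips(unsigned TySizeInBits) { return (TySizeInBits % 8) != 0; }

int main() {
  for (unsigned Bits : {1u, 7u, 8u, 9u, 20u, 32u})
    std::printf("i%-2u  old skips: %d  new skips: %d\n", Bits,
                oldGuardSkips(Bits), newGuardSkips(Bits));
  return 0;
}
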
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 012b10c8a9b0..fbcdc0df0f1c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -47,62 +47,97 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "VPlan.h"
+#include "VPlanBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
-#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <map>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -245,12 +280,12 @@ createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
namespace {
-// Forward declarations.
-class LoopVectorizeHints;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
+} // end anonymous namespace
+
/// Returns true if the given loop body has a cycle, excluding the loop
/// itself.
static bool hasCyclesInLoopBody(const Loop &L) {
@@ -324,7 +359,6 @@ static unsigned getMemInstAddressSpace(Value *I) {
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
-
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
@@ -349,7 +383,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
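For context on the setUnsafeAlgebra to setFast rename above: setFast enables every fast-math flag, so floating-point instructions created under those flags carry the fast keyword in IR. A minimal, hypothetical sketch compiled against the LLVM headers (demo names only):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("fmf-demo", Ctx);
  auto *FloatTy = Type::getFloatTy(Ctx);
  auto *FnTy = FunctionType::get(FloatTy, {FloatTy, FloatTy}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "sum", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  FastMathFlags FMF;
  FMF.setFast(); // formerly setUnsafeAlgebra(); turns on all FP relaxations
  B.setFastMathFlags(FMF);

  auto Args = F->args();
  auto It = Args.begin();
  Value *X = &*It++;
  Value *Y = &*It;
  B.CreateRet(B.CreateFAdd(X, Y)); // printed as: fadd fast float %0, %1
  M.print(outs(), nullptr);
  return 0;
}
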
@@ -362,6 +396,8 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
: ConstantFP::get(Ty, C);
}
+namespace llvm {
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -387,16 +423,16 @@ public:
LoopVectorizationCostModel *CM)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()), Induction(nullptr),
- OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
- TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
- AddedSafetyChecks(false) {}
+ Builder(PSE.getSE()->getContext()),
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ virtual ~InnerLoopVectorizer() = default;
/// Create a new empty loop. Unlink the old loop and connect the new one.
- void createVectorizedLoopSkeleton();
+ /// Return the pre-header block of the new loop.
+ BasicBlock *createVectorizedLoopSkeleton();
- /// Vectorize a single instruction within the innermost loop.
- void vectorizeInstruction(Instruction &I);
+ /// Widen a single instruction within the innermost loop.
+ void widenInstruction(Instruction &I);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
@@ -404,28 +440,83 @@ public:
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
- virtual ~InnerLoopVectorizer() {}
-
-protected:
- /// A small list of PHINodes.
- typedef SmallVector<PHINode *, 4> PhiVector;
-
/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
- typedef SmallVector<Value *, 2> VectorParts;
+ using VectorParts = SmallVector<Value *, 2>;
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+
+ /// A helper function to scalarize a single Instruction in the innermost loop.
+ /// Generates a sequence of scalar instances for each lane between \p MinLane
+ /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
+ /// inclusive.
+ void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
+ bool IfPredicateInstr);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to.
+ void vectorizeInterleaveGroup(Instruction *Instr);
+
+ /// Vectorize Load and Store instructions, optionally masking the vector
+ /// operations if \p BlockInMask is non-null.
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
+
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+protected:
+ friend class LoopVectorizationPlanner;
+
+ /// A small list of PHINodes.
+ using PhiVector = SmallVector<PHINode *, 4>;
/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// vectorization factor.
- typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
-
- // When we if-convert we need to create edge masks. We have to cache values
- // so that we don't end up with exponential recursion/IR.
- typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
- EdgeMaskCacheTy;
- typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
@@ -457,40 +548,14 @@ protected:
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
- /// Predicate conditional instructions that require predication on their
- /// respective conditions.
- void predicateInstructions();
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VectorParts createBlockInMask(BasicBlock *BB);
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
-
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
- /// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
- /// scalarized instruction behind an if block predicated on the control
- /// dependence of the instruction.
- void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
-
- /// Vectorize Load and Store instructions,
- virtual void vectorizeMemoryInstruction(Instruction *Instr);
-
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -521,11 +586,6 @@ protected:
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Instruction *EntryVal);
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
-
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;
@@ -533,37 +593,19 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part and vector index \p Lane. If the value has
- /// been vectorized but not scalarized, the necessary extractelement
- /// instruction will be generated.
- Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
-
- /// Try to vectorize the interleaved access group that \p Instr belongs to.
- void vectorizeInterleaveGroup(Instruction *Instr);
+ /// If there is a cast involved in the induction variable \p ID, which should
+ /// be ignored in the vectorized loop body, this function records the
+ /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+ /// cast. We had already proved that the casted Phi is equal to the uncasted
+ /// Phi in the vectorized loop (under a runtime guard), and therefore
+ /// there is no need to vectorize the cast - the same value can be used in the
+ /// vector loop for both the Phi and the cast.
+ /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+ /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -574,12 +616,19 @@ protected:
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);
+ /// Returns a bitcasted value to the requested vector type.
+ /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+ Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL);
+
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
@@ -601,149 +650,32 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
- /// \brief Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
- /// This is a helper class for maintaining vectorization state. It's used for
- /// mapping values from the original loop to their corresponding values in
- /// the new loop. Two mappings are maintained: one for vectorized values and
- /// one for scalarized values. Vectorized values are represented with UF
- /// vector values in the new loop, and scalarized values are represented with
- /// UF x VF scalar values in the new loop. UF and VF are the unroll and
- /// vectorization factors, respectively.
- ///
- /// Entries can be added to either map with setVectorValue and setScalarValue,
- /// which assert that an entry was not already added before. If an entry is to
- /// replace an existing one, call resetVectorValue. This is currently needed
- /// to modify the mapped values during "fix-up" operations that occur once the
- /// first phase of widening is complete. These operations include type
- /// truncation and the second phase of recurrence widening.
- ///
- /// Entries from either map can be retrieved using the getVectorValue and
- /// getScalarValue functions, which assert that the desired value exists.
-
- struct ValueMap {
-
- /// Construct an empty map with the given unroll and vectorization factors.
- ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
-
- /// \return True if the map has any vector entry for \p Key.
- bool hasAnyVectorValue(Value *Key) const {
- return VectorMapStorage.count(Key);
- }
-
- /// \return True if the map has a vector entry for \p Key and \p Part.
- bool hasVectorValue(Value *Key, unsigned Part) const {
- assert(Part < UF && "Queried Vector Part is too large.");
- if (!hasAnyVectorValue(Key))
- return false;
- const VectorParts &Entry = VectorMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
- return Entry[Part] != nullptr;
- }
-
- /// \return True if the map has any scalar entry for \p Key.
- bool hasAnyScalarValue(Value *Key) const {
- return ScalarMapStorage.count(Key);
- }
-
- /// \return True if the map has a scalar entry for \p Key, \p Part and
- /// \p Part.
- bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
- assert(Part < UF && "Queried Scalar Part is too large.");
- assert(Lane < VF && "Queried Scalar Lane is too large.");
- if (!hasAnyScalarValue(Key))
- return false;
- const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
- assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
- return Entry[Part][Lane] != nullptr;
- }
-
- /// Retrieve the existing vector value that corresponds to \p Key and
- /// \p Part.
- Value *getVectorValue(Value *Key, unsigned Part) {
- assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
- return VectorMapStorage[Key][Part];
- }
-
- /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
- /// and \p Lane.
- Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
- assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
- return ScalarMapStorage[Key][Part][Lane];
- }
-
- /// Set a vector value associated with \p Key and \p Part. Assumes such a
- /// value is not already set. If it is, use resetVectorValue() instead.
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
- if (!VectorMapStorage.count(Key)) {
- VectorParts Entry(UF);
- VectorMapStorage[Key] = Entry;
- }
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Set a scalar value associated with \p Key for \p Part and \p Lane.
- /// Assumes such a value is not already set.
- void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
- Value *Scalar) {
- assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
- if (!ScalarMapStorage.count(Key)) {
- ScalarParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part].resize(VF, nullptr);
- // TODO: Consider storing uniform values only per-part, as they occupy
- // lane 0 only, keeping the other VF-1 redundant entries null.
- ScalarMapStorage[Key] = Entry;
- }
- ScalarMapStorage[Key][Part][Lane] = Scalar;
- }
-
- /// Reset the vector value associated with \p Key for the given \p Part.
- /// This function can be used to update values that have already been
- /// vectorized. This is the case for "fix-up" operations including type
- /// truncation and the second phase of recurrence vectorization.
- void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(hasVectorValue(Key, Part) && "Vector value not set for part");
- VectorMapStorage[Key][Part] = Vector;
- }
-
- private:
- /// The unroll factor. Each entry in the vector map contains UF vector
- /// values.
- unsigned UF;
-
- /// The vectorization factor. Each entry in the scalar map contains UF x VF
- /// scalar values.
- unsigned VF;
-
- /// The vector and scalar map storage. We use std::map and not DenseMap
- /// because insertions to DenseMap invalidate its iterators.
- std::map<Value *, VectorParts> VectorMapStorage;
- std::map<Value *, ScalarParts> ScalarMapStorage;
- };
-
/// The original loop.
Loop *OrigLoop;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// dynamic knowledge to simplify SCEV expressions and converts them to a
/// more usable form.
PredicatedScalarEvolution &PSE;
+
/// Loop Info.
LoopInfo *LI;
+
/// Dominator Tree.
DominatorTree *DT;
+
/// Alias Analysis.
AliasAnalysis *AA;
+
/// Target Library Info.
const TargetLibraryInfo *TLI;
+
/// Target Transform Info.
const TargetTransformInfo *TTI;
+
/// Assumption Cache.
AssumptionCache *AC;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
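The getOrCreateVectorValue / packScalarIntoVectorValue comments added earlier in this file describe packing scalarized lane values into a vector on demand with an insertelement sequence. A minimal, hypothetical sketch of that packing using the plain IRBuilder API (demo names, not the VectorizerValueMap implementation):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("pack-demo", Ctx);
  const unsigned VF = 4; // vectorization factor for this sketch
  auto *EltTy = Type::getInt32Ty(Ctx);
  std::vector<Type *> Params(VF, EltTy);
  auto *FnTy = FunctionType::get(VectorType::get(EltTy, VF), Params, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "pack", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // Treat the four arguments as the per-lane scalar values and pack them
  // into a <4 x i32> one lane at a time with insertelement.
  Value *Vec = UndefValue::get(VectorType::get(EltTy, VF));
  unsigned Lane = 0;
  for (Argument &A : F->args())
    Vec = B.CreateInsertElement(Vec, &A, B.getInt32(Lane++));
  B.CreateRet(Vec); // a loop-invariant value would be broadcast instead

  M.print(outs(), nullptr);
  return 0;
}
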
@@ -758,7 +690,6 @@ protected:
/// vector elements.
unsigned VF;
-protected:
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -770,39 +701,45 @@ protected:
/// The vector-loop preheader.
BasicBlock *LoopVectorPreHeader;
+
/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader;
+
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
+
/// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
+
/// The vector loop body.
BasicBlock *LoopVectorBody;
+
/// The scalar loop body.
BasicBlock *LoopScalarBody;
+
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
- PHINode *Induction;
+ PHINode *Induction = nullptr;
+
/// The induction variable of the old basic block.
- PHINode *OldInduction;
+ PHINode *OldInduction = nullptr;
/// Maps values from the original loop to their corresponding values in the
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
- ValueMap VectorLoopValueMap;
+ VectorizerValueMap VectorLoopValueMap;
+
+ /// Store instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
- /// Store instructions that should be predicated, as a pair
- /// <StoreInst, Predicate>
- SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
/// Trip count of the original loop.
- Value *TripCount;
+ Value *TripCount = nullptr;
+
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount;
+ Value *VectorTripCount = nullptr;
/// The legality analysis.
LoopVectorizationLegality *Legal;
@@ -811,7 +748,7 @@ protected:
LoopVectorizationCostModel *Cost;
// Record whether runtime checks are added.
- bool AddedSafetyChecks;
+ bool AddedSafetyChecks = false;
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
@@ -831,7 +768,6 @@ public:
UnrollFactor, LVL, CM) {}
private:
- void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
@@ -839,6 +775,8 @@ private:
Value *reverseVector(Value *Vec) override;
};
+} // end namespace llvm
+
/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@@ -861,7 +799,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
const DILocation *DIL = Inst->getDebugLoc();
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst))
B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
else
B.SetCurrentDebugLocation(DIL);
@@ -908,6 +847,8 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
}
}
+namespace llvm {
+
/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
@@ -937,7 +878,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
class InterleaveGroup {
public:
InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
- : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+ : Align(Align), InsertPos(Instr) {
assert(Align && "The alignment should be non-zero");
Factor = std::abs(Stride);
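As a concrete illustration of the interleaved-group concept documented above, here is a hypothetical source loop whose two loads form a factor-2 group (stride 2, member indices 0 and 1), letting the vectorizer emit one wide load plus shuffles:

#include <cstdio>

void sumPairs(const int *A, int *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = A[2 * i] + A[2 * i + 1]; // members 0 and 1 of one interleave group
}

int main() {
  int A[8] = {1, 2, 3, 4, 5, 6, 7, 8}, Out[4];
  sumPairs(A, Out, 4);
  for (int V : Out)
    std::printf("%d ", V); // prints: 3 7 11 15
  std::printf("\n");
  return 0;
}
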
@@ -1010,13 +951,26 @@ public:
Instruction *getInsertPos() const { return InsertPos; }
void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+ /// Add metadata (e.g. alias info) from the instructions in this group to \p
+ /// NewInst.
+ ///
+ /// FIXME: this function currently does not add noalias metadata a la
+ /// addNewMetadata. To do that we need to compute the intersection of the
+ /// noalias info from all members.
+ void addMetadata(Instruction *NewInst) const {
+ SmallVector<Value *, 4> VL;
+ std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
+ [](std::pair<int, Instruction *> p) { return p.second; });
+ propagateMetadata(NewInst, VL);
+ }
+
private:
unsigned Factor; // Interleave Factor.
bool Reverse;
unsigned Align;
DenseMap<int, Instruction *> Members;
- int SmallestKey;
- int LargestKey;
+ int SmallestKey = 0;
+ int LargestKey = 0;
// To avoid breaking dependences, vectorized instructions of an interleave
// group should be inserted at either the first load or the last store in
@@ -1031,6 +985,9 @@ private:
// store i32 %odd // Insert Position
Instruction *InsertPos;
};
+} // end namespace llvm
+
+namespace {
/// \brief Drive the analysis of interleaved memory accesses in the loop.
///
@@ -1044,8 +1001,7 @@ class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
DominatorTree *DT, LoopInfo *LI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
- RequiresScalarEpilogue(false) {}
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
@@ -1065,14 +1021,6 @@ public:
return InterleaveGroupMap.count(Instr);
}
- /// \brief Return the maximum interleave factor of all interleaved groups.
- unsigned getMaxInterleaveFactor() const {
- unsigned MaxFactor = 1;
- for (auto &Entry : InterleaveGroupMap)
- MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
- return MaxFactor;
- }
-
/// \brief Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
@@ -1095,15 +1043,16 @@ private:
/// The interleaved access analysis can also add new predicates (for example
/// by versioning strides of pointers).
PredicatedScalarEvolution &PSE;
+
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
- const LoopAccessInfo *LAI;
+ const LoopAccessInfo *LAI = nullptr;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// out-of-bounds by executing at least one scalar epilogue iteration.
- bool RequiresScalarEpilogue;
+ bool RequiresScalarEpilogue = false;
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
@@ -1114,21 +1063,26 @@ private:
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
+ StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
unsigned Align)
: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
- StrideDescriptor() = default;
-
// The access's stride. It is negative for a reverse access.
int64_t Stride = 0;
- const SCEV *Scev = nullptr; // The scalar expression of this access
- uint64_t Size = 0; // The size of the memory object.
- unsigned Align = 0; // The alignment of this access.
+
+ // The scalar expression of this access.
+ const SCEV *Scev = nullptr;
+
+ // The size of the memory object.
+ uint64_t Size = 0;
+
+ // The alignment of this access.
+ unsigned Align = 0;
};
/// \brief A type for holding instructions and their stride descriptors.
- typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+ using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
@@ -1179,7 +1133,6 @@ private:
/// not necessary or is prevented because \p A and \p B may be dependent.
bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
StrideEntry *B) const {
-
// Code motion for interleaved accesses can potentially hoist strided loads
// and sink strided stores. The code below checks the legality of the
// following two conditions:
@@ -1244,7 +1197,7 @@ private:
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
- enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
+ enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
/// Hint - associates name and validation with the hint value.
struct Hint {
@@ -1263,6 +1216,8 @@ class LoopVectorizeHints {
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
case HK_FORCE:
return (Val <= 1);
+ case HK_ISVECTORIZED:
+ return (Val == 0 || Val == 1);
}
return false;
}
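The hint values validated above usually originate from Clang's loop pragmas, which lower to the corresponding llvm.loop.* metadata that LoopVectorizeHints reads. A hypothetical source-level example (demo only):

// Requests a vector width of 4 and an interleave count of 2 for this loop;
// Clang emits llvm.loop.vectorize.width and llvm.loop.interleave.count
// metadata on the loop's latch branch.
void scale(float *A, float B, int N) {
#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (int i = 0; i < N; ++i)
    A[i] *= B;
}
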
@@ -1270,16 +1225,21 @@ class LoopVectorizeHints {
/// Vectorization width.
Hint Width;
+
/// Vectorization interleave factor.
Hint Interleave;
+
/// Vectorization forced
Hint Force;
+ /// Already Vectorized
+ Hint IsVectorized;
+
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
/// True if there is any unsafe math in the loop.
- bool PotentiallyUnsafe;
+ bool PotentiallyUnsafe = false;
public:
enum ForceKind {
@@ -1294,7 +1254,7 @@ public:
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -1302,14 +1262,19 @@ public:
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
/// Mark the loop L as already vectorized by setting the width to 1.
void setAlreadyVectorized() {
- Width.Value = Interleave.Value = 1;
- Hint Hints[] = {Width, Interleave};
+ IsVectorized.Value = 1;
+ Hint Hints[] = {IsVectorized};
writeHintsToMetadata(Hints);
}
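With this change setAlreadyVectorized() records a dedicated marker instead of rewriting the width and interleave hints to 1. A minimal, hypothetical sketch of the metadata node it writes, assuming the hint name resolves to Prefix() + "isvectorized":

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Equivalent of the single IsVectorized hint written back into llvm.loop.
  Metadata *Ops[] = {
      MDString::get(Ctx, "llvm.loop.isvectorized"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  MDNode *Hint = MDNode::get(Ctx, Ops);
  Hint->print(outs()); // roughly: !{!"llvm.loop.isvectorized", i32 1}
  outs() << "\n";
  return 0;
}
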
@@ -1326,19 +1291,19 @@ public:
return false;
}
- if (getWidth() == 1 && getInterleave() == 1) {
- // FIXME: Add a separate metadata to indicate when the loop has already
- // been vectorized instead of setting width and count to 1.
+ if (getIsVectorized() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
// FIXME: Add interleave.disable metadata. This will allow
// vectorize.disable to be used without disabling the pass and errors
// to differentiate between disabled vectorization and a width of 1.
- ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
"AllDisabled", L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or vectorize width and interleave "
- "count are both set to 1");
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
return false;
}
@@ -1348,29 +1313,35 @@ public:
/// Dumps all the hint information.
void emitRemarkWithHints() const {
using namespace ore;
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
TheLoop->getStartLoc(),
TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled");
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(), TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
- if (Interleave.Value != 0)
- R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
- R << ")";
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count="
+ << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
}
- ORE.emit(R);
- }
+ });
}
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
+ unsigned getIsVectorized() const { return IsVectorized.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
/// \brief If hints are provided that force vectorization, use the AlwaysPrint
@@ -1454,7 +1425,7 @@ private:
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -1489,7 +1460,7 @@ private:
/// Sets current hints into loop metadata, keeping other values intact.
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.size() == 0)
+ if (HintTypes.empty())
return;
// Reserve the first element to LoopID (see below).
@@ -1525,6 +1496,8 @@ private:
OptimizationRemarkEmitter &ORE;
};
+} // end anonymous namespace
+
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
@@ -1546,6 +1519,8 @@ static void emitMissedWarning(Function *F, Loop *L,
}
}
+namespace {
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
@@ -1568,22 +1543,20 @@ public:
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
LoopVectorizeHints *H)
- : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
- GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
- PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
- Requirements(R), Hints(H) {}
+ : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
+ ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
- typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
+ using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
/// InductionList saves induction variables and maps them to the
/// induction descriptor.
- typedef MapVector<PHINode *, InductionDescriptor> InductionList;
+ using InductionList = MapVector<PHINode *, InductionDescriptor>;
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
- typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
+ using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
@@ -1608,7 +1581,17 @@ public:
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
- /// Returns True if V is an induction variable in this loop.
+ /// Returns True if V is a Phi node of an induction variable in this loop.
+ bool isInductionPhi(const Value *V);
+
+ /// Returns True if V is a cast that is part of an induction def-use chain,
+ /// and had been proven to be redundant under a runtime guard (in other
+ /// words, the cast has the same SCEV expression as the induction phi).
+ bool isCastedInductionVariable(const Value *V);
+
+ /// Returns True if V can be considered as an induction variable in this
+ /// loop. V can be the induction phi, or some redundant cast in the def-use
+ /// chain of the induction phi.
bool isInductionVariable(const Value *V);
/// Returns True if PN is a reduction variable in this loop.
@@ -1629,6 +1612,8 @@ public:
/// 0 - Stride is unknown or non-consecutive.
/// 1 - Address is consecutive.
/// -1 - Address is consecutive, and decreasing.
+ /// NOTE: This method must only be used before modifying the original scalar
+ /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
int isConsecutivePtr(Value *Ptr);
/// Returns true if the value V is uniform within the loop.
@@ -1646,11 +1631,6 @@ public:
return InterleaveInfo.isInterleaved(Instr);
}
- /// \brief Return the maximum interleave factor of all interleaved groups.
- unsigned getMaxInterleaveFactor() const {
- return InterleaveInfo.getMaxInterleaveFactor();
- }
-
/// \brief Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
@@ -1664,6 +1644,10 @@ public:
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
+ uint64_t getMaxSafeRegisterWidth() const {
+ return LAI->getDepChecker().getMaxSafeRegisterWidth();
+ }
+
bool hasStride(Value *V) { return LAI->hasStride(V); }
/// Returns true if the target machine supports masked store operation
@@ -1671,21 +1655,25 @@ public:
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
}
+
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
+
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
bool isLegalMaskedScatter(Type *DataType) {
return TTI->isLegalMaskedScatter(DataType);
}
+
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
+
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
bool isLegalGatherOrScatter(Value *V) {
@@ -1701,6 +1689,7 @@ public:
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
+
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
unsigned getNumPredStores() const { return NumPredStores; }
@@ -1766,27 +1755,34 @@ private:
return LAI ? &LAI->getSymbolicStrides() : nullptr;
}
- unsigned NumPredStores;
+ unsigned NumPredStores = 0;
/// The loop that we evaluate.
Loop *TheLoop;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
/// Applies dynamic knowledge to simplify SCEV expressions in the context
/// of existing SCEV assumptions. The analysis will also add a minimal set
/// of new predicates if this is required to enable vectorization and
/// unrolling.
PredicatedScalarEvolution &PSE;
+
/// Target Library Info.
TargetLibraryInfo *TLI;
+
/// Target Transform Info
const TargetTransformInfo *TTI;
+
/// Dominator Tree.
DominatorTree *DT;
+
// LoopAccess analysis.
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+
// And the loop-accesses info corresponding to this loop. This pointer is
// null until canVectorizeMemory sets it up.
- const LoopAccessInfo *LAI;
+ const LoopAccessInfo *LAI = nullptr;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
@@ -1798,27 +1794,38 @@ private:
/// Holds the primary induction variable. This is the counter of the
/// loop.
- PHINode *PrimaryInduction;
+ PHINode *PrimaryInduction = nullptr;
+
/// Holds the reduction variables.
ReductionList Reductions;
+
/// Holds all of the induction variables that we found in the loop.
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
+
+ /// Holds all the casts that participate in the update chain of the induction
+ /// variables, and that have been proven to be redundant (possibly under a
+ /// runtime guard). These casts can be ignored when creating the vectorized
+ /// loop body.
+ SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
+
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
+
/// Holds instructions that need to sink past other instructions to handle
/// first-order recurrences.
DenseMap<Instruction *, Instruction *> SinkAfter;
+
/// Holds the widest induction type encountered.
- Type *WidestIndTy;
+ Type *WidestIndTy = nullptr;
/// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
/// Can we assume the absence of NaNs.
- bool HasFunNoNaNAttr;
+ bool HasFunNoNaNAttr = false;
/// Vectorization requirements that will go through late-evaluation.
LoopVectorizationRequirements *Requirements;
@@ -1856,9 +1863,13 @@ public:
/// Information about vectorization costs
struct VectorizationFactor {
- unsigned Width; // Vector width with best cost
- unsigned Cost; // Cost of the loop with that width
+ // Vector width with best cost
+ unsigned Width;
+
+ // Cost of the loop with that width
+ unsigned Cost;
};
+
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -1897,8 +1908,10 @@ public:
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
unsigned LoopInvariantRegs;
+
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
+
/// Holds the number of instructions in the loop.
unsigned NumInstructions;
};
@@ -1920,6 +1933,7 @@ public:
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
@@ -1954,7 +1968,8 @@ public:
/// Decision that was taken during cost calculation for memory instruction.
enum InstWidening {
CM_Unknown,
- CM_Widen,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
CM_Scalarize
@@ -2010,7 +2025,6 @@ public:
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
-
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
@@ -2030,13 +2044,29 @@ public:
return false;
// If the truncated value is not an induction variable, return false.
- return Legal->isInductionVariable(Op);
+ return Legal->isInductionPhi(Op);
+ }
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
}
private:
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
- unsigned computeFeasibleMaxVF(bool OptForSize);
+ unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -2045,7 +2075,7 @@ private:
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
- typedef std::pair<unsigned, bool> VectorizationCostTy;
+ using VectorizationCostTy = std::pair<unsigned, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
@@ -2102,7 +2132,7 @@ private:
/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
- typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+ using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
/// A set containing all BasicBlocks that are known to present after
/// vectorization as a predicated block.
@@ -2134,10 +2164,6 @@ private:
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
unsigned VF);
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
- void collectInstsToScalarize(unsigned VF);
-
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
@@ -2156,72 +2182,137 @@ private:
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);
- /// Collect Uniform and Scalar values for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
- void collectUniformsAndScalars(unsigned VF) {
- // Do the analysis once.
- if (VF == 1 || Uniforms.count(VF))
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- collectLoopScalars(VF);
- }
-
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
- typedef DenseMap<std::pair<Instruction *, unsigned>,
- std::pair<InstWidening, unsigned>>
- DecisionList;
+ using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>;
DecisionList WideningDecisions;
public:
/// The loop that we evaluate.
Loop *TheLoop;
+
/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;
+
/// Loop Info analysis.
LoopInfo *LI;
+
/// Vectorization legality.
LoopVectorizationLegality *Legal;
+
/// Vector target information.
const TargetTransformInfo &TTI;
+
/// Target Library Info.
const TargetLibraryInfo *TLI;
+
/// Demanded bits analysis.
DemandedBits *DB;
+
/// Assumption cache.
AssumptionCache *AC;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
const Function *TheFunction;
+
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+} // end anonymous namespace
+
+namespace llvm {
+
/// LoopVectorizationPlanner - drives the vectorization process after having
/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and interleave access groups.
class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ using VPlanPtr = std::unique_ptr<VPlan>;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
+ return ILV.getOrCreateVectorValue(V, Part);
+ }
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
public:
- LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM)
- : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
-
- ~LoopVectorizationPlanner() {}
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
/// Plan how to best vectorize, return the best VF and its cost.
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
- /// Generate the IR code for the vectorized loop.
- void executePlan(InnerLoopVectorizer &ILV);
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
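  // Illustrative usage, not part of this patch: the intended driver sequence,
  // assuming the caller has already built the legality and cost analyses; LVL,
  // LB and IC below stand for the caller's LoopVectorizationLegality object,
  // its InnerLoopVectorizer and the chosen interleave count.
  //
  //   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
  //   auto VF = LVP.plan(OptForSize, UserVF); // best factor and its cost
  //   LVP.setBestPlan(VF.Width, IC);          // keep one VPlan, drop the rest
  //   LVP.executePlan(LB, DT);                // emit IR for the chosen VPlan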
protected:
/// Collect the instructions from the original loop that would be trivially
@@ -2229,20 +2320,102 @@ protected:
void collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions);
-private:
- /// The loop that we evaluate.
- Loop *OrigLoop;
+ /// A range of powers-of-2 vectorization factors with fixed start and
+  /// adjustable end. The range includes start and excludes end, e.g.:
+ /// [1, 9) = {1, 2, 4, 8}
+ struct VFRange {
+ // A power of 2.
+ const unsigned Start;
- /// Loop Info analysis.
- LoopInfo *LI;
+    // Need not be a power of 2. If End <= Start, the range is empty.
+ unsigned End;
+ };
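  // Illustrative sketch, not part of this patch: the factors covered by a
  // VFRange are enumerated by doubling, assuming Start is a power of 2 as
  // documented above:
  //
  //   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
  //     visit(VF); // e.g. {Start = 1, End = 9} visits 1, 2, 4 and 8.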
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
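  // Illustrative sketch, not part of this patch: one way the clamping above can
  // behave: the decision taken for Range.Start is returned, and Range.End is
  // lowered to the first larger VF whose decision differs, so the returned
  // value holds for every VF left in the range:
  //
  //   bool Decision = Predicate(Range.Start);
  //   for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
  //     if (Predicate(VF) != Decision) {
  //       Range.End = VF;
  //       break;
  //     }
  //   return Decision;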
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+  /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
+  /// \return true in the first returned value if so and false otherwise.
+  /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an IG
+  /// for \p Range.Start, and provide it as the second returned value.
+  /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+ /// \return value is <true, nullptr>, as it is handled by another recipe.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+  // Check if \p I is a memory instruction to be widened for \p Range.Start and
+ // potentially masked. Such instructions are handled by a recipe that takes an
+ // additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
+ VFRange &Range,
+ VPlanPtr &Plan);
+
+  /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+ /// false otherwise. Range.End may be decreased to ensure same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range,
+ const SmallPtrSetImpl<Value *> &NeedDef);
};
+} // end namespace llvm
+
+namespace {
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
@@ -2257,8 +2430,7 @@ private:
/// followed by a non-expert user.
class LoopVectorizationRequirements {
public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
- : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
+ LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
void addUnsafeAlgebraInst(Instruction *I) {
// First unsafe algebra instruction.
@@ -2272,12 +2444,14 @@ public:
const char *PassName = Hints.vectorizeAnalysisPassName();
bool Failed = false;
if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit(
- OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
- UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations");
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps",
+ UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
Failed = true;
}
@@ -2288,11 +2462,13 @@ public:
NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
if ((ThresholdReached && !Hints.allowReordering()) ||
PragmaThresholdReached) {
- ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations");
+ "memory operations";
+ });
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
@@ -2301,13 +2477,15 @@ public:
}
private:
- unsigned NumRuntimePointerChecks;
- Instruction *UnsafeAlgebraInst;
+ unsigned NumRuntimePointerChecks = 0;
+ Instruction *UnsafeAlgebraInst = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
};
+} // end anonymous namespace
+
static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty()) {
if (!hasCyclesInLoopBody(L))
@@ -2318,11 +2496,15 @@ static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
addAcyclicInnerLoop(*InnerL, V);
}
+namespace {
+
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;
+ LoopVectorizePass Impl;
+
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
: FunctionPass(ID) {
Impl.DisableUnrolling = NoUnrolling;
@@ -2330,8 +2512,6 @@ struct LoopVectorize : public FunctionPass {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
- LoopVectorizePass Impl;
-
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
@@ -2450,6 +2630,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ recordVectorLoopValueForInductionCast(II, LastInduction, Part);
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
LastInduction = cast<Instruction>(addFastMathFlag(
@@ -2480,11 +2661,26 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
auto *I = cast<Instruction>(U);
return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
- return any_of(IV->users(), isScalarInst);
+ return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
+ unsigned Lane) {
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (Casts.empty())
+ return;
+ // Only the first Cast instruction in the Casts vector is of interest.
+  // The rest of the Casts (if they exist) have no uses outside the
+ // induction update chain itself.
+ Instruction *CastInst = *Casts.begin();
+ if (Lane < UINT_MAX)
+ VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ else
+ VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+}
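// Illustrative note, not part of this patch: the Lane parameter presumably
// defaults to UINT_MAX (its declaration lies outside this hunk), which selects
// the vector form of the recording; the two call patterns appearing in this
// patch are:
//
//   recordVectorLoopValueForInductionCast(ID, EntryPart, Part);   // vector
//   recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);   // scalar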
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -2564,6 +2760,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
}
@@ -2622,7 +2819,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
@@ -2638,7 +2835,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Value *EntryVal,
const InductionDescriptor &ID) {
-
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
@@ -2663,21 +2859,21 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
-
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
+ : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
+ recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
}
}
}
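// Illustrative worked example, not part of this patch: with VF = 4, UF = 2, a
// non-uniform EntryVal, scalar induction value s and step st, the nest above
// materializes the eight scalars s + (4 * Part + Lane) * st, i.e. s + 0*st
// through s + 7*st; Part selects the unroll copy (indices 0-3 vs. 4-7), Lane
// the element within it, and each value is also recorded for the induction
// cast, if one exists.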
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
-
const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
ValueToValueMap();
@@ -2708,8 +2904,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
if (VectorLoopValueMap.hasAnyScalarValue(V)) {
-
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
@@ -2726,8 +2921,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
- auto *LastInst =
- cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@@ -2744,14 +2939,15 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
Value *VectorValue = nullptr;
if (Cost->isUniformAfterVectorization(I, VF)) {
VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
- VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ // Initialize packing with insertelements to start from undef.
+ Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Undef);
for (unsigned Lane = 0; Lane < VF; ++Lane)
- VectorValue = Builder.CreateInsertElement(
- VectorValue, getOrCreateScalarValue(V, Part, Lane),
- Builder.getInt32(Lane));
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
}
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
Builder.restoreIP(OldIP);
return VectorValue;
}
@@ -2763,28 +2959,29 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
return B;
}
-Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
- unsigned Lane) {
-
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
+ assert(Instance.Lane > 0
+ ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
- return VectorLoopValueMap.getScalarValue(V, Part, Lane);
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Part);
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
assert(VF == 1 && "Value not scalarized has non-vector type");
return U;
@@ -2793,7 +2990,20 @@ Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
// Otherwise, the value from the original loop has been vectorized and is
// represented by UF vector values. Extract and return the requested scalar
// value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
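// Illustrative worked example, not part of this patch: for VF = 4, the packing
// driven from getOrCreateVectorValue() above builds, per unroll part, a chain
//   undef -> insertelement lane 0 -> ... -> insertelement lane 3,
// and resetVectorValue() keeps the map pointed at the newest partial vector so
// each later lane extends the previous insertelement instead of restarting
// from undef.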
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
@@ -2843,6 +3053,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Instr != Group->getInsertPos())
return;
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
@@ -2866,7 +3077,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Index += (VF - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
+ Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2890,13 +3101,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
-
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
auto *NewLoad = Builder.CreateAlignedLoad(
NewPtrs[Part], Group->getAlignment(), "wide.vec");
- addMetadata(NewLoad, Instr);
+ Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
@@ -2917,7 +3127,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
if (Group->isReverse())
@@ -2946,9 +3156,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
- // If this member has different type, cast it to an unified type.
+ // If this member has different type, cast it to a unified type.
+
if (StoredVec->getType() != SubVT)
- StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
+ StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
StoredVecs.push_back(StoredVec);
}
@@ -2963,11 +3174,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Instruction *NewStoreInstr =
Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
- addMetadata(NewStoreInstr, Instr);
+
+ Group->addMetadata(NewStoreInstr);
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2992,14 +3205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = getMemInstAddressSpace(Instr);
- // Scalarize the memory instruction if necessary.
- if (Decision == LoopVectorizationCostModel::CM_Scalarize)
- return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
-
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = ConsecutiveStride < 0;
+ bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+ bool ConsecutiveStride =
+ Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
bool CreateGatherScatter =
(Decision == LoopVectorizationCostModel::CM_GatherScatter);
@@ -3010,9 +3220,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// Handle consecutive loads/stores.
if (ConsecutiveStride)
- Ptr = getOrCreateScalarValue(Ptr, 0, 0);
+ Ptr = getOrCreateScalarValue(Ptr, {0, 0});
+
+ VectorParts Mask;
+ bool isMaskRequired = BlockInMask;
+ if (isMaskRequired)
+ Mask = *BlockInMask;
- VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
@@ -3023,7 +3237,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Instruction *NewSI = nullptr;
Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
if (CreateGatherScatter) {
- Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
@@ -3045,13 +3259,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr =
Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- Mask[Part] = reverseVector(Mask[Part]);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr =
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- if (Legal->isMaskRequired(SI))
+ if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
else
@@ -3068,7 +3283,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
- Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
@@ -3083,12 +3298,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// wide load needs to start at the last vector element.
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- Mask[Part] = reverseVector(Mask[Part]);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr =
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- if (Legal->isMaskRequired(LI))
+ if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
"wide.masked.load");
@@ -3105,71 +3321,41 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ const VPIteration &Instance,
bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- DEBUG(dbgs() << "LV: Scalarizing"
- << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
- << '\n');
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
-
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- // For each scalar that we create:
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
- // Start if-block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- Cmp = Cond[Part];
- if (Cmp->getType()->isVectorTy())
- Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
- ConstantInt::get(Cmp->getType(), 1));
- }
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
- Cloned->setOperand(op, NewOp);
- }
- addNewMetadata(Cloned, Instr);
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
- // Add the cloned scalar to the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
+ // Add the cloned scalar to the scalar map entry.
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ // If we just cloned a new assumption, add it the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
- }
- }
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
}
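// Illustrative sketch, not part of this patch: the per-part, per-lane loop that
// used to live inside scalarizeInstruction() is now expected to be driven from
// outside, e.g. by a replication recipe; Lanes is 1 for uniform instructions
// and VF otherwise, as in the removed loop above, and IsPredicated is a
// placeholder name:
//
//   for (unsigned Part = 0; Part < UF; ++Part)
//     for (unsigned Lane = 0; Lane < Lanes; ++Lane)
//       scalarizeInstruction(Instr, {Part, Lane}, IsPredicated);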
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -3281,6 +3467,36 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
return VectorTripCount;
}
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL) {
+ // Verify that V is a vector type with same number of elements as DstVTy.
+ unsigned VF = DstVTy->getNumElements();
+ VectorType *SrcVecTy = cast<VectorType>(V->getType());
+ assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ Type *SrcElemTy = SrcVecTy->getElementType();
+ Type *DstElemTy = DstVTy->getElementType();
+ assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+ "Vector elements must have same size");
+
+ // Do a direct cast if element types are castable.
+ if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+ return Builder.CreateBitOrPointerCast(V, DstVTy);
+ }
+  // V cannot be directly cast to the desired vector type.
+  // This may happen when V is a floating point vector but DstVTy is a vector
+  // of pointers, or vice-versa. Handle it with a two-step cast through an
+  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+ "Only one type should be a pointer type");
+ assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+ "Only one type should be a floating point type");
+ Type *IntTy =
+ IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+ VectorType *VecIntTy = VectorType::get(IntTy, VF);
+ Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+ return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
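// Illustrative example, not part of this patch: on a target with 64-bit
// pointers, casting V : <4 x double> to DstVTy = <4 x i8*> cannot be a single
// bitcast, so the helper above first bitcasts V to <4 x i64> and then lets
// CreateBitOrPointerCast turn that integer vector into the pointer vector via
// inttoptr; the opposite direction goes pointer -> <4 x i64> -> <4 x double>.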
+
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
@@ -3373,7 +3589,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3435,7 +3651,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
// Create and register the new vector loop.
- Loop *Lp = new Loop();
+ Loop *Lp = LI->AllocateLoop();
Loop *ParentLoop = OrigLoop->getParentLoop();
// Insert the new loop into the loop nest and register the new basic blocks
@@ -3554,6 +3770,8 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();
+
+ return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
@@ -3622,22 +3840,27 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
namespace {
+
struct CSEDenseMapInfo {
static bool canHandle(const Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
+
static inline Instruction *getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
+
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
+
static unsigned getHashValue(const Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
+
static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
@@ -3645,7 +3868,8 @@ struct CSEDenseMapInfo {
return LHS->isIdenticalTo(RHS);
}
};
-}
+
+} // end anonymous namespace
///\brief Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
@@ -3777,7 +4001,6 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
- //
SmallPtrSet<Value *, 4> Erased;
for (const auto &KV : Cost->getMinimalBitwidths()) {
// If the value wasn't vectorized, we must maintain the original scalar
@@ -3927,7 +4150,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
IVEndValues[Entry.first], LoopMiddleBlock);
fixLCSSAPHIs();
- predicateInstructions();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
// Remove redundant induction instructions.
cse(LoopVectorBody);
@@ -3953,7 +4177,6 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() {
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
-
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4211,7 +4434,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ Builder.SetInsertPoint(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
@@ -4317,7 +4541,6 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
-
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
auto *VectorLoop = LI->getLoopFor(PredBB);
@@ -4346,7 +4569,6 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// through the worklist doesn't sink a single instruction.
bool Changed;
do {
-
// Add the instructions that need to be reanalyzed to the worklist, and
// reset the changed indicator.
Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
@@ -4365,7 +4587,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// It's legal to sink the instruction if all its uses occur in the
// predicated block. Otherwise, there's nothing to do yet, and we may
// need to reanalyze the instruction.
- if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+ if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
InstsToReanalyze.push_back(I);
continue;
}
@@ -4382,200 +4604,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
-void InnerLoopVectorizer::predicateInstructions() {
-
- // For each instruction I marked for predication on value C, split I into its
- // own basic block to form an if-then construct over C. Since I may be fed by
- // an extractelement instruction or other scalar operand, we try to
- // iteratively sink its scalar operands into the predicated block. If I feeds
- // an insertelement instruction, we try to move this instruction into the
- // predicated block as well. For non-void types, a phi node will be created
- // for the resulting value (either vector or scalar).
- //
- // So for some predicated instruction, e.g. the conditional sdiv in:
- //
- // for.body:
- // ...
- // %add = add nsw i32 %mul, %0
- // %cmp5 = icmp sgt i32 %2, 7
- // br i1 %cmp5, label %if.then, label %if.end
- //
- // if.then:
- // %div = sdiv i32 %0, %1
- // br label %if.end
- //
- // if.end:
- // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
- //
- // the sdiv at this point is scalarized and if-converted using a select.
- // The inactive elements in the vector are not used, but the predicated
- // instruction is still executed for all vector elements, essentially:
- //
- // vector.body:
- // ...
- // %17 = add nsw <2 x i32> %16, %wide.load
- // %29 = extractelement <2 x i32> %wide.load, i32 0
- // %30 = extractelement <2 x i32> %wide.load51, i32 0
- // %31 = sdiv i32 %29, %30
- // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
- // %35 = extractelement <2 x i32> %wide.load, i32 1
- // %36 = extractelement <2 x i32> %wide.load51, i32 1
- // %37 = sdiv i32 %35, %36
- // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
- // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
- //
- // Predication will now re-introduce the original control flow to avoid false
- // side-effects by the sdiv instructions on the inactive elements, yielding
- // (after cleanup):
- //
- // vector.body:
- // ...
- // %5 = add nsw <2 x i32> %4, %wide.load
- // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
- // %9 = extractelement <2 x i1> %8, i32 0
- // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
- //
- // pred.sdiv.if:
- // %10 = extractelement <2 x i32> %wide.load, i32 0
- // %11 = extractelement <2 x i32> %wide.load51, i32 0
- // %12 = sdiv i32 %10, %11
- // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
- // br label %pred.sdiv.continue
- //
- // pred.sdiv.continue:
- // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
- // %15 = extractelement <2 x i1> %8, i32 1
- // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
- //
- // pred.sdiv.if54:
- // %16 = extractelement <2 x i32> %wide.load, i32 1
- // %17 = extractelement <2 x i32> %wide.load51, i32 1
- // %18 = sdiv i32 %16, %17
- // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
- // br label %pred.sdiv.continue55
- //
- // pred.sdiv.continue55:
- // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
- // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
-
- for (auto KV : PredicatedInstructions) {
- BasicBlock::iterator I(KV.first);
- BasicBlock *Head = I->getParent();
- auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
- /*BranchWeights=*/nullptr, DT, LI);
- I->moveBefore(T);
- sinkScalarOperands(&*I);
-
- BasicBlock *PredicatedBlock = I->getParent();
- Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
- PredicatedBlock->setName(BBNamePrefix + ".if");
- PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
-
- // If the instruction is non-void create a Phi node at reconvergence point.
- if (!I->getType()->isVoidTy()) {
- Value *IncomingTrue = nullptr;
- Value *IncomingFalse = nullptr;
-
- if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
- // If the predicated instruction is feeding an insert-element, move it
- // into the Then block; Phi node will be created for the vector.
- InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
- IEI->moveBefore(T);
- IncomingTrue = IEI; // the new vector with the inserted element.
- IncomingFalse = IEI->getOperand(0); // the unmodified vector
- } else {
- // Phi node will be created for the scalar predicated instruction.
- IncomingTrue = &*I;
- IncomingFalse = UndefValue::get(I->getType());
- }
-
- BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
- assert(PostDom && "Then block has multiple successors");
- PHINode *Phi =
- PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
- IncomingTrue->replaceAllUsesWith(Phi);
- Phi->addIncoming(IncomingFalse, Head);
- Phi->addIncoming(IncomingTrue, I->getParent());
- }
- }
-
- DEBUG(DT->verifyDomTree());
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- VectorParts SrcMask = createBlockInMask(Src);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
-
- if (BI->isConditional()) {
-
- VectorParts EdgeMask(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
- if (BI->getSuccessor(0) != Dst)
- EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
-
- EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
- EdgeMask[Part] = EdgeMaskPart;
- }
-
- EdgeMaskCache[Edge] = EdgeMask;
- return EdgeMask;
- }
-
- EdgeMaskCache[Edge] = SrcMask;
- return SrcMask;
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
-
- VectorParts BlockMask(UF);
-
- // Loop incoming mask is all-one.
- if (OrigLoop->getHeader() == BB) {
- Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = getOrCreateVectorValue(C, Part);
- BlockMaskCache[BB] = BlockMask;
- return BlockMask;
- }
-
- // This is the block mask. We OR all incoming edges, and with zero.
- Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
-
- // For each pred:
- for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
- VectorParts EM = createEdgeMask(*It, BB);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
- }
-
- BlockMaskCache[BB] = BlockMask;
- return BlockMask;
-}
-
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {
+ assert(PN->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled elsewhere");
+
PHINode *P = cast<PHINode>(PN);
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
@@ -4594,43 +4627,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
}
setDebugLocFromInst(Builder, P);
- // Check for PHI nodes that are lowered to vector selects.
- if (P->getParent() != OrigLoop->getHeader()) {
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = P->getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
- VectorParts Entry(UF);
- for (unsigned In = 0; In < NumIncoming; In++) {
- VectorParts Cond =
- createEdgeMask(P->getIncomingBlock(In), P->getParent());
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- if (In == 0)
- Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
- else
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
- "predphi");
- }
- }
- for (unsigned Part = 0; Part < UF; ++Part)
- VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
- return;
- }
// This PHINode must be an induction variable.
// Make sure that we know about it.
@@ -4646,7 +4642,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:
- return widenIntOrFpInduction(P);
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4665,7 +4661,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
}
return;
@@ -4691,25 +4687,11 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- return;
- }
-
+void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- break;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF);
- break;
- } // End of PHI.
+ case Instruction::PHI:
+ llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::GetElementPtr: {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
@@ -4746,7 +4728,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// values in the vector mapping with initVector, as we do for other
// instructions.
for (unsigned Part = 0; Part < UF; ++Part) {
-
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
auto *Ptr =
@@ -4782,13 +4763,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- break;
- }
- LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -4836,7 +4810,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
@@ -4862,8 +4836,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
Value *C = nullptr;
if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
@@ -4874,10 +4850,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
break;
}
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
- break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -4893,16 +4865,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
auto *CI = dyn_cast<CastInst>(&I);
setDebugLocFromInst(Builder, CI);
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- if (Cost->isOptimizableIVTruncate(CI, VF)) {
- widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
- cast<TruncInst>(CI));
- break;
- }
-
/// Vectorize casts.
Type *DestTy =
(VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
@@ -4933,11 +4895,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
+
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
@@ -4945,10 +4903,8 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
- }
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
@@ -4998,9 +4954,9 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
}
default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
+ // This instruction is not vectorized by simple widening.
+ DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
@@ -5012,14 +4968,11 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
- LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock,
LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-
DEBUG(DT->verifyDomTree());
}
@@ -5094,12 +5047,15 @@ bool LoopVectorizationLegality::canVectorize() {
// Store the result and return it at the end instead of exiting early, in case
// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5112,7 +5068,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->empty()) {
ORE->emit(createMissedAnalysis("NotInnermostLoop")
<< "loop is not the innermost loop");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5122,7 +5078,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (TheLoop->getNumBackEdges() != 1) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5132,7 +5088,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getExitingBlock()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5144,7 +5100,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5158,7 +5114,7 @@ bool LoopVectorizationLegality::canVectorize() {
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5167,7 +5123,7 @@ bool LoopVectorizationLegality::canVectorize() {
// Check if we can vectorize the instructions and CFG in this loop.
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5176,7 +5132,7 @@ bool LoopVectorizationLegality::canVectorize() {
// Go over each instruction and look at memory deps.
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5207,7 +5163,7 @@ bool LoopVectorizationLegality::canVectorize() {
<< "Too many SCEV assumptions need to be made and checked "
<< "at runtime");
DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5263,6 +5219,15 @@ void LoopVectorizationLegality::addInductionPhi(
PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit) {
Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+ // here for ignoring, but it suffices to record only the first (as it is the
+ // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
Type *PhiTy = Phi->getType();
const DataLayout &DL = Phi->getModule()->getDataLayout();
@@ -5300,7 +5265,6 @@ void LoopVectorizationLegality::addInductionPhi(
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
- return;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
@@ -5439,7 +5403,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// operations, shuffles, or casts, as they don't change precision or
// semantics.
} else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.hasUnsafeAlgebra()) {
+ !I.isFast()) {
DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
Hints->setPotentiallyUnsafe();
}
@@ -5451,7 +5415,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
<< "value cannot be used outside the loop");
return false;
}
-
} // next instr.
}
@@ -5474,7 +5437,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
-
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
@@ -5517,7 +5479,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
-
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5532,7 +5493,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// If the use of the pointer will be a scalar use, and all users of the
// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
// place the pointer in PossibleNonScalarPtrs.
- if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
return isa<LoadInst>(U) || isa<StoreInst>(U);
}))
ScalarPtrs.insert(I);
@@ -5604,7 +5565,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
continue;
auto *Src = cast<Instruction>(Dst->getOperand(0));
- if (all_of(Src->users(), [&](User *U) -> bool {
+ if (llvm::all_of(Src->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
@@ -5631,7 +5592,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// Determine if all users of the induction variable are scalar after
// vectorization.
- auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
});
@@ -5640,10 +5601,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// Determine if all users of the induction variable update instruction are
// scalar after vectorization.
- auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
- });
+ auto ScalarIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
if (!ScalarIndUpdate)
continue;
@@ -5703,7 +5665,6 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
}
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
-
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
@@ -5754,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
"Widening decision should be ready at this moment");
return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
WideningDecision == CM_Interleave);
};
// Iterate over the instructions in the loop, and collect all
@@ -5766,7 +5728,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// the getelementptr won't remain uniform.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
-
// If there's no pointer operand, there's nothing to do.
auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
if (!Ptr)
@@ -5774,9 +5735,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// True if all users of Ptr are memory accesses that have Ptr as their
// pointer operand.
- auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
- return getPointerOperand(U) == Ptr;
- });
+ auto UsersAreMemAccesses =
+ llvm::all_of(Ptr->users(), [&](User *U) -> bool {
+ return getPointerOperand(U) == Ptr;
+ });
// Ensure the memory instruction will not be scalarized or used by
// gather/scatter, making its pointer operand non-uniform. If the pointer
@@ -5812,7 +5774,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (isOutOfScope(OV))
continue;
auto *OI = cast<Instruction>(OV);
- if (all_of(OI->users(), [&](User *U) -> bool {
+ if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
(OI == getPointerOperand(J) && isUniformDecision(J, VF));
@@ -5841,7 +5803,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Determine if all users of the induction variable are uniform after
// vectorization.
- auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
isVectorizedMemAccessUse(I, Ind);
@@ -5851,11 +5813,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Determine if all users of the induction variable update instruction are
// uniform after vectorization.
- auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, IndUpdate);
- });
+ auto UniformIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
if (!UniformIndUpdate)
continue;
@@ -5874,9 +5837,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
InterleaveInfo.setLAI(LAI);
const OptimizationRemarkAnalysis *LAR = LAI->getReport();
if (LAR) {
- OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- ORE->emit(VR);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
}
if (!LAI->canVectorizeMemory())
return false;
@@ -5894,7 +5858,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
@@ -5903,6 +5867,15 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
return Inductions.count(PN);
}
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
return FirstOrderRecurrences.count(Phi);
}
@@ -5972,7 +5945,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
-
auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
// Since it's desired that the load/store instructions be maintained in
@@ -6126,7 +6098,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
// but not with (4). If we did, the dependent access (3) would be within
// the boundaries of the (2, 4) group.
if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
-
// If a dependence exists and A is already in a group, we know that A
// must be a store since A precedes B and WAR dependences are allowed.
// Thus, A would be sunk below B. We release A's group to prevent this
@@ -6205,9 +6176,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
- if (Group->getNumMembers() != Group->getFactor())
+ if (Group->getNumMembers() != Group->getFactor()) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
releaseGroup(Group);
-
+ }
// Remove interleaved groups with gaps (currently only loads) whose memory
// accesses may wrap around. We have to revisit the getPtrStride analysis,
// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
@@ -6222,9 +6195,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// This means that we can forcefully peel the loop in order to only have to
// check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
- //
for (InterleaveGroup *Group : LoadGroups) {
-
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
// load would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
@@ -6260,6 +6231,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
@@ -6277,8 +6250,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+ // TODO: It may be useful to do this, since it's still likely to be
+ // dynamically uniform if the target can skip.
+ DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+
+ ORE->emit(
+ createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+ << "runtime pointer checks needed. Not enabled for divergent target");
+
+ return None;
+ }
+
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
- return computeFeasibleMaxVF(OptForSize);
+ return computeFeasibleMaxVF(OptForSize, TC);
if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
@@ -6291,7 +6277,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
}
// If we optimize the program for size, avoid creating the tail loop.
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
// If we don't know the precise trip count, don't try to vectorize.
@@ -6303,7 +6288,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
- unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
if (TC % MaxVF != 0) {
// If the trip count that we found modulo the vectorization factor is not
@@ -6324,46 +6309,52 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return MaxVF;
}
-unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
+unsigned
+LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
+ unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
- unsigned MaxSafeDepDist = -1U;
- // Get the maximum safe dependence distance in bits computed by LAA. If the
- // loop contains any interleaved accesses, we divide the dependence distance
- // by the maximum interleave factor of all interleaved groups. Note that
- // although the division ensures correctness, this is a fairly conservative
- // computation because the maximum distance computed by LAA may not involve
- // any of the interleaved accesses.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- MaxSafeDepDist =
- Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory access that is most restrictive (involved in the smallest
+ // dependence distance).
+ unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+
+ WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
- WidestRegister =
- ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
<< WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
+ DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
<< " bits.\n");
+ assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
+ return MaxVectorSize;
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+ isPowerOf2_32(ConstTripCount)) {
+ // We need to clamp the VF to be the ConstTripCount. There is no point in
+ // choosing a higher viable VF as done in the loop below.
+ DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
+ MaxVectorSize = ConstTripCount;
+ return MaxVectorSize;
}
- assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
-
unsigned MaxVF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
- // Collect all viable vectorization factors.
+ // Collect all viable vectorization factors larger than the default MaxVF
+ // (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
- for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
VFs.push_back(VS);
// For each VF calculate its register usage.
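A minimal standalone sketch of the MaxVF arithmetic in computeFeasibleMaxVF above, with illustrative numbers rather than values from any real target; the MaximizeBandwidth path is omitted and this is not LLVM code:

#include <cassert>

// Toy model: derive the feasible MaxVF from the widest register, the maximum
// safe register width reported by LAA, the widest scalar type, and an
// optionally known constant trip count.
static bool isPowerOf2(unsigned X) { return X && !(X & (X - 1)); }

static unsigned feasibleMaxVF(unsigned WidestRegisterBits,
                              unsigned MaxSafeRegisterWidthBits,
                              unsigned WidestTypeBits,
                              unsigned ConstTripCount) {
  unsigned Widest = WidestRegisterBits < MaxSafeRegisterWidthBits
                        ? WidestRegisterBits
                        : MaxSafeRegisterWidthBits;
  unsigned MaxVectorSize = Widest / WidestTypeBits;
  if (MaxVectorSize == 0)
    return 1; // target has no vector registers
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPowerOf2(ConstTripCount))
    return ConstTripCount; // no point in a VF larger than the trip count
  return MaxVectorSize;
}

int main() {
  assert(feasibleMaxVF(256, ~0u, 32, 0) == 8); // 256-bit regs / 32-bit type
  assert(feasibleMaxVF(256, ~0u, 32, 4) == 4); // clamped to constant trip count
  assert(feasibleMaxVF(256, 128, 32, 0) == 4); // limited by safe dependences
  return 0;
}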
@@ -6485,7 +6476,6 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
unsigned VF,
unsigned LoopCost) {
-
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
@@ -6573,7 +6563,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && Legal->getReductionVars()->size()) {
+ if (VF > 1 && !Legal->getReductionVars()->empty()) {
DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -6604,7 +6594,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
+ if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -6623,7 +6613,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
@@ -6661,7 +6651,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
- typedef DenseMap<Instruction *, unsigned> IntervalMap;
+ using IntervalMap = DenseMap<Instruction *, unsigned>;
+
// Maps instruction to its index.
DenseMap<unsigned, Instruction *> IdxToInstr;
// Marks the end of each interval.
@@ -6700,7 +6691,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
// Saves the list of intervals that end with the index in 'key'.
- typedef SmallVector<Instruction *, 2> InstrList;
+ using InstrList = SmallVector<Instruction *, 2>;
DenseMap<unsigned, InstrList> TransposeEnds;
// Transpose the EndPoints to a list of values that end at each index.
@@ -6795,7 +6786,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
-
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
@@ -6829,7 +6819,6 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {
-
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
@@ -6844,7 +6833,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Returns true if the given instruction can be scalarized.
auto canBeScalarized = [&](Instruction *I) -> bool {
-
// We only attempt to scalarize instructions forming a single-use chain
// from the original predicated block that would otherwise be vectorized.
// Although not strictly necessary, we give up on instructions we know will
@@ -6947,13 +6935,6 @@ LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
- // Collect Uniform and Scalar instructions after vectorization with VF.
- collectUniformsAndScalars(VF);
-
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
- collectInstsToScalarize(VF);
-
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
@@ -6965,7 +6946,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
continue;
// Skip ignored values.
- if (ValuesToIgnore.count(&I))
+ if (ValuesToIgnore.count(&I) ||
+ (VF > 1 && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
@@ -7004,14 +6986,16 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
static const SCEV *getAddressAccessSCEV(
Value *Ptr,
LoopVectorizationLegality *Legal,
- ScalarEvolution *SE,
+ PredicatedScalarEvolution &PSE,
const Loop *TheLoop) {
+
auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return nullptr;
// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
+ auto SE = PSE.getSE();
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
Value *Opd = Gep->getOperand(i);
@@ -7021,7 +7005,7 @@ static const SCEV *getAddressAccessSCEV(
}
// Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
- return SE->getSCEV(Ptr);
+ return PSE.getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
@@ -7041,7 +7025,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Figure out whether the access is strided and get the stride value
// if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
@@ -7145,7 +7129,6 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
unsigned VF) {
-
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF == 1) {
@@ -7202,12 +7185,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// We assume that widening is the best solution when possible.
if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Widen, Cost);
+ int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Expected consecutive stride.");
+ InstWidening Decision =
+ ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ setWideningDecision(&I, VF, Decision, Cost);
continue;
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- unsigned InterleaveCost = UINT_MAX;
+ unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;
if (Legal->isAccessInterleaved(&I)) {
auto Group = Legal->getInterleavedAccessGroup(&I);
@@ -7224,7 +7212,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
unsigned GatherScatterCost =
Legal->isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : UINT_MAX;
+ : std::numeric_limits<unsigned>::max();
unsigned ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
@@ -7282,7 +7270,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
for (auto &Op : I->operands())
if (auto *InstOp = dyn_cast<Instruction>(Op))
if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
- AddrDefs.insert(InstOp).second == true)
+ AddrDefs.insert(InstOp).second)
Worklist.push_back(InstOp);
}
@@ -7292,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// by cost functions, but since this involves the task of finding out
// if the loaded register is involved in an address computation, it is
// instead changed here when we know this is the case.
- if (getWideningDecision(I, VF) == CM_Widen)
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
// Scalarize a widened load of address.
setWideningDecision(I, VF, CM_Scalarize,
(VF * getMemoryInstructionCost(I, 1)));
@@ -7551,7 +7540,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
char LoopVectorize::ID = 0;
+
static const char lv_name[] = "Loop Vectorization";
+
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
@@ -7568,13 +7559,14 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
+
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
-}
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+} // end namespace llvm
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getPointerOperand(Inst))
@@ -7593,11 +7585,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+ // Ignore type-casting instructions we identified during induction
+ // detection.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
}
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
-
// Width 1 means no vectorize, cost 0 means uncomputed cost.
const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
0U};
@@ -7611,11 +7609,26 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
+ buildVPlans(UserVF, UserVF);
+ DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
unsigned MaxVF = MaybeMaxVF.getValue();
assert(MaxVF != 0 && "MaxVF is zero.");
+
+ for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ if (VF > 1)
+ CM.collectInstsToScalarize(VF);
+ }
+
+ buildVPlans(1, MaxVF);
+ DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
@@ -7623,11 +7636,28 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+ DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ erase_if(VPlans, [VF](const VPlanPtr &Plan) {
+ return !Plan->hasVF(VF);
+ });
+ assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- ILV.createVectorizedLoopSkeleton();
+ VPCallbackILV CallbackILV(ILV);
+
+ VPTransformState State{BestVF, BestUF, LI,
+ DT, ILV.Builder, ILV.VectorLoopValueMap,
+ &ILV, CallbackILV};
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
//===------------------------------------------------===//
//
@@ -7638,36 +7668,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
-
- // Move instructions to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
- for (auto &Entry : SinkAfter) {
- Entry.first->removeFromParent();
- Entry.first->insertAfter(Entry.second);
- DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
- << " to vectorize a 1st order recurrence.\n");
- }
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- ILV.vectorizeInstruction(I);
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlans.front()->execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7691,18 +7693,23 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
for (auto &Induction : *Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
- }
-}
-
-void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- auto *SI = dyn_cast<StoreInst>(Instr);
- bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
- return scalarizeInstruction(Instr, IfPredicateInstr);
+ // We also record as "Dead" the type-casting instructions we had identified
+ // during induction analysis. We don't need any handling for them in the
+ // vectorized loop because we have proven that, under a proper runtime
+ // test guarding the vectorized loop, the value of the phi and the casted
+ // value of the phi are the same. The last instruction in this casting chain
+ // will get its scalar/vector/widened def from the scalar/vector/widened def
+ // of the respective phi node. Any other casts in the induction def-use chain
+ // have no uses outside the phi update chain, and will be ignored.
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ DeadInstructions.insert(Casts.begin(), Casts.end());
+ }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
@@ -7760,6 +7767,722 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
+ const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
+ assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
+ for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
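A standalone sketch (not part of the patch) of how getDecisionAndClampRange is meant to behave: the predicate is evaluated at Range.Start, and Range.End is pulled in at the first power-of-two VF where the answer flips, so one decision covers a maximal sub-range:

#include <cassert>
#include <functional>

// Toy re-statement of the clamping logic above, over a half-open range of
// power-of-two VFs [Start, End).
struct VFRange { unsigned Start; unsigned End; };

static bool decideAndClamp(const std::function<bool(unsigned)> &Pred,
                           VFRange &Range) {
  assert(Range.End > Range.Start && "empty range");
  bool AtStart = Pred(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Pred(VF) != AtStart) {
      Range.End = VF; // first VF where the decision changes ends the range
      break;
    }
  return AtStart;
}

int main() {
  VFRange R = {1, 17}; // candidate VFs 1, 2, 4, 8, 16
  bool Widen = decideAndClamp([](unsigned VF) { return VF >= 2 && VF <= 4; }, R);
  assert(!Widen && R.End == 2); // the returned decision holds for VF = 1 only
  return 0;
}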
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
+void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
+
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
+
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(buildVPlan(SubRange, NeedDef));
+ VF = SubRange.End;
+ }
+}
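A standalone sketch (not part of the patch) of the sub-range walk above: each buildVPlan call may shrink SubRange.End, and the next plan starts where the previous one ended, so the plans partition {MinVF, 2*MinVF, ..., MaxVF}. The cut-off at VF = 4 below is an arbitrary stand-in for a real widening decision:

#include <cassert>
#include <vector>

struct VFRange { unsigned Start; unsigned End; }; // half-open [Start, End)

int main() {
  const unsigned MinVF = 1, MaxVF = 16;
  // Stand-in for buildVPlan(): pretend some decision flips at VF == 4.
  auto buildVPlanStub = [](VFRange &R) {
    if (R.Start < 4 && 4 < R.End)
      R.End = 4;
  };
  std::vector<VFRange> Plans;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    buildVPlanStub(SubRange);
    Plans.push_back(SubRange);
    VF = SubRange.End; // resume where the clamped sub-range ended
  }
  assert(Plans.size() == 2);                        // {1,2} and {4,8,16}
  assert(Plans[0].End == 4 && Plans[1].Start == 4); // ranges partition the VFs
  return 0;
}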
+
+VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
+ BasicBlock *Dst,
+ VPlanPtr &Plan) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VPValue *SrcMask = createBlockInMask(Src, Plan);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional())
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask);
+
+ if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
+ EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
+ VPlanPtr &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB)
+ return BlockMaskCache[BB] = BlockMask;
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
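A tiny boolean model (not VPlan code, and ignoring the null-means-all-one caching) of the mask rules above: an edge mask is the branch condition, negated for the false successor and ANDed with the source block's mask, and a block mask is the OR of its incoming edge masks:

#include <cassert>

// One lane's worth of the masking rules; real VPlan values are per-lane vectors.
static bool edgeMask(bool SrcMask, bool Cond, bool IsTrueSuccessor) {
  bool E = IsTrueSuccessor ? Cond : !Cond; // negate for the false successor
  return E && SrcMask;                     // AND with the source block's mask
}

int main() {
  bool HeaderMask = true; // the loop header's in-mask is all-one
  bool ThenMask = edgeMask(HeaderMask, /*Cond=*/true, /*IsTrueSuccessor=*/true);
  bool ElseMask = edgeMask(HeaderMask, /*Cond=*/true, /*IsTrueSuccessor=*/false);
  bool MergeMask = ThenMask || ElseMask; // OR over all incoming edges
  assert(ThenMask && !ElseMask && MergeMask);
  return 0;
}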
+
+VPInterleaveRecipe *
+LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+ if (!IG)
+ return nullptr;
+
+ // Now check if IG is relevant for VF's in the given range.
+ auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
+ return [=](unsigned VF) -> bool {
+ return (VF >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ };
+ if (!getDecisionAndClampRange(isIGMember(I), Range))
+ return nullptr;
+
+ // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
+ // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
+ // Otherwise, it's an adjunct member of the IG; do not construct any Recipe.
+ assert(I == IG->getInsertPos() &&
+ "Generating a recipe for an adjunct member of an interleave group");
+
+ return new VPInterleaveRecipe(IG);
+}
+
+VPWidenMemoryInstructionRecipe *
+LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (VF == 1)
+ return false;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+ "Interleave memory opportunity should be caught earlier.");
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPWidenMemoryInstructionRecipe(*I, Mask);
+}
+
+VPWidenIntOrFpInductionRecipe *
+LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
+ VFRange &Range) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
+
+ return nullptr;
+ }
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
+ [&](Instruction *K) -> std::function<bool(unsigned)> {
+ return
+ [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ };
+
+ if (isa<TruncInst>(I) &&
+ getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ cast<TruncInst>(I));
+ return nullptr;
+}
+
+VPBlendRecipe *
+LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi || Phi->getParent() == OrigLoop->getHeader())
+ return nullptr;
+
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ SmallVector<VPValue *, 2> Masks;
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VPValue *EdgeMask =
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ assert((EdgeMask || NumIncoming == 1) &&
+ "Multiple predecessors with one having a full mask");
+ if (EdgeMask)
+ Masks.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(Phi, Masks);
+}
+
+bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+ if (Legal->isScalarWithPredication(I))
+ return false;
+
+ auto IsVectorizableOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::Br:
+ case Instruction::Call:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::GetElementPtr:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::Load:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PHI:
+ case Instruction::PtrToInt:
+ case Instruction::SDiv:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::SRem:
+ case Instruction::Store:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UDiv:
+ case Instruction::UIToFP:
+ case Instruction::URem:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ return true;
+ }
+ return false;
+ };
+
+ if (!IsVectorizableOpcode(I->getOpcode()))
+ return false;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ return false;
+ }
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF)))
+ return false;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we use an intrinsic or a regular call for the
+ // vectorized version of the instruction.
+ // Is it beneficial to perform the intrinsic call compared to the lib call?
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ }
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ assert(CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Scalarize &&
+ "Memory widening decisions should have been taken care by now");
+ return false;
+ }
+ return true;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return false;
+
+ // Success: widen this instruction. We optimize the common case where
+ // consecutive instructions can be represented by a single recipe.
+ if (!VPBB->empty()) {
+ VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
+ if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
+ return true;
+ }
+
+ VPBB->appendRecipe(new VPWidenRecipe(I));
+ return true;
+}
+
+VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan) {
+ bool IsUniform = getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ Range);
+
+ bool IsPredicated = Legal->isScalarWithPredication(I);
+ auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+ // Finalize the recipe for Instr, first if it is not predicated.
+ if (!IsPredicated) {
+ DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region =
+ VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
+ return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+}
+
+VPRegionBlock *
+LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Generate recipes to compute the block mask for this region.
+ VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *PHIRecipe =
+ Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ Entry->setTwoSuccessors(Pred, Exit);
+ Pred->setOneSuccessor(Exit);
+
+ return Region;
+}
+
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlan(VFRange &Range,
+ const SmallPtrSetImpl<Value *> &NeedDef) {
+ EdgeMaskCache.clear();
+ BlockMaskCache.clear();
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+ // scalar instead of vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ auto Plan = llvm::make_unique<VPlan>(VPBB);
+
+ // Represent values that will have defs inside VPlan.
+ for (Value *V : NeedDef)
+ Plan->addVPValue(V);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ // Relevant instructions from basic block BB will be grouped into VPRecipe
+ // ingredients and fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBB = FirstVPBBForBB;
+ Builder.setInsertPoint(VPBB);
+
+ std::vector<Instruction *> Ingredients;
+
+ // Organize the ingredients to vectorize from current basic block in the
+ // right order.
+ for (Instruction &I : *BB) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
+ DeadInstructions.count(Instr))
+ continue;
+
+ // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
+ // member of the IG, do not construct any Recipe for it.
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ if (IG && Instr != IG->getInsertPos() &&
+ Range.Start >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(Instr, Range.Start) ==
+ LoopVectorizationCostModel::CM_Interleave) {
+ if (SinkAfterInverse.count(Instr))
+ Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
+ continue;
+ }
+
+ // Move instructions to handle first-order recurrences, step 1: avoid
+ // handling this instruction until after we've handled the instruction it
+ // should follow.
+ auto SAIt = SinkAfter.find(Instr);
+ if (SAIt != SinkAfter.end()) {
+ DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
+ SinkAfterInverse[SAIt->second] = Instr;
+ continue;
+ }
+
+ Ingredients.push_back(Instr);
+
+ // Move instructions to handle first-order recurrences, step 2: push the
+ // instruction to be sunk at its insertion point.
+ auto SAInvIt = SinkAfterInverse.find(Instr);
+ if (SAInvIt != SinkAfterInverse.end())
+ Ingredients.push_back(SAInvIt->second);
+ }
+
+ // Introduce each ingredient into VPlan.
+ for (Instruction *Instr : Ingredients) {
+ VPRecipeBase *Recipe = nullptr;
+
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ continue;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ continue;
+
+ // Otherwise, if all widening options failed, Instruction is to be
+ // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB =
+ handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+ // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+ // may also be empty, such as the last one (VPBB), reflecting original
+ // basic blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ PreEntry->disconnectSuccessor(Entry);
+ delete PreEntry;
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
+ unsigned VF = Range.Start;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
+ for (VF *= 2; VF < Range.End; VF *= 2) {
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << "\\l\"";
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << " +\n"
+ << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
+ for (auto &Instr : make_range(Begin, End))
+ State.ILV->widenInstruction(Instr);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+ State.ILV->widenIntOrFpInduction(IV, Trunc);
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
+ State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+
+ assert((User || NumIncoming == 1) &&
+ "Multiple predecessors with predecessors having a full mask");
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 =
+ State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(User->getOperand(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
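A standalone scalar model (not part of the patch) of the select chain sketched in the comment above: each later incoming edge wraps the accumulated value in another select keyed on that edge's mask, i.e. SELECT(Mask3, In3, SELECT(Mask2, In2, In1)):

#include <cassert>
#include <cstddef>
#include <vector>

// One lane's worth of the blend; the real code builds vector selects per part.
static int blend(const std::vector<int> &Incoming,
                 const std::vector<bool> &EdgeMasks) {
  int Result = Incoming[0]; // identity 'select' for the first incoming edge
  for (std::size_t In = 1; In < Incoming.size(); ++In)
    Result = EdgeMasks[In] ? Incoming[In] : Result; // SELECT(Mask, In, prev)
  return Result;
}

int main() {
  assert(blend({10, 20, 30}, {true, false, true}) == 30);
  assert(blend({10, 20, 30}, {true, false, false}) == 10);
  return 0;
}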
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Interleave group being replicated.");
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+ if (State.Instance) { // Generate a single instance.
+ State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ // Insert scalar instance packing it into a vector.
+ if (AlsoPack && State.VF > 1) {
+ // If we're constructing lane 0, initialize to start from undef.
+ if (State.Instance->Lane == 0) {
+ Value *Undef =
+ UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ }
+ State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ }
+ return;
+ }
+
+ // Generate scalar instances for all VF lanes of all UF parts, unless the
+ // instruction is uniform inwhich case generate only the first lane for each
+ // of the UF parts.
+ unsigned EndLane = IsUniform ? 1 : State.VF;
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+}
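A standalone sketch (not part of the patch) of the replication loop above: one scalar copy is generated per (part, lane), except that a uniform instruction only needs lane 0 of each unrolled part:

#include <cassert>

static unsigned numScalarCopies(unsigned UF, unsigned VF, bool IsUniform) {
  unsigned EndLane = IsUniform ? 1 : VF; // uniform: first lane of each part only
  unsigned Copies = 0;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      ++Copies; // the real code calls scalarizeInstruction({Part, Lane}) here
  return Copies;
}

int main() {
  assert(numScalarCopies(/*UF=*/2, /*VF=*/4, /*IsUniform=*/false) == 8);
  assert(numScalarCopies(/*UF=*/2, /*VF=*/4, /*IsUniform=*/true) == 2);
  return 0;
}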
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ Value *ConditionBit = nullptr;
+ if (!User) // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+ else {
+ VPValue *BlockInMask = User->getOperand(0);
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ }
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
+ Instruction *ScalarPredInst = cast<Instruction>(
+ State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By the current pack/unpack logic we need to generate only a single phi node:
+ // if a vector value for the predicated instruction exists at this point, it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
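
The vector-value branch above boils down to a two-way PHI over the
insertelement chain; a hedged sketch of just that step (hypothetical helper,
not part of the patch):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"

  // Keep the pre-insert vector when the predicate was false, or the vector
  // with the newly packed lane when the predicated block executed.
  llvm::PHINode *mergePackedLane(llvm::IRBuilder<> &B,
                                 llvm::InsertElementInst *IEI,
                                 llvm::BasicBlock *PredicatingBB,
                                 llvm::BasicBlock *PredicatedBB) {
    llvm::PHINode *Phi = B.CreatePHI(IEI->getType(), /*NumReservedValues=*/2);
    Phi->addIncoming(IEI->getOperand(0), PredicatingBB); // unmodified vector
    Phi->addIncoming(IEI, PredicatedBB);                 // vector with new lane
    return Phi;
  }
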
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+ if (!User)
+ return State.ILV->vectorizeMemoryInstruction(&Instr);
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+}
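
A hedged sketch (reusing the VPlan names that appear above; not part of the
patch) of the per-part mask gathering that precedes the masked memory
widening:

  // VectorParts holds one IR value per unroll part; the mask is the recipe's
  // last VPValue operand, mirroring VPWidenMemoryInstructionRecipe::execute.
  void collectMaskPerPart(VPUser &U, VPTransformState &State,
                          InnerLoopVectorizer::VectorParts &MaskValues) {
    VPValue *Mask = U.getOperand(U.getNumOperands() - 1);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      MaskValues[Part] = State.get(Mask, Part);
  }
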
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
@@ -7878,7 +8601,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7941,48 +8664,61 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
     // Do not vectorize or interleave the loop.
- ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second);
- ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
} else if (VectorizeLoop && !InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
- ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
} else if (VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ LVP.setBestPlan(VF.Width, IC);
+
using namespace ore;
+
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- LVP.executePlan(Unroller);
+ LVP.executePlan(Unroller, DT);
- ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")");
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LVP.executePlan(LB);
+ LVP.executePlan(LB, DT);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
@@ -7992,11 +8728,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
}
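
The remark changes above all follow one pattern: hand ORE a callable so the
remark object is only constructed when remarks are actually enabled for the
function. A hedged, stand-alone sketch of that pattern (remark name and
message are hypothetical, not part of the patch):

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/OptimizationRemarkEmitter.h"

  void emitWidthRemark(llvm::OptimizationRemarkEmitter &ORE, llvm::Loop *L,
                       unsigned Width) {
    ORE.emit([&]() {
      return llvm::OptimizationRemark("loop-vectorize", "Width",
                                      L->getStartLoc(), L->getHeader())
             << "chosen vectorization width: "
             << llvm::ore::NV("Width", Width);
    });
  }
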
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -8012,7 +8750,6 @@ bool LoopVectorizePass::runImpl(
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_) {
-
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
@@ -8068,10 +8805,8 @@ bool LoopVectorizePass::runImpl(
// Process each loop nest in the function.
return Changed;
-
}
-
PreservedAnalyses LoopVectorizePass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
@@ -8088,7 +8823,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
bool Changed =
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index dcbcab459a6b..76ba62f5d596 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6,6 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
@@ -15,39 +16,89 @@
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
@@ -156,6 +207,119 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
+/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
+/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
+ if (Vec->getType()->getVectorNumElements() != Size)
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ // We can extractelement from undef vector.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we have to have at most 2 different vector operands
+ // in all extractelement instructions.
+ if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ // Check the shuffle mode for the current operation.
+ if (!Vec1)
+ Vec1 = Vec;
+ else if (Vec != Vec1)
+ Vec2 = Vec;
+ // Example: shufflevector A, B, <0,5,2,7>
+    // I is even and IntIdx for A == I - FirstAlternate shuffle.
+    // I is odd and IntIdx for B == I - FirstAlternate shuffle.
+    // Example: shufflevector A, B, <4,1,6,3>
+    // I is odd and IntIdx for A == I - SecondAlternate shuffle.
+    // I is even and IntIdx for B == I - SecondAlternate shuffle.
+    const bool IIsOdd = I & 1;
+    const bool IIsEven = !IIsOdd;
+    const bool CurrVecIsA = Vec == Vec1;
+    const bool CurrVecIsB = !CurrVecIsA;
+    ShuffleMode CurrentShuffleMode =
+        ((IIsEven && CurrVecIsA) || (IIsOdd && CurrVecIsB)) ? FirstAlternate
+                                                            : SecondAlternate;
+ // Common mode is not set or the same as the shuffle mode of the current
+ // operation - alternate.
+ if (CommonShuffleMode == Unknown)
+ CommonShuffleMode = CurrentShuffleMode;
+ // Common shuffle mode is not the same as the shuffle mode of the current
+ // operation - permutation.
+ if (CommonShuffleMode != CurrentShuffleMode)
+ CommonShuffleMode = Permute;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if ((CommonShuffleMode == FirstAlternate ||
+ CommonShuffleMode == SecondAlternate) &&
+ Vec2)
+ return TargetTransformInfo::SK_Alternate;
+  // If Vec2 was never used, we have a permutation of a single vector;
+  // otherwise we have a permutation of two vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
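
One way the result of isShuffle() can be consumed, sketched under the
assumption that the caller already has the TTI handle and the vector type of
the extracts (hypothetical helper, not part of the patch):

  #include "llvm/ADT/Optional.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"

  // Feed the detected shuffle kind straight into the TTI cost query for the
  // would-be shufflevector; -1 signals "not a single shuffle".
  int shuffleCostOrSentinel(
      const llvm::TargetTransformInfo &TTI, llvm::VectorType *VecTy,
      llvm::Optional<llvm::TargetTransformInfo::ShuffleKind> Kind) {
    if (!Kind)
      return -1;
    return TTI.getShuffleCost(*Kind, VecTy);
  }
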
+
///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
@@ -173,50 +337,107 @@ static unsigned getAltOpcode(unsigned Op) {
}
}
-/// true if the \p Value is odd, false otherwise.
static bool isOdd(unsigned Value) {
return Value & 1;
}
-///\returns bool representing if Opcode \p Op can be part
-/// of an alternate sequence which can later be merged as
-/// a ShuffleVector instruction.
-static bool canCombineAsAltInst(unsigned Op) {
- return Op == Instruction::FAdd || Op == Instruction::FSub ||
- Op == Instruction::Sub || Op == Instruction::Add;
+static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
+ unsigned CheckedOpcode) {
+ return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
}
-/// \returns ShuffleVector instruction if instructions in \p VL have
-/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
-/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
-static unsigned isAltInst(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- unsigned Opcode = I0->getOpcode();
- unsigned AltOpcode = getAltOpcode(Opcode);
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
- return 0;
- }
- return Instruction::ShuffleVector;
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(Value *OpValue, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (!I)
+ return OpValue;
+ auto *OpInst = cast<Instruction>(OpValue);
+ unsigned OpInstOpcode = OpInst->getOpcode();
+ unsigned IOpcode = I->getOpcode();
+ if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
+ return Op;
+ return OpValue;
}
-/// \returns The opcode if all of the Instructions in \p VL have the same
-/// opcode, or zero.
-static unsigned getSameOpcode(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+namespace {
+
+/// Contains data for the instructions going to be vectorized.
+struct RawInstructionsData {
+ /// Main Opcode of the instructions going to be vectorized.
+ unsigned Opcode = 0;
+
+  /// True if some of the instructions in the list have alternate opcodes.
+ bool HasAltOpcodes = false;
+};
+
+} // end anonymous namespace
+
+/// Checks the list of instructions to be vectorized \p VL and returns
+/// information about this list.
+static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
+ auto *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return 0;
+ return {};
+ RawInstructionsData Res;
unsigned Opcode = I0->getOpcode();
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || Opcode != I->getOpcode()) {
- if (canCombineAsAltInst(Opcode) && i == 1)
- return isAltInst(VL);
- return 0;
+  // Walk through the list of instructions to be vectorized in order to check
+  // its structure, as described by RawInstructionsData.
+ for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
+ auto *I = dyn_cast<Instruction>(VL[Cnt]);
+ if (!I)
+ return {};
+ if (Opcode != I->getOpcode())
+ Res.HasAltOpcodes = true;
+ }
+ Res.Opcode = Opcode;
+ return Res;
+}
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+ /// The very first instruction in the list with the main opcode.
+ Value *OpValue = nullptr;
+
+ /// The main opcode for the list of instructions.
+ unsigned Opcode = 0;
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool IsAltShuffle = false;
+
+ InstructionsState() = default;
+ InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
+ : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+};
+
+} // end anonymous namespace
+
+/// \returns analysis of the Instructions in \p VL described in
+/// InstructionsState: the Opcode with which we suppose the whole list could
+/// be vectorized, even if its structure is diverse.
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
+ auto Res = getMainOpcode(VL);
+ unsigned Opcode = Res.Opcode;
+ if (!Res.HasAltOpcodes)
+ return InstructionsState(VL[0], Opcode, false);
+ auto *OpInst = cast<Instruction>(VL[0]);
+ unsigned AltOpcode = getAltOpcode(Opcode);
+  // Examine each element in the instruction list VL to determine whether
+  // some operations there could be considered alternates (for example, as
+  // subtraction relates to addition).
+ for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+ auto *I = cast<Instruction>(VL[Cnt]);
+ unsigned InstOpcode = I->getOpcode();
+ if ((Res.HasAltOpcodes &&
+ InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
+ (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
+ return InstructionsState(OpInst, 0, false);
}
}
- return Opcode;
+ return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
}
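
To make the even/odd alternation concrete, a hedged, stand-alone sketch of the
check getSameOpcode applies when alternate opcodes are present (helper name is
hypothetical, not part of the patch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Instruction.h"

  // Accepts e.g. {Add, Sub, Add, Sub}; rejects e.g. {Add, Mul, Add, Mul}.
  bool alternatesAddSub(llvm::ArrayRef<unsigned> Opcodes) {
    for (unsigned I = 0, E = Opcodes.size(); I != E; ++I) {
      unsigned Expected =
          (I & 1) ? llvm::Instruction::Sub : llvm::Instruction::Add;
      if (Opcodes[I] != Expected)
        return false;
    }
    return true;
  }
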
/// \returns true if all of the values in \p VL have the same type or false
@@ -247,7 +468,6 @@ static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
-
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
@@ -292,24 +512,25 @@ static bool isSimple(Instruction *I) {
}
namespace llvm {
+
namespace slpvectorizer {
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
- typedef SmallVector<Value *, 8> ValueList;
- typedef SmallVector<Instruction *, 16> InstrList;
- typedef SmallPtrSet<Value *, 16> ValueSet;
- typedef SmallVector<StoreInst *, 8> StoreList;
- typedef MapVector<Value *, SmallVector<Instruction *, 2>>
- ExtraValueToDebugLocsMap;
+ using ValueList = SmallVector<Value *, 8>;
+ using InstrList = SmallVector<Instruction *, 16>;
+ using ValueSet = SmallPtrSet<Value *, 16>;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using ExtraValueToDebugLocsMap =
+ MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+ DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,6 +552,7 @@ public:
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
@@ -348,6 +570,7 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
   /// into account (and updating it, if required) the list of externally used
@@ -374,7 +597,7 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence();
+ void optimizeGatherSequence(Function &F);
/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
@@ -416,21 +639,30 @@ public:
private:
struct TreeEntry;
+  /// Checks if all users of \p I are part of the vectorization tree.
+ bool areAllUsersVectorized(Instruction *I) const;
+
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
+ int OpdNum = 0);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
- /// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
+  /// Vectorize a single entry in the tree. \p OpdNum indicates the ordinality
+  /// of the operand corresponding to this tree entry \p E for the user tree
+  /// entry indicated by \p UserIndx.
+ // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
+ Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
- /// Vectorize a single entry in the tree, starting in \p VL.
- Value *vectorizeTree(ArrayRef<Value *> VL);
+  /// Vectorize a single entry in the tree, starting in \p VL. \p OpdNum
+  /// indicates the ordinality of the operand corresponding to the \p VL of
+  /// scalar values for the user indicated by \p UserIndx that this \p VL feeds into.
+ Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
@@ -447,7 +679,7 @@ private:
/// \brief Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -458,18 +690,17 @@ private:
/// \reorder commutative operands in alt shuffle if they result in
/// vectorized code.
- void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
+
/// \reorder commutative operands to get better probability of
/// generating vectorized code.
- void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry(std::vector<TreeEntry> &Container)
- : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
- Container(Container) {}
+ TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -477,14 +708,32 @@ private:
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
+ /// \returns true if the scalars in VL are found in this tree entry.
+ bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
+ ScalarEvolution &SE) const {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ SmallVector<Value *, 8> List;
+ if (!sortLoadAccesses(VL, DL, SE, List))
+ return false;
+ return std::equal(List.begin(), List.end(), Scalars.begin());
+ }
+
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue;
+ Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
- bool NeedToGather;
+ bool NeedToGather = false;
+
+ /// Records optional shuffle mask for the uses of jumbled memory accesses.
+ /// For example, a non-empty ShuffleMask[1] represents the permutation of
+ /// lanes that operand #1 of this vectorized instruction should undergo
+ /// before feeding this vectorized instruction, whereas an empty
+ /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
+ /// instruction need not be permuted at all.
+ SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
/// Points back to the VectorizableTree.
///
@@ -501,12 +750,31 @@ private:
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ int &UserTreeIdx, const InstructionsState &S,
+ ArrayRef<unsigned> ShuffleMask = None,
+ int OpdNum = 0) {
+ assert((!Vectorized || S.Opcode != 0) &&
+ "Vectorized TreeEntry without opcode");
VectorizableTree.emplace_back(VectorizableTree);
+
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+
+ TreeEntry *UserTreeEntry = nullptr;
+ if (UserTreeIdx != -1)
+ UserTreeEntry = &VectorizableTree[UserTreeIdx];
+
+ if (UserTreeEntry && !ShuffleMask.empty()) {
+ if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
+ UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
+ assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+ "Mask already present");
+ using mask = SmallVector<unsigned, 4>;
+ mask tempMask(ShuffleMask.begin(), ShuffleMask.end());
+ UserTreeEntry->ShuffleMask[OpdNum] = tempMask;
+ }
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -548,16 +816,19 @@ private:
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
- ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){}
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+
// Which scalar in our function.
Value *Scalar;
+
// Which user that uses the scalar.
llvm::User *User;
+
// Which lane does the scalar belong to.
int Lane;
};
- typedef SmallVector<ExternalUser, 16> UserList;
+ using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
@@ -565,7 +836,6 @@ private:
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
-
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
@@ -583,7 +853,7 @@ private:
return aliased;
}
- typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+ using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
@@ -616,6 +886,7 @@ private:
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
+
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -624,18 +895,13 @@ private:
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
-
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
- ScheduleData()
- : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
- NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
- Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
- UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+ ScheduleData() = default;
- void init(int BlockSchedulingRegionID) {
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
@@ -643,6 +909,7 @@ private:
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
+ OpValue = OpVal;
}
/// Returns true if the dependency information has been calculated.
@@ -702,19 +969,19 @@ private:
}
}
- Instruction *Inst;
+ Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
- ScheduleData *FirstInBundle;
+ ScheduleData *FirstInBundle = nullptr;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
- ScheduleData *NextInBundle;
+ ScheduleData *NextInBundle = nullptr;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
- ScheduleData *NextLoadStore;
+ ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
@@ -722,31 +989,33 @@ private:
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
- int SchedulingRegionID;
+ int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
- int SchedulingPriority;
+ int SchedulingPriority = 0;
/// The number of dependencies. Constitutes of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
- ///
- int Dependencies;
+ int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
- int UnscheduledDeps;
+ int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
- int UnscheduledDepsInBundle;
+ int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
- bool IsScheduled;
+ bool IsScheduled = false;
+
+    /// The value used as the opcode-based key for this schedule data.
+ Value *OpValue = nullptr;
};
#ifndef NDEBUG
@@ -756,22 +1025,14 @@ private:
return os;
}
#endif
+
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
- ///
struct BlockScheduling {
-
BlockScheduling(BasicBlock *BB)
- : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
- ScheduleStart(nullptr), ScheduleEnd(nullptr),
- FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
- ScheduleRegionSize(0),
- ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
- SchedulingRegionID(1) {}
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ReadyInsts.clear();
@@ -799,6 +1060,18 @@ private:
return nullptr;
}
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ if (V == Key)
+ return getScheduleData(V);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second[Key];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ }
+ return nullptr;
+ }
+
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
@@ -812,19 +1085,29 @@ private:
ScheduleData *BundleMember = SD;
while (BundleMember) {
+ if (BundleMember->Inst != BundleMember->OpValue) {
+ BundleMember = BundleMember->NextInBundle;
+ continue;
+ }
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
- ScheduleData *OpDef = getScheduleData(U.get());
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after decrementing,
- // so we can put the dependent instruction into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
+ auto *I = dyn_cast<Instruction>(U.get());
+ if (!I)
+ continue;
+ doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -835,22 +1118,35 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
+ << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (P.second->SchedulingRegionID == SchedulingRegionID)
+ Action(P.second);
+ }
+
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- if (SD->isSchedulingEntity() && SD->isReady()) {
- ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
- }
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ }
+ });
}
}
@@ -862,9 +1158,12 @@ private:
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+ /// Allocates schedule data chunk.
+ ScheduleData *allocateScheduleDataChunks();
+
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V);
+ bool extendSchedulingRegion(Value *V, Value *OpValue);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -897,6 +1196,10 @@ private:
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+ /// Attaches ScheduleData to Instruction with the leading key.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
@@ -905,28 +1208,30 @@ private:
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
- Instruction *ScheduleStart;
+ Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
- Instruction *ScheduleEnd;
+ Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
- ScheduleData *FirstLoadStoreInRegion;
+ ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
- ScheduleData *LastLoadStoreInRegion;
+ ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
- int ScheduleRegionSize;
+ int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
- int ScheduleRegionSizeLimit;
+ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
- int SchedulingRegionID;
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ int SchedulingRegionID = 1;
};
/// Attaches the BlockScheduling structures to basic blocks.
@@ -940,10 +1245,10 @@ private:
ArrayRef<Value *> UserIgnoreList;
// Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder;
+ int NumLoadsWantToKeepOrder = 0;
// Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder;
+ int NumLoadsWantToChangeOrder = 0;
// Analysis and block reference.
Function *F;
@@ -960,6 +1265,7 @@ private:
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
@@ -970,20 +1276,20 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
- typedef BoUpSLP::TreeEntry TreeEntry;
+ using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
- typedef TreeEntry *NodeRef;
+ using NodeRef = TreeEntry *;
/// \brief Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
SmallVector<int, 1>::iterator> {
-
std::vector<TreeEntry> &VectorizableTree;
ChildIteratorType(SmallVector<int, 1>::iterator W,
@@ -998,17 +1304,19 @@ template <> struct GraphTraits<BoUpSLP *> {
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
}
+
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
- typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+ using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
}
+
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
}
@@ -1017,7 +1325,7 @@ template <> struct GraphTraits<BoUpSLP *> {
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
- typedef BoUpSLP::TreeEntry TreeEntry;
+ using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
@@ -1054,6 +1362,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
+
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
@@ -1118,44 +1427,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- int UserTreeIdx) {
- bool isAltShuffle = false;
+ int UserTreeIdx, int OpdNum) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+ InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
// Don't handle vectors.
- if (VL[0]->getType()->isVectorTy()) {
+ if (S.OpValue->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- unsigned Opcode = getSameOpcode(VL);
-
- // Check that this shuffle vector refers to the alternate
- // sequence of opcodes.
- if (Opcode == Instruction::ShuffleVector) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- unsigned Op = I0->getOpcode();
- if (Op != Instruction::ShuffleVector)
- isAltShuffle = true;
- }
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
@@ -1167,87 +1466,92 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Check if this is a duplicate of another entry.
- if (TreeEntry *E = getTreeEntry(VL[0])) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (ScalarToTreeEntry.count(VL[i])) {
+ auto *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ continue;
+ if (getTreeEntry(I)) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- // If any of the scalars is marked as a value that needs to stay scalar then
+ // If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
- Instruction *VL0 = cast<Instruction>(VL[0]);
+ auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- // Check that every instructions appears once in this bundle.
+ // Check that every instruction appears once in this bundle.
for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i+1; j < e; ++j)
+ for (unsigned j = i + 1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
auto &BSRef = BlocksSchedules[BB];
- if (!BSRef) {
+ if (!BSRef)
BSRef = llvm::make_unique<BlockScheduling>(BB);
- }
+
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, VL0)) {
+ if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- assert((!BS.getScheduleData(VL[0]) ||
- !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ assert((!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- switch (Opcode) {
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1259,12 +1563,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1274,35 +1578,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, Opcode);
+ bool Reuse = canReuseExtract(VL, VL0);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
BS.cancelScheduling(VL, VL0);
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ newTreeEntry(VL, Reuse, UserTreeIdx, S);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
- // load.
- // For example we don't want vectorize loads that are smaller than 8 bit.
- // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
- // loading/storing it as an i8 struct. If we vectorize loads/stores from
- // such a struct we read/write packed bits disagreeing with the
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- Type *ScalarTy = VL[0]->getType();
+ Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1313,15 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1335,7 +1636,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1349,15 +1650,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
break;
}
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ ++NumLoadsWantToChangeOrder;
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx, S);
+ return;
+ }
+
+ if (VL.size() > 2) {
+ bool ShuffledLoads = true;
+ SmallVector<Value *, 8> Sorted;
+ SmallVector<unsigned, 4> Mask;
+ if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
+ auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
+ for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
+ if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
+ ShuffledLoads = false;
+ break;
+ }
+ }
+        // TODO: Tracking how many loads want to have an arbitrary shuffled
+        // order would be useful.
+ if (ShuffledLoads) {
+ DEBUG(dbgs() << "SLP: added a vector of loads which needs "
+ "permutation of loaded lanes.\n");
+ newTreeEntry(NewVL, true, UserTreeIdx, S,
+ makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
+ return;
+ }
+ }
}
+
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
case Instruction::ZExt:
@@ -1377,12 +1704,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1391,7 +1718,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1399,19 +1726,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+ Type *ComparedTy = VL0->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1420,7 +1747,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1442,17 +1769,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
- newTreeEntry(VL, true, UserTreeIdx);
+ case Instruction::Xor:
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
return;
}
@@ -1462,30 +1789,30 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
- }
+
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
- Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ Type *Ty0 = VL0->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
@@ -1497,12 +1824,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1510,7 +1837,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1519,12 +1846,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
@@ -1536,13 +1863,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
- CallInst *CI = cast<CallInst>(VL[0]);
+ CallInst *CI = cast<CallInst>(VL0);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1556,7 +1883,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1567,7 +1894,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1580,14 +1907,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1595,28 +1922,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
- case Instruction::ShuffleVector: {
+ case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
- if (!isAltShuffle) {
+ if (!S.IsAltShuffle) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(VL, Left, Right);
+ reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
return;
}
@@ -1626,13 +1953,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
- }
+
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1663,19 +1990,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
- assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+ Instruction *E0 = cast<Instruction>(OpValue);
+ assert(E0->getOpcode() == Instruction::ExtractElement ||
+ E0->getOpcode() == Instruction::ExtractValue);
+ assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
- Value *VL0 = VL[0];
- Instruction *E0 = cast<Instruction>(VL0);
Value *Vec = E0->getOperand(0);
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
- if (Opcode == Instruction::ExtractValue) {
+ if (E0->getOpcode() == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
@@ -1692,20 +2018,24 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
return false;
// Check that all of the indices extract from the correct offset.
- if (!matchExtractIndex(E0, 0, Opcode))
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
- if (!matchExtractIndex(E, i, Opcode))
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Instruction *Inst = cast<Instruction>(VL[I]);
+ if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
return false;
- if (E->getOperand(0) != Vec)
+ if (Inst->getOperand(0) != Vec)
return false;
}
return true;
}
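
A rough standalone sketch of the check canReuseExtract performs (plain C++ with illustrative names, no LLVM types): every extract in the bundle must read the same source vector, lane I of the bundle must extract constant index I, and the bundle must cover the whole source vector.

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Each element models one ExtractElement: {source vector id, constant index}.
    static bool canReuseExtractSketch(
        const std::vector<std::pair<int, std::size_t>> &Extracts,
        std::size_t NumVectorElts) {
      if (Extracts.empty() || Extracts.size() != NumVectorElts)
        return false;                   // must cover the whole source vector
      int Src = Extracts.front().first; // the common source vector
      for (std::size_t I = 0; I < Extracts.size(); ++I) {
        if (Extracts[I].first != Src)   // reads a different source vector
          return false;
        if (Extracts[I].second != I)    // wrong lane for this bundle position
          return false;
      }
      return true;
    }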
+bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
+ return I->hasOneUse() ||
+ std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ });
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1728,28 +2058,47 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
+ if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
+ int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ for (auto *V : VL) {
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return Cost;
+ }
+ }
return getGatherCost(E->Scalars);
}
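
A minimal cost-model sketch of the new ExtractElement gather path above, using illustrative flags and fixed costs instead of TTI queries: the bundle is priced as one shuffle, and each extract whose users are all vectorized (and which is not itself in the tree) is treated as dead, so its scalar extract cost is credited back.

    #include <vector>

    struct LaneInfo {
      bool AllUsersVectorized; // would areAllUsersVectorized(V) return true?
      bool AlreadyInTree;      // does the scalar have its own tree entry?
    };

    static int shuffledExtractCostSketch(const std::vector<LaneInfo> &Lanes,
                                         int ShuffleCost, int ExtractCost) {
      int Cost = ShuffleCost;
      for (const LaneInfo &L : Lanes)
        if (L.AllUsersVectorized && !L.AlreadyInTree)
          Cost -= ExtractCost; // take credit for the extract that becomes dead
      return Cost;
    }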
- unsigned Opcode = getSameOpcode(VL);
- assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
- Instruction *VL0 = cast<Instruction>(VL[0]);
- switch (Opcode) {
- case Instruction::PHI: {
+ InstructionsState S = getSameOpcode(VL);
+ assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
+ case Instruction::PHI:
return 0;
- }
+
case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- if (canReuseExtract(VL, Opcode)) {
+ case Instruction::ExtractElement:
+ if (canReuseExtract(VL, S.OpValue)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, instruction can be
// considered as dead.
// The same, if have only one user, it will be vectorized for sure.
- if (E->hasOneUse() ||
- std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
- }))
+ if (areAllUsersVectorized(E))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1757,7 +2106,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return -DeadCost;
}
return getGatherCost(VecTy);
- }
+
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -1786,8 +2135,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
+ TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1848,9 +2197,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarCost =
VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
@@ -1968,7 +2317,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
-
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -2139,13 +2487,18 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- for (Value *i : VL) {
- Left.push_back(cast<Instruction>(i)->getOperand(0));
- Right.push_back(cast<Instruction>(i)->getOperand(1));
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ (void)AltOpcode;
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
+ "Incorrect instruction in vector");
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive access
@@ -2190,14 +2543,12 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
// The vectorizer is trying to either have all elements one side being
// instruction with the same opcode to enable further vectorization, or having
// a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, Instruction &I,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- bool AllSameOpcodeLeft,
- bool AllSameOpcodeRight, bool SplatLeft,
- bool SplatRight) {
- Value *VLeft = I.getOperand(0);
- Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+ int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+ ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+ bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+ VLeft = I.getOperand(0);
+ VRight = I.getOperand(1);
// If we have "SplatRight", try to see if commuting is needed to preserve it.
if (SplatRight) {
if (VRight == Right[i - 1])
@@ -2253,15 +2604,16 @@ static bool shouldReorderOperands(int i, Instruction &I,
return false;
}
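
A simplified sketch of the splat-preserving part of this heuristic, using integers as stand-ins for Value pointers; the real routine additionally breaks ties on matching opcodes and consecutive loads.

    // Decide whether lane i should commute its operands so that an existing
    // splat on one side is preserved. Equality models pointer equality.
    static bool shouldCommuteLaneSketch(int PrevLeft, int PrevRight,
                                        int CurLeft, int CurRight,
                                        bool SplatLeft, bool SplatRight) {
      if (SplatRight) {
        if (CurRight == PrevRight)
          return false;        // right side already continues the splat
        if (CurLeft == PrevRight)
          return true;         // commuting preserves the right-hand splat
      }
      if (SplatLeft) {
        if (CurLeft == PrevLeft)
          return false;        // left side already continues the splat
        if (CurRight == PrevLeft)
          return true;         // commuting preserves the left-hand splat
      }
      return false;            // otherwise keep the original operand order
    }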
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
-
- if (VL.size()) {
+ if (!VL.empty()) {
// Peel the first iteration out of the loop since there's nothing
// interesting to do anyway and it simplifies the checks in the loop.
- auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
- auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ auto *I = cast<Instruction>(VL[0]);
+ Value *VLeft = I->getOperand(0);
+ Value *VRight = I->getOperand(1);
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
// Favor having instruction to the right. FIXME: why?
std::swap(VLeft, VRight);
@@ -2278,16 +2630,21 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
- assert(I->isCommutative() && "Can only process commutative instruction");
+ assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+ (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+ "Can only process commutative instruction");
// Commute to favor either a splat or maximizing having the same opcodes on
// one side.
- if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
- AllSameOpcodeRight, SplatLeft, SplatRight)) {
- Left.push_back(I->getOperand(1));
- Right.push_back(I->getOperand(0));
+ Value *VLeft;
+ Value *VRight;
+ if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+ VRight)) {
+ Left.push_back(VRight);
+ Right.push_back(VLeft);
} else {
- Left.push_back(I->getOperand(0));
- Right.push_back(I->getOperand(1));
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
}
// Update Splat* and AllSameOpcode* after the insertion.
SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2340,14 +2697,17 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
-
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(VL.front());
+ auto *Front = cast<Instruction>(OpValue);
auto *BB = Front->getParent();
- assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
- return cast<Instruction>(V)->getParent() == BB;
+ const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
+ const unsigned AltOpcode = getAltOpcode(Opcode);
+ assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
+ return !sameOpcodeOrAlt(Opcode, AltOpcode,
+ cast<Instruction>(V)->getOpcode()) ||
+ cast<Instruction>(V)->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2358,10 +2718,12 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
- auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
+ auto *Bundle =
+ BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
- LastInst = Bundle->Inst;
+ if (Bundle->OpValue == Bundle->Inst)
+ LastInst = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
@@ -2385,7 +2747,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I))
+ if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
LastInst = &I;
if (Bundle.empty())
break;
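
A standalone sketch of the fallback scan above, with strings standing in for instructions: walk the block in program order starting from the front of the bundle, remember the last bundle member seen, and stop once every member has been found.

    #include <string>
    #include <unordered_set>
    #include <vector>

    static std::string findLastBundleMemberSketch(
        const std::vector<std::string> &Block,     // block in program order
        std::unordered_set<std::string> Bundle) {  // the bundled scalars
      std::string Last;
      for (const std::string &Inst : Block) {
        if (Bundle.erase(Inst))
          Last = Inst;          // latest bundle member seen so far
        if (Bundle.empty())
          break;                // all members found; Last is the insert point
      }
      return Last;
    }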
@@ -2435,27 +2797,41 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
return nullptr;
}
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- if (TreeEntry *E = getTreeEntry(VL[0]))
- if (E->isSame(VL))
- return vectorizeTree(E);
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
+ InstructionsState S = getSameOpcode(VL);
+ if (S.Opcode) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ TreeEntry *UserTreeEntry = nullptr;
+ if (UserIndx != -1)
+ UserTreeEntry = &VectorizableTree[UserIndx];
+
+ if (E->isSame(VL) ||
+ (UserTreeEntry &&
+ (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+ !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+ E->isFoundJumbled(VL, *DL, *SE)))
+ return vectorizeTree(E, OpdNum, UserIndx);
+ }
+ }
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ Type *ScalarTy = S.OpValue->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
return Gather(VL, VecTy);
}
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
IRBuilder<>::InsertPointGuard Guard(Builder);
+ TreeEntry *UserTreeEntry = nullptr;
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
+ InstructionsState S = getSameOpcode(E->Scalars);
Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
@@ -2463,15 +2839,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
- unsigned Opcode = getSameOpcode(E->Scalars);
+ assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
+ "Expected user tree entry, missing!");
+ int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
- switch (Opcode) {
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
@@ -2498,7 +2878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(Operands);
+ Value *Vec = vectorizeTree(Operands, i, CurrIndx);
NewPhi->addIncoming(Vec, IBB);
}
@@ -2508,18 +2888,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
+ if (canReuseExtract(E->Scalars, VL0)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ if (canReuseExtract(E->Scalars, VL0)) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
@@ -2528,7 +2908,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return propagateMetadata(V, E->Scalars);
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
@@ -2549,9 +2929,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *InVec = vectorizeTree(INVL);
+ Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2570,23 +2950,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *L = vectorizeTree(LHSV);
- Value *R = vectorizeTree(RHSV);
+ Value *L = vectorizeTree(LHSV, 0, CurrIndx);
+ Value *R = vectorizeTree(RHSV, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (Opcode == Instruction::FCmp)
+ if (S.Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2598,11 +2978,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *Cond = vectorizeTree(CondVec);
- Value *True = vectorizeTree(TrueVec);
- Value *False = vectorizeTree(FalseVec);
+ Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
+ Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
+ Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2632,25 +3012,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ RHSVL);
else
for (Value *V : E->Scalars) {
- LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
- RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+ auto *I = cast<Instruction>(V);
+ LHSVL.push_back(I->getOperand(0));
+ RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
+ Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+ Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
- BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -2661,9 +3043,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
+
+ if (UserIndx != -1)
+ UserTreeEntry = &VectorizableTree[UserIndx];
+
+ bool isJumbled = false;
+ LoadInst *LI = nullptr;
+ if (UserTreeEntry &&
+ (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+ !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
+ isJumbled = true;
+ LI = cast<LoadInst>(E->Scalars[0]);
+ } else {
+ LI = cast<LoadInst>(VL0);
+ }
- LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
@@ -2685,47 +3080,60 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ propagateMetadata(LI, E->Scalars);
+
+ if (isJumbled) {
+ SmallVector<Constant *, 8> Mask;
+ for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
+ Mask.push_back(Builder.getInt32(LaneEntry));
+ // Generate shuffle for jumbled memory access
+ Value *Undef = UndefValue::get(VecTy);
+ Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
+ ConstantVector::get(Mask));
+ E->VectorizedValue = Shuf;
+ ++NumVectorInstructions;
+ return Shuf;
+ }
+ return LI;
}
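
The jumbled-load path above loads in memory order and then shuffles lanes back into the order the user entry expects; a plain-array sketch of what that shuffle mask computes (result lane I takes element ShuffleMask[I] of the loaded vector, mask entries assumed in range):

    #include <cstdint>
    #include <vector>

    static std::vector<int32_t>
    applyJumbledMaskSketch(const std::vector<int32_t> &LoadedInMemoryOrder,
                           const std::vector<unsigned> &ShuffleMask) {
      std::vector<int32_t> Reordered(ShuffleMask.size());
      for (unsigned Lane = 0; Lane < ShuffleMask.size(); ++Lane)
        Reordered[Lane] = LoadedInMemoryOrder[ShuffleMask[Lane]]; // pick lane
      return Reordered;
    }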
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
- ValueList ValueOp;
+ ValueList ScalarStoreValues;
for (Value *V : E->Scalars)
- ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
+ ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *VecValue = vectorizeTree(ValueOp);
- Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
- VecTy->getPointerTo(AS));
+ Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
+ Value *ScalarPtr = SI->getPointerOperand();
+ Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
- // The pointer operand uses an in-tree scalar so we add the new BitCast to
- // ExternalUses list to make sure that an extract will be generated in the
+ // The pointer operand uses an in-tree scalar, so add the new BitCast to
+ // ExternalUses to make sure that an extract will be generated in the
// future.
- Value *PO = SI->getPointerOperand();
- if (getTreeEntry(PO))
- ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+ if (getTreeEntry(ScalarPtr))
+ ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
- if (!Alignment) {
+ if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- }
+
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
ValueList Op0VL;
for (Value *V : E->Scalars)
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
- Value *Op0 = vectorizeTree(Op0VL);
+ Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -2734,7 +3142,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
OpVecs.push_back(OpVec);
}
@@ -2750,7 +3158,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -2763,7 +3171,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// ctlz,cttz and powi are special intrinsics whose second argument is
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
- CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
@@ -2773,7 +3181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVL.push_back(CEI->getArgOperand(j));
}
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -2793,30 +3201,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars);
+ assert(Instruction::isBinaryOp(S.Opcode) &&
+ "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
+ Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+ Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
// Create a vector of LHS op1 RHS
- BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
- Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+ Value *V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ unsigned AltOpcode = getAltOpcode(S.Opcode);
// Create a vector of LHS op2 RHS
- Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
- BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
- Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+ Value *V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
// Create shuffle to take alternate operations from the vector.
// Also, gather up odd and even scalar ops to propagate IR flags to
@@ -2859,7 +3268,6 @@ Value *BoUpSLP::vectorizeTree() {
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
-
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
@@ -2905,9 +3313,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert(!E->NeedToGather && "Extracting from a gather list");
+ assert((!E->NeedToGather) && "Extracting from a gather list");
- Value *Vec = E->VectorizedValue;
+ Value *Vec = E->VectorizedValue;
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec))
+ if (isa<LoadInst>(SV->getOperand(0)))
+ Vec = SV->getOperand(0);
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
@@ -2975,14 +3388,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- // No need to handle users of gathered values.
- if (Entry->NeedToGather)
- continue;
-
- assert(Entry->VectorizedValue && "Can't find vectorizable value");
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -2990,9 +3404,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
- assert((getTreeEntry(U) ||
- // It is legal to replace users in the ignorelist by undef.
- is_contained(UserIgnoreList, U)) &&
+ // It is legal to replace users in the ignorelist by undef.
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
@@ -3009,7 +3422,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence(Function &F) {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
@@ -3043,30 +3456,16 @@ void BoUpSLP::optimizeGatherSequence() {
Insert->moveBefore(PreHeader->getTerminator());
}
- // Make a list of all reachable blocks in our CSE queue.
- SmallVector<const DomTreeNode *, 8> CSEWorkList;
- CSEWorkList.reserve(CSEBlocks.size());
- for (BasicBlock *BB : CSEBlocks)
- if (DomTreeNode *N = DT->getNode(BB)) {
- assert(DT->isReachableFromEntry(N));
- CSEWorkList.push_back(N);
- }
-
- // Sort blocks by domination. This ensures we visit a block after all blocks
- // dominating it are visited.
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
-
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
- assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
- "Worklist not sorted properly!");
- BasicBlock *BB = (*I)->getBlock();
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (auto BB : RPOT) {
+ // Traverse CSEBlocks by RPOT order.
+ if (!CSEBlocks.count(BB))
+ continue;
+
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
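
A self-contained sketch of the new visiting order, assuming a CFG given as an adjacency list with node 0 as the entry and every block reachable from it: compute a post-order DFS, reverse it, and keep only the blocks that contain gather sequences.

    #include <algorithm>
    #include <set>
    #include <vector>

    static void rpoVisit(int BB, const std::vector<std::vector<int>> &Succs,
                         std::vector<bool> &Seen, std::vector<int> &PostOrder) {
      Seen[BB] = true;
      for (int S : Succs[BB])
        if (!Seen[S])
          rpoVisit(S, Succs, Seen, PostOrder);
      PostOrder.push_back(BB);
    }

    static std::vector<int>
    cseOrderSketch(const std::vector<std::vector<int>> &Succs,
                   const std::set<int> &CSEBlocks) {
      std::vector<bool> Seen(Succs.size(), false);
      std::vector<int> PostOrder;
      rpoVisit(/*Entry=*/0, Succs, Seen, PostOrder);
      std::reverse(PostOrder.begin(), PostOrder.end()); // post-order -> RPO
      std::vector<int> Order;
      for (int BB : PostOrder)
        if (CSEBlocks.count(BB)) // only blocks containing gather sequences
          Order.push_back(BB);
      return Order;
    }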
@@ -3111,7 +3510,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V))
+ if (!extendSchedulingRegion(V, OpValue))
return false;
}
@@ -3148,8 +3547,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- SD->clearDependencies();
+ doForAllOpcodes(I, [](ScheduleData *SD) {
+ SD->clearDependencies();
+ });
}
ReSchedule = true;
}
@@ -3210,17 +3610,43 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
}
}
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
- if (getScheduleData(V))
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
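
The factored-out chunk allocation amounts to a small pool that hands out objects from fixed-size arrays, so pointers to earlier ScheduleData stay valid when a new chunk is appended; a generic sketch, with Node standing in for ScheduleData:

    #include <memory>
    #include <vector>

    struct Node { int Payload = 0; };

    class ChunkPool {
      static constexpr int ChunkSize = 256;
      std::vector<std::unique_ptr<Node[]>> Chunks;
      int ChunkPos = ChunkSize; // forces allocation of the first chunk

    public:
      Node *allocate() {
        if (ChunkPos >= ChunkSize) {
          Chunks.push_back(std::make_unique<Node[]>(ChunkSize));
          ChunkPos = 0;
        }
        return &Chunks.back()[ChunkPos++]; // older chunks are never moved
      }
    };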
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ Value *OpValue) {
+ if (getScheduleData(V, isOneOf(OpValue, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, OpValue);
+ ExtraScheduleDataMap[I][OpValue] = SD;
+ return true;
+ };
+ if (CheckSheduleForI(I))
+ return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -3232,7 +3658,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
- for (;;) {
+ while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
@@ -3242,6 +3668,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
return true;
}
@@ -3252,6 +3680,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -3272,21 +3702,17 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
- // Allocate a new ScheduleData for the instruction.
- if (ChunkPos >= ChunkSize) {
- ScheduleDataChunks.push_back(
- llvm::make_unique<ScheduleData[]>(ChunkSize));
- ChunkPos = 0;
- }
- SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+ SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
- SD->init(SchedulingRegionID);
+ SD->init(SchedulingRegionID, I);
- if (I->mayReadOrWriteMemory()) {
+ if (I->mayReadOrWriteMemory() &&
+ (!isa<IntrinsicInst>(I) ||
+ cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
@@ -3326,23 +3752,35 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ // I'm not sure if this can ever happen. But we need to be safe.
+ // This lets the instruction/bundle never be scheduled and
+ // eventually disable vectorization.
BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
+ BundleMember->incrementUnscheduledDeps(1);
}
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and
- // eventually disable vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
}
}
@@ -3419,16 +3857,17 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- assert(isInSchedulingRegion(SD));
- SD->IsScheduled = false;
- SD->resetUnscheduledDeps();
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ assert(isInSchedulingRegion(SD) &&
+ "ScheduleData not in scheduling region");
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ });
}
ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
-
if (!BS->ScheduleStart)
return;
@@ -3452,15 +3891,16 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- ScheduleData *SD = BS->getScheduleData(I);
- assert(
- SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
- "scheduler and vectorizer have different opinion on what is a bundle");
- SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
- BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
+ BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ assert(SD->isPartOfBundle() ==
+ (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer bundle mismatch");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ });
}
BS->initialFillReadyList(ReadyInsts);
@@ -3559,7 +3999,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
-
// We can always demote constants.
if (isa<Constant>(V)) {
ToDemote.push_back(V);
@@ -3702,7 +4141,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
- IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
});
@@ -3710,7 +4149,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
- auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
@@ -3755,6 +4194,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
namespace {
+
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
SLPVectorizerPass Impl;
@@ -3766,7 +4206,6 @@ struct SLPVectorizer : public FunctionPass {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
-
bool doInitialization(Module &M) override {
return false;
}
@@ -3806,6 +4245,7 @@ struct SLPVectorizer : public FunctionPass {
AU.setPreservesCFG();
}
};
+
} // end anonymous namespace
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
@@ -3893,7 +4333,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
if (Changed) {
- R.optimizeGatherSequence();
+ R.optimizeGatherSequence(F);
DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
DEBUG(verifyFunction(F));
}
@@ -3952,7 +4392,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+
using namespace ore;
+
R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
cast<StoreInst>(Chain[i]))
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
@@ -3972,7 +4414,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
- SetVector<StoreInst *> Heads, Tails;
+ SetVector<StoreInst *> Heads;
+ SmallDenseSet<StoreInst *> Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
@@ -3980,45 +4423,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- // Do a quadratic search on all of the given stores and find
+ // Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
- for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- IndexQueue.clear();
+ unsigned E = Stores.size();
+ IndexQueue.resize(E - 1);
+ for (unsigned I = E; I > 0; --I) {
+ unsigned Idx = I - 1;
// If a store has multiple consecutive store candidates, search Stores
- // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because usually pairing with immediate succeeding or preceding
// candidate create the best chance to find slp vectorization opportunity.
- unsigned j = 0;
- for (j = i + 1; j < e; ++j)
- IndexQueue.push_back(j);
- for (j = i; j > 0; --j)
- IndexQueue.push_back(j - 1);
-
- for (auto &k : IndexQueue) {
- if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
- Tails.insert(Stores[k]);
- Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[k];
+ unsigned Offset = 1;
+ unsigned Cnt = 0;
+ for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
+ if (Idx >= Offset) {
+ IndexQueue[Cnt] = Idx - Offset;
+ ++Cnt;
+ }
+ if (Idx + Offset < E) {
+ IndexQueue[Cnt] = Idx + Offset;
+ ++Cnt;
+ }
+ }
+
+ for (auto K : IndexQueue) {
+ if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
+ Tails.insert(Stores[Idx]);
+ Heads.insert(Stores[K]);
+ ConsecutiveChain[Stores[K]] = Stores[Idx];
break;
}
}
}
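
A standalone sketch of the candidate order the rewritten search builds for store Idx: nearest neighbours first, alternating below and above (Idx-1, Idx+1, Idx-2, Idx+2, ...), clipped to the array bounds.

    #include <vector>

    static std::vector<unsigned> pairingOrderSketch(unsigned Idx, unsigned E) {
      std::vector<unsigned> IndexQueue;
      if (E < 2)
        return IndexQueue;
      IndexQueue.reserve(E - 1);
      for (unsigned J = 0, Offset = 1; J < E - 1; ++J, ++Offset) {
        if (Idx >= Offset)
          IndexQueue.push_back(Idx - Offset); // candidate below Idx
        if (Idx + Offset < E)
          IndexQueue.push_back(Idx + Offset); // candidate above Idx
      }
      return IndexQueue;
    }
    // e.g. pairingOrderSketch(2, 5) yields {1, 3, 0, 4}.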
// For stores that start but don't end a link in the chain:
- for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
+ for (auto *SI : llvm::reverse(Heads)) {
+ if (Tails.count(SI))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- StoreInst *I = *it;
+ StoreInst *I = SI;
// Collect the chain into a list.
- while (Tails.count(I) || Heads.count(I)) {
- if (VectorizedStores.count(I))
- break;
+ while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
Operands.push_back(I);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
@@ -4041,7 +4490,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
-
// Initialize the collections. We will make a single pass over the block.
Stores.clear();
GEPs.clear();
@@ -4050,7 +4498,6 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
-
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
@@ -4086,7 +4533,8 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
- bool AllowReorder) {
+ bool AllowReorder,
+ bool NeedExtraction) {
if (VL.size() < 2)
return false;
@@ -4103,19 +4551,51 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
- if (MaxVF < 2)
- return false;
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
for (Value *V : VL) {
Type *Ty = V->getType();
- if (!isValidElementType(Ty))
+ if (!isValidElementType(Ty)) {
+ // NOTE: the following will give the user an internal LLVM type name,
+ // which may not be useful.
+ R.getORE()->emit([&]() {
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(
+ SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
+ });
return false;
+ }
Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst || Inst->getOpcode() != Opcode0)
+
+ if (!Inst)
+ return false;
+ if (Inst->getOpcode() != Opcode0) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "InequableTypes", I0)
+ << "Cannot SLP vectorize list: not all of the "
+ << "parts of scalar instructions are of the same type: "
+ << ore::NV("Instruction1Opcode", I0) << " and "
+ << ore::NV("Instruction2Opcode", Inst);
+ });
return false;
+ }
}
bool Changed = false;
+ bool CandidateFound = false;
+ int MinCost = SLPCostThreshold;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
@@ -4148,11 +4628,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ ArrayRef<Value *> EmptyArray;
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(I, OpsWidth);
- R.buildTree(Ops, BuildVectorSlice);
+ R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
@@ -4169,14 +4650,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
+ CandidateFound = true;
+ MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
- cast<Instruction>(Ops[0]))
- << "SLP vectorized with cost " << ore::NV("Cost", Cost)
- << " and with tree size "
- << ore::NV("TreeSize", R.getTreeSize()));
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
Value *VectorizedRoot = R.vectorizeTree();
@@ -4199,8 +4682,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
- I->removeFromParent();
- I->insertAfter(Extract);
+ I->moveAfter(Extract);
InsertAfter = I;
}
}
@@ -4212,18 +4694,37 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
}
+ if (!Changed && CandidateFound) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
+ });
+ } else if (!Changed) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
+ });
+ }
return Changed;
}
-bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
- if (!V)
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
return false;
- Value *P = V->getParent();
+ if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ return false;
+
+ Value *P = I->getParent();
// Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
@@ -4286,6 +4787,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
}
namespace {
+
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
@@ -4314,17 +4816,375 @@ namespace {
/// *p =
///
class HorizontalReduction {
- SmallVector<Value *, 16> ReductionOps;
+ using ReductionOpsType = SmallVector<Value *, 16>;
+ using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+ ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot = nullptr;
+ /// Kind of the reduction data.
+ enum ReductionKind {
+ RK_None, ///< Not a reduction.
+ RK_Arithmetic, ///< Binary reduction data.
+ RK_Min, ///< Minimum reduction data.
+ RK_UMin, ///< Unsigned minimum reduction data.
+ RK_Max, ///< Maximum reduction data.
+ RK_UMax, ///< Unsigned maximum reduction data.
+ };
+
+ /// Contains info about operation, like its opcode, left and right operands.
+ class OperationData {
+ /// Opcode of the instruction.
+ unsigned Opcode = 0;
+
+ /// Left operand of the reduction operation.
+ Value *LHS = nullptr;
+
+ /// Right operand of the reduction operation.
+ Value *RHS = nullptr;
+
+ /// Kind of the reduction operation.
+ ReductionKind Kind = RK_None;
+
+ /// True if a floating-point min/max reduction has no NaNs.
+ bool NoNaN = false;
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable() const {
+ return LHS && RHS &&
+ // We currently only support add/fadd and min/max reductions.
+ ((Kind == RK_Arithmetic &&
+ (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
+ ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ (Kind == RK_Min || Kind == RK_Max)) ||
+ (Opcode == Instruction::ICmp &&
+ (Kind == RK_UMin || Kind == RK_UMax)));
+ }
+
+ /// Creates reduction operation with the current opcode.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ Value *Cmp;
+ switch (Kind) {
+ case RK_Arithmetic:
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
+ Name);
+ case RK_Min:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
+ : Builder.CreateFCmpOLT(LHS, RHS);
+ break;
+ case RK_Max:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
+ : Builder.CreateFCmpOGT(LHS, RHS);
+ break;
+ case RK_UMin:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpULT(LHS, RHS);
+ break;
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpUGT(LHS, RHS);
+ break;
+ case RK_None:
+ llvm_unreachable("Unknown reduction operation.");
+ }
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
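
A scalar sketch of what createOp emits for each kind, with plain integers standing in for IR values; the arithmetic case is shown as an integer add only, and the signed/unsigned min/max cases correspond to the compare-plus-select pairs above.

    #include <cstdint>

    enum class RK { Arithmetic, Min, UMin, Max, UMax };

    static int64_t createOpSketch(RK Kind, int64_t LHS, int64_t RHS) {
      switch (Kind) {
      case RK::Arithmetic:
        return LHS + RHS;                 // e.g. an integer add reduction
      case RK::Min:
        return LHS < RHS ? LHS : RHS;     // icmp slt + select
      case RK::Max:
        return LHS > RHS ? LHS : RHS;     // icmp sgt + select
      case RK::UMin:
        return (uint64_t)LHS < (uint64_t)RHS ? LHS : RHS; // icmp ult + select
      case RK::UMax:
        return (uint64_t)LHS > (uint64_t)RHS ? LHS : RHS; // icmp ugt + select
      }
      return 0; // unreachable for the kinds above
    }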
+
+ public:
+ explicit OperationData() = default;
+
+ /// Constructor for reduced values. They are identified by opcode only and
+ /// don't have associated LHS/RHS values.
+ explicit OperationData(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ Opcode = I->getOpcode();
+ }
+
+ /// Constructor for reduction operations with opcode and its left and
+ /// right operands.
+ OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
+ bool NoNaN = false)
+ : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
+ assert(Kind != RK_None && "One of the reduction operations is expected.");
+ }
+
+ explicit operator bool() const { return Opcode; }
+
+ /// Get the index of the first operand.
+ unsigned getFirstOperandIndex() const {
+ assert(!!*this && "The opcode is not set.");
+ switch (Kind) {
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 1;
+ case RK_Arithmetic:
+ case RK_None:
+ break;
+ }
+ return 0;
+ }
+
+ /// Total number of operands in the reduction operation.
+ unsigned getNumberOfOperands() const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return 2;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 3;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the operation has the same parent as \p P.
+ bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ if (!IsRedOp)
+ return I->getParent() == P;
+ switch (Kind) {
+ case RK_Arithmetic:
+ // Arithmetic reduction operation must be used once only.
+ return I->getParent() == P;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax: {
+ // The SelectInst must be used twice, while its condition op must have a
+ // single use only, so check the parent of the condition as well.
+ auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+ return I->getParent() == P && Cmp && Cmp->getParent() == P;
+ }
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+  /// Checks if the instruction has the expected number of uses for a
+  /// reduction operation or a reduced value.
+ bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->hasOneUse();
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return I->hasNUses(2) &&
+ (!IsReductionOp ||
+ cast<SelectInst>(I)->getCondition()->hasOneUse());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Initializes the list of reduction operations.
+ void initReductionOps(ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps.assign(1, ReductionOpsType());
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps.assign(2, ReductionOpsType());
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+ /// Add all reduction operations for the reduction instruction \p I.
+ void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps[0].emplace_back(I);
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+ ReductionOps[1].emplace_back(I);
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+
+ /// Checks if instruction is associative and can be vectorized.
+ bool isAssociative(Instruction *I) const {
+ assert(Kind != RK_None && *this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->isAssociative();
+ case RK_Min:
+ case RK_Max:
+ return Opcode == Instruction::ICmp ||
+ cast<Instruction>(I->getOperand(0))->isFast();
+ case RK_UMin:
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp &&
+ "Only integer compare operation is expected.");
+ return true;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable(Instruction *I) const {
+ return isVectorizable() && isAssociative(I);
+ }
+
+ /// Checks if two operation data are both a reduction op or both a reduced
+ /// value.
+ bool operator==(const OperationData &OD) {
+ assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
+ "One of the comparing operations is incorrect.");
+ return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
+ }
+ bool operator!=(const OperationData &OD) { return !(*this == OD); }
+ void clear() {
+ Opcode = 0;
+ LHS = nullptr;
+ RHS = nullptr;
+ Kind = RK_None;
+ NoNaN = false;
+ }
+
+ /// Get the opcode of the reduction operation.
+ unsigned getOpcode() const {
+ assert(isVectorizable() && "Expected vectorizable operation.");
+ return Opcode;
+ }
+
+ /// Get kind of reduction data.
+ ReductionKind getKind() const { return Kind; }
+ Value *getLHS() const { return LHS; }
+ Value *getRHS() const { return RHS; }
+ Type *getConditionType() const {
+ switch (Kind) {
+ case RK_Arithmetic:
+ return nullptr;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ return CmpInst::makeCmpResultType(LHS->getType());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+  /// Creates a reduction operation with the current opcode, taking the IR
+  /// flags from \p ReductionOps.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ const ReductionOpsListType &ReductionOps) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, ReductionOps[0]);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(SI->getCondition(), ReductionOps[0]);
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+  /// Creates a reduction operation with the current opcode, taking the IR
+  /// flags from \p I.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ Instruction *I) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op)) {
+ propagateIRFlags(SI->getCondition(),
+ cast<SelectInst>(I)->getCondition());
+ }
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+
+ TargetTransformInfo::ReductionFlags getFlags() const {
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ switch (Kind) {
+ case RK_Arithmetic:
+ break;
+ case RK_Min:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_Max:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_UMin:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_UMax:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ return Flags;
+ }
+ };
+
+ Instruction *ReductionRoot = nullptr;
+
+ /// The operation data of the reduction operation.
+ OperationData ReductionData;
+
+ /// The operation data of the values we perform a reduction on.
+ OperationData ReducedValueData;
- /// The opcode of the reduction.
- Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
- /// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction = false;
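OperationData::createOp lowers the min/max reduction kinds to a compare followed by a select, with the predicate chosen by the kind. A minimal standalone sketch of the scalar semantics this materializes (plain C++, illustrative only; the enum and function names below are not part of the patch):

#include <cstdint>

enum IllustrativeKind { RK_Min, RK_Max, RK_UMin, RK_UMax };

// One reduction step per kind: the comparison supplies the predicate and the
// conditional operator stands in for the emitted select.
static int64_t reduceStep(IllustrativeKind Kind, int64_t LHS, int64_t RHS) {
  switch (Kind) {
  case RK_Min:  return LHS < RHS ? LHS : RHS;                      // icmp slt + select
  case RK_Max:  return LHS > RHS ? LHS : RHS;                      // icmp sgt + select
  case RK_UMin: return (uint64_t)LHS < (uint64_t)RHS ? LHS : RHS;  // icmp ult + select
  case RK_UMax: return (uint64_t)LHS > (uint64_t)RHS ? LHS : RHS;  // icmp ugt + select
  }
  return LHS;
}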
@@ -4349,55 +5209,89 @@ class HorizontalReduction {
}
}
+ static OperationData getOperationData(Value *V) {
+ if (!V)
+ return OperationData();
+
+ Value *LHS;
+ Value *RHS;
+ if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
+ return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
+ RK_Arithmetic);
+ }
+ if (auto *Select = dyn_cast<SelectInst>(V)) {
+ // Look for a min/max pattern.
+ if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+ } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+ } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+ } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+ } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ }
+ }
+ return OperationData(V);
+ }
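getOperationData classifies a value by matching the compare-plus-select idioms above with the PatternMatch helpers. A minimal sketch of that idiom in isolation (it assumes the llvm::PatternMatch API already used in this file; isSignedMax is a hypothetical helper name):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Returns true and binds LHS/RHS when V has the form
// select (icmp sgt A, B), A, B - i.e. a signed max of A and B.
static bool isSignedMax(llvm::Value *V, llvm::Value *&LHS, llvm::Value *&RHS) {
  using namespace llvm::PatternMatch;
  return m_SMax(m_Value(LHS), m_Value(RHS)).match(V);
}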
+
public:
HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
+ ReductionData = getOperationData(B);
+
+    // We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
- if (B->getOperand(0) == Phi) {
+ if (ReductionData.getLHS() == Phi) {
Phi = nullptr;
- B = dyn_cast<BinaryOperator>(B->getOperand(1));
- } else if (B->getOperand(1) == Phi) {
+ B = dyn_cast<Instruction>(ReductionData.getRHS());
+ ReductionData = getOperationData(B);
+ } else if (ReductionData.getRHS() == Phi) {
Phi = nullptr;
- B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ B = dyn_cast<Instruction>(ReductionData.getLHS());
+ ReductionData = getOperationData(B);
}
}
- if (!B)
+ if (!ReductionData.isVectorizable(B))
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
- ReductionOpcode = B->getOpcode();
- ReducedValueOpcode = 0;
+ ReducedValueData.clear();
ReductionRoot = B;
- // We currently only support adds.
- if ((ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd) ||
- !B->isAssociative())
- return false;
-
// Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators or selects.
+ // trees containing only binary operators.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(B, 0));
+ Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
+ ReductionData.initReductionOps(ReductionOps);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
- bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+ OperationData OpData = getOperationData(TreeN);
+ bool IsReducedValue = OpData != ReductionData;
      // Postorder visit.
- if (EdgeToVist == 2 || IsReducedValue) {
+ if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
@@ -4415,7 +5309,7 @@ public:
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
- ReductionOps.push_back(TreeN);
+ ReductionData.addReductionOps(TreeN, ReductionOps);
}
// Retract.
Stack.pop_back();
@@ -4426,45 +5320,50 @@ public:
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
+ OpData = getOperationData(I);
// Continue analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set,
       // the first operation encountered that differs from the reduction
       // operation is taken to define the reduced value class.
- if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
- I->getOpcode() == ReductionOpcode)) {
+ if (I && (!ReducedValueData || OpData == ReducedValueData ||
+ OpData == ReductionData)) {
+ const bool IsReductionOperation = OpData == ReductionData;
// Only handle trees in the current basic block.
- if (I->getParent() != B->getParent()) {
+ if (!ReductionData.hasSameParent(I, B->getParent(),
+ IsReductionOperation)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!I->hasOneUse() && I != B) {
+      // Each tree node needs to have the minimal number of uses, except for
+      // the ultimate reduction.
+ if (!ReductionData.hasRequiredNumberOfUses(I,
+ OpData == ReductionData) &&
+ I != B) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- if (I->getOpcode() == ReductionOpcode) {
+ if (IsReductionOperation) {
// We need to be able to reassociate the reduction operations.
- if (!I->isAssociative()) {
+ if (!OpData.isAssociative(I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- } else if (ReducedValueOpcode &&
- ReducedValueOpcode != I->getOpcode()) {
+ } else if (ReducedValueData &&
+ ReducedValueData != OpData) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
- } else if (!ReducedValueOpcode)
- ReducedValueOpcode = I->getOpcode();
+ } else if (!ReducedValueData)
+ ReducedValueData = OpData;
- Stack.push_back(std::make_pair(I, 0));
+ Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
continue;
}
}
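The loop above is an iterative post-order walk: each stack entry pairs an instruction with the index of the next operand edge to follow, and a node is visited (retired) once that index reaches its operand count. The same scheme on a hypothetical Node type, detached from LLVM IR:

#include <utility>
#include <vector>

struct Node { std::vector<Node *> Operands; };

template <typename Visitor>
static void postOrderWalk(Node *Root, Visitor Visit) {
  std::vector<std::pair<Node *, unsigned>> Stack;
  Stack.push_back({Root, 0u});
  while (!Stack.empty()) {
    Node *N = Stack.back().first;
    unsigned Edge = Stack.back().second++;  // next operand edge to follow
    if (Edge == N->Operands.size()) {       // post-order visit: all edges done
      Visit(N);
      Stack.pop_back();
      continue;
    }
    Stack.push_back({N->Operands[Edge], 0u});
  }
}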
@@ -4492,7 +5391,7 @@ public:
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
- Unsafe.setUnsafeAlgebra();
+ Unsafe.setFast();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
@@ -4501,12 +5400,15 @@ public:
// to use it.
for (auto &Pair : ExtraArgs)
ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ SmallVector<Value *, 16> IgnoreList;
+ for (auto &V : ReductionOps)
+ IgnoreList.append(V.begin(), V.end());
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, IgnoreList);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
@@ -4516,17 +5418,27 @@ public:
// Estimate cost.
int Cost =
V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
- if (Cost >= -SLPCostThreshold)
- break;
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost "
+ << ore::NV("Cost", Cost) << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
+ }
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
- auto *I0 = cast<Instruction>(VL[0]);
- V.getORE()->emit(
- OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ V.getORE()->emit([&]() {
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize()));
+ << ore::NV("TreeSize", V.getTreeSize());
+ });
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4534,12 +5446,14 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
- propagateIRFlags(VectorizedTree, ReductionOps);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, ReducedSubTree,
+ ReductionData.getKind());
+ VectorizedTree =
+ VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
} else
VectorizedTree = ReducedSubTree;
i += ReduxWidth;
@@ -4551,9 +5465,10 @@ public:
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree =
- Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
- propagateIRFlags(VectorizedTree, ReductionOps);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, I,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
assert(!Pair.second.empty() &&
@@ -4561,9 +5476,10 @@ public:
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
- Pair.first, "bin.extra");
- propagateIRFlags(VectorizedTree, I);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, Pair.first,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
}
}
// Update users.
@@ -4583,15 +5499,58 @@ private:
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
- int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
- int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+ int PairwiseRdxCost;
+ int SplittingRdxCost;
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ PairwiseRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/true);
+ SplittingRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/false);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax: {
+ Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
+ ReductionData.getKind() == RK_UMax;
+ PairwiseRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/true, IsUnsigned);
+ SplittingRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/false, IsUnsigned);
+ break;
+ }
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
- int ScalarReduxCost =
- (ReduxWidth - 1) *
- TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
+ int ScalarReduxCost;
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ ScalarReduxCost =
+ TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ ScalarReduxCost =
+ TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+ ScalarReduxCost *= (ReduxWidth - 1);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
@@ -4604,16 +5563,15 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps,
- const TargetTransformInfo *TTI) {
+ unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
if (!IsPairwiseReduction)
return createSimpleTargetReduction(
- Builder, TTI, ReductionOpcode, VectorizedValue,
- TargetTransformInfo::ReductionFlags(), RedOps);
+ Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
+ ReductionData.getFlags(), ReductionOps.back());
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
@@ -4627,15 +5585,16 @@ private:
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec =
- Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
- propagateIRFlags(TmpVec, RedOps);
+ OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
+ RightShuf, ReductionData.getKind());
+ TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
+
} // end anonymous namespace
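When the pairwise form is chosen, emitReduction above folds the vector with shuffles so that the number of live lanes halves at every step, finishing with the result in lane 0 after log2(ReduxWidth) combining operations. A standalone sketch of that halving idea on plain data (addition stands in for the reduction operation; the exact shuffle masks used above differ):

#include <cstddef>
#include <vector>

// V.size() must be a power of two; the result accumulates in element 0.
static int reduceByHalving(std::vector<int> V) {
  for (std::size_t Half = V.size() / 2; Half != 0; Half /= 2)
    for (std::size_t I = 0; I < Half; ++I)
      V[I] = V[I] + V[I + Half];  // combine two lanes into one
  return V[0];
}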
/// \brief Recognize construction of vectors like
@@ -4643,39 +5602,29 @@ private:
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+/// starting from the last insertelement instruction.
///
/// Returns true if it matches
-///
-static bool findBuildVector(InsertElementInst *FirstInsertElem,
+static bool findBuildVector(InsertElementInst *LastInsertElem,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
- return false;
-
- InsertElementInst *IE = FirstInsertElem;
- while (true) {
- BuildVector.push_back(IE);
- BuildVectorOpds.push_back(IE->getOperand(1));
-
- if (IE->use_empty())
- return false;
-
- InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
- if (!NextUse)
- return true;
-
- // If this isn't the final use, make sure the next insertelement is the only
- // use. It's OK if the final constructed vector is used multiple times
- if (!IE->hasOneUse())
+ Value *V = nullptr;
+ do {
+ BuildVector.push_back(LastInsertElem);
+ BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
+ V = LastInsertElem->getOperand(0);
+ if (isa<UndefValue>(V))
+ break;
+ LastInsertElem = dyn_cast<InsertElementInst>(V);
+ if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
-
- IE = NextUse;
- }
-
- return false;
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ return true;
}
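The rewritten findBuildVector starts at the last insertelement, walks backwards through the base-vector operand until it reaches the undef base, collects the inserted scalars, and reverses them into build order. A standalone sketch of the same walk over a hypothetical chain type (Insert below is illustrative, not an LLVM class, and the single-use check is omitted):

#include <algorithm>
#include <vector>

struct Insert {
  Insert *Base = nullptr;  // previous insert in the chain; nullptr models undef
  int Scalar = 0;          // the scalar operand being inserted
};

static void collectBuildVector(Insert *Last, std::vector<int> &Scalars) {
  for (Insert *I = Last; I; I = I->Base)  // walk backwards towards the base
    Scalars.push_back(I->Scalar);
  std::reverse(Scalars.begin(), Scalars.end());  // restore build order
}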
-/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+/// \brief Like findBuildVector, but looks for construction of an aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
@@ -4767,14 +5716,14 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,
- const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+ const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
if (!Root)
return false;
- if (Root->getParent() != BB)
+ if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
// Start analysis starting from Root instruction. If horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction or
@@ -4795,11 +5744,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
if (!V)
continue;
auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst))
+ if (!Inst)
continue;
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ auto *BI = dyn_cast<BinaryOperator>(Inst);
+ auto *SI = dyn_cast<SelectInst>(Inst);
+ if (BI || SI) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
Res = true;
// Set P to nullptr to avoid re-analysis of phi node in
@@ -4808,7 +5759,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- if (P) {
+ if (P && BI) {
Inst = dyn_cast<Instruction>(BI->getOperand(0));
if (Inst == P)
Inst = dyn_cast<Instruction>(BI->getOperand(1));
@@ -4823,15 +5774,20 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ if (Vectorize(Inst, R)) {
Res = true;
continue;
}
// Try to vectorize operands.
+      // Only continue the analysis for instructions from the same basic block,
+      // to save compile time.
if (++Level < RecursionMaxDepth)
for (auto *Op : Inst->operand_values())
- Stack.emplace_back(Op, Level);
+ if (VisitedInstrs.insert(Op).second)
+ if (auto *I = dyn_cast<Instruction>(Op))
+ if (!isa<PHINode>(I) && I->getParent() == BB)
+ Stack.emplace_back(Op, Level);
}
return Res;
}
@@ -4848,10 +5804,71 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
if (!isa<BinaryOperator>(I))
P = nullptr;
// Try to match and vectorize a horizontal reduction.
- return tryToVectorizeHorReductionOrInstOperands(
- P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
- return tryToVectorize(BI, R);
- });
+ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+ return tryToVectorize(I, R);
+ };
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+ BasicBlock *BB, BoUpSLP &R) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (!R.canMapToVector(IVI->getType(), DL))
+ return false;
+
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ return false;
+
+ DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  // An aggregate value is unlikely to be processed in a vector register; we
+  // need to extract the scalars into scalar registers, so NeedExtraction is
+  // set to true.
+ return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+ BasicBlock *BB, BoUpSLP &R) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ return false;
+
+  // Vectorize starting with the build vector operands, ignoring the BuildVector
+ // instructions for the purpose of scheduling and user extraction.
+ return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+ BoUpSLP &R) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+ return true;
+
+ bool OpsChanged = false;
+ for (int Idx = 0; Idx < 2; ++Idx) {
+ OpsChanged |=
+ vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+ }
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ bool OpsChanged = false;
+ for (auto &VH : reverse(Instructions)) {
+ auto *I = dyn_cast_or_null<Instruction>(VH);
+ if (!I)
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (auto *CI = dyn_cast<CmpInst>(I))
+ OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ }
+ Instructions.clear();
+ return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4913,10 +5930,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
VisitedInstrs.clear();
+ SmallVector<WeakVH, 8> PostProcessInstructions;
+ SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(&*it).second)
+ if (!VisitedInstrs.insert(&*it).second) {
+ if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+ // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
continue;
+ }
if (isa<DbgInfoIntrinsic>(it))
continue;
@@ -4938,96 +5966,37 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- if (ShouldStartVectorizeHorAtStore) {
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+    // Ran into an instruction without users, such as a terminator, a store, or
+    // a function call with an ignored return value. Whether it counts as unused
+    // is based on the instruction type, except for CallInst and InvokeInst.
+ if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+ isa<InvokeInst>(it))) {
+ KeyNodes.insert(&*it);
+ bool OpsChanged = false;
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ for (auto *V : it->operand_values()) {
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
}
}
- }
-
- // Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
- if (RI->getNumOperands() != 0) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
- }
- }
-
- // Try to vectorize trees that start at compare instructions.
- if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
- if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
- Changed = true;
+      // Start vectorizing the post-process list of instructions from the
+      // top-of-tree instructions to try to vectorize as many instructions as
+ // possible.
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ if (OpsChanged) {
// We would like to start over since some instructions are deleted
      // and the iterator may become invalid.
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
- for (int I = 0; I < 2; ++I) {
- if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
- }
- continue;
- }
-
- // Try to vectorize trees that start at insertelement instructions.
- if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
- SmallVector<Value *, 16> BuildVector;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
- continue;
-
- // Vectorize starting with the build vector operands ignoring the
- // BuildVector instructions for the purpose of scheduling and user
- // extraction.
- if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
Changed = true;
it = BB->begin();
e = BB->end();
+ continue;
}
-
- continue;
}
- // Try to vectorize trees that start at insertvalue instructions feeding into
- // a store.
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
- SmallVector<Value *, 16> BuildVector;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
- continue;
+ if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+ isa<InsertValueInst>(it))
+ PostProcessInstructions.push_back(&*it);
- DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
- if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- }
- continue;
- }
- }
- }
}
return Changed;
@@ -5036,7 +6005,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
-
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
@@ -5141,7 +6109,9 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
}
char SLPVectorizer::ID = 0;
+
static const char lv_name[] = "SLP Vectorizer";
+
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
@@ -5152,6 +6122,4 @@ INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-namespace llvm {
-Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
-}
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
new file mode 100644
index 000000000000..4e54fc6db2a5
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -0,0 +1,557 @@
+//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the LLVM vectorization plan. It represents a candidate for
+/// vectorization, allowing the vectorizer to plan and optimize how to
+/// vectorize a given loop before generating LLVM-IR.
+/// The vectorizer uses vectorization plans to estimate the costs of potential
+/// candidates and, if profitable, to execute the desired plan by generating
+/// vector LLVM-IR code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
+ if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V))
+ Instr->print(OS);
+ else
+ V.printAsOperand(OS);
+ return OS;
+}
+
+/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
+ if (!Successors.empty() || !Parent)
+ return this;
+ assert(Parent->getExit() == this &&
+ "Block w/o successors not the exit of its parent.");
+ return Parent->getEnclosingBlockWithSuccessors();
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
+ if (!Predecessors.empty() || !Parent)
+ return this;
+ assert(Parent->getEntry() == this &&
+ "Block w/o predecessors not the entry of its parent.");
+ return Parent->getEnclosingBlockWithPredecessors();
+}
+
+void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
+ SmallVector<VPBlockBase *, 8> Blocks;
+ for (VPBlockBase *Block : depth_first(Entry))
+ Blocks.push_back(Block);
+
+ for (VPBlockBase *Block : Blocks)
+ delete Block;
+}
+
+BasicBlock *
+VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+ // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
+  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
+ BasicBlock *PrevBB = CFG.PrevBB;
+ BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
+ PrevBB->getParent(), CFG.LastBB);
+ DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+
+ // Hook up the new basic block to its predecessors.
+ for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
+ auto &PredVPSuccessors = PredVPBB->getSuccessors();
+ BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+ assert(PredBB && "Predecessor basic-block not found building successor.");
+ auto *PredBBTerminator = PredBB->getTerminator();
+ DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ if (isa<UnreachableInst>(PredBBTerminator)) {
+ assert(PredVPSuccessors.size() == 1 &&
+ "Predecessor ending w/o branch must have single successor.");
+ PredBBTerminator->eraseFromParent();
+ BranchInst::Create(NewBB, PredBB);
+ } else {
+ assert(PredVPSuccessors.size() == 2 &&
+ "Predecessor ending with branch must have two successors.");
+ unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ assert(!PredBBTerminator->getSuccessor(idx) &&
+ "Trying to reset an existing successor block.");
+ PredBBTerminator->setSuccessor(idx, NewBB);
+ }
+ }
+ return NewBB;
+}
+
+void VPBasicBlock::execute(VPTransformState *State) {
+ bool Replica = State->Instance &&
+ !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+ VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
+ VPBlockBase *SingleHPred = nullptr;
+ BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+
+ // 1. Create an IR basic block, or reuse the last one if possible.
+ // The last IR basic block is reused, as an optimization, in three cases:
+ // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
+ // B. when the current VPBB has a single (hierarchical) predecessor which
+ // is PrevVPBB and the latter has a single (hierarchical) successor; and
+ // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+ // is the exit of this region from a previous instance, or the predecessor
+ // of this region.
+ if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
+ NewBB = createEmptyBasicBlock(State->CFG);
+ State->Builder.SetInsertPoint(NewBB);
+ // Temporarily terminate with unreachable until CFG is rewired.
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
+ Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
+ L->addBasicBlockToLoop(NewBB, *State->LI);
+ State->CFG.PrevBB = NewBB;
+ }
+
+ // 2. Fill the IR basic block with IR instructions.
+ DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
+
+ State->CFG.VPBB2IRBB[this] = NewBB;
+ State->CFG.PrevVPBB = this;
+
+ for (VPRecipeBase &Recipe : Recipes)
+ Recipe.execute(*State);
+
+ DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+}
+
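The condition in VPBasicBlock::execute can be read as: create a fresh IR block unless one of the three reuse cases A, B or C applies. A hedged restatement as a standalone predicate (the boolean parameters merely summarize those cases; they are not names from the patch):

// Returns true when a fresh IR BasicBlock must be created for the current
// VPBasicBlock.
static bool needNewBasicBlock(bool HasPrevVPBB,                     // reuse case A when false
                              bool SinglePredIsPrevWithSingleSucc,  // reuse case B when true
                              bool IsReplicaRegionEntry) {          // reuse case C when true
  return HasPrevVPBB && !SinglePredIsPrevWithSingleSucc && !IsReplicaRegionEntry;
}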
+void VPRegionBlock::execute(VPTransformState *State) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
+
+ if (!isReplicator()) {
+ // Visit the VPBlocks connected to "this", starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ return;
+ }
+
+ assert(!State->Instance && "Replicating a Region with non-null instance.");
+
+ // Enter replicating mode.
+ State->Instance = {0, 0};
+
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
+ State->Instance->Part = Part;
+ for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) {
+ State->Instance->Lane = Lane;
+ // Visit the VPBlocks connected to \p this, starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ }
+ }
+
+ // Exit replicating mode.
+ State->Instance.reset();
+}
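A replicating region re-executes its blocks once per point of the output iteration space, i.e. UF unroll parts times VF lanes. A minimal sketch of that iteration structure (RunBlocks is a placeholder for the RPO walk above):

#include <functional>

static void replicateRegion(unsigned UF, unsigned VF,
                            const std::function<void(unsigned, unsigned)> &RunBlocks) {
  for (unsigned Part = 0; Part < UF; ++Part)    // one instance per unroll part
    for (unsigned Lane = 0; Lane < VF; ++Lane)  // one instance per vector lane
      RunBlocks(Part, Lane);                    // stands in for the RPO traversal
}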
+
+void VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
+ IRBuilder<> &Builder = State.Builder;
+
+ if (Instruction::isBinaryOp(getOpcode())) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ State.set(this, V, Part);
+ return;
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *V = Builder.CreateNot(A);
+ State.set(this, V, Part);
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode for instruction");
+ }
+}
+
+void VPInstruction::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ generateInstruction(State, Part);
+}
+
+void VPInstruction::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"EMIT ";
+ print(O);
+ O << "\\l\"";
+}
+
+void VPInstruction::print(raw_ostream &O) const {
+ printAsOperand(O);
+ O << " = ";
+
+ switch (getOpcode()) {
+ case VPInstruction::Not:
+ O << "not";
+ break;
+ default:
+ O << Instruction::getOpcodeName(getOpcode());
+ }
+
+ for (const VPValue *Operand : operands()) {
+ O << " ";
+ Operand->printAsOperand(O);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
+ // 0. Set the reverse mapping from VPValues to Values for code generation.
+ for (auto &Entry : Value2VPValue)
+ State->VPValue2Value[Entry.second] = Entry.first;
+
+ BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
+ BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
+ assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
+ BasicBlock *VectorLatchBB = VectorHeaderBB;
+
+ // 1. Make room to generate basic-blocks inside loop body if needed.
+ VectorLatchBB = VectorHeaderBB->splitBasicBlock(
+ VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
+ Loop *L = State->LI->getLoopFor(VectorHeaderBB);
+ L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
+ // Remove the edge between Header and Latch to allow other connections.
+ // Temporarily terminate with unreachable until CFG is rewired.
+ // Note: this asserts the generated code's assumption that
+ // getFirstInsertionPt() can be dereferenced into an Instruction.
+ VectorHeaderBB->getTerminator()->eraseFromParent();
+ State->Builder.SetInsertPoint(VectorHeaderBB);
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+
+ // 2. Generate code in loop body.
+ State->CFG.PrevVPBB = nullptr;
+ State->CFG.PrevBB = VectorHeaderBB;
+ State->CFG.LastBB = VectorLatchBB;
+
+ for (VPBlockBase *Block : depth_first(Entry))
+ Block->execute(State);
+
+ // 3. Merge the temporary latch created with the last basic-block filled.
+ BasicBlock *LastBB = State->CFG.PrevBB;
+ // Connect LastBB to VectorLatchBB to facilitate their merge.
+ assert(isa<UnreachableInst>(LastBB->getTerminator()) &&
+ "Expected VPlan CFG to terminate with unreachable");
+ LastBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(VectorLatchBB, LastBB);
+
+ // Merge LastBB with Latch.
+ bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
+ (void)Merged;
+ assert(Merged && "Could not merge last basic block with latch.");
+ VectorLatchBB = LastBB;
+
+ updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
+}
+
+void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB) {
+ BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
+ assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
+ DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB);
+ // The vector body may be more than a single basic-block by this point.
+ // Update the dominator tree information inside the vector body by propagating
+ // it from header to latch, expecting only triangular control-flow, if any.
+ BasicBlock *PostDomSucc = nullptr;
+ for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+ assert(Succs.size() <= 2 &&
+ "Basic block in vector loop has more than 2 successors.");
+ PostDomSucc = Succs[0];
+ if (Succs.size() == 1) {
+ assert(PostDomSucc->getSinglePredecessor() &&
+ "PostDom successor has more than one predecessor.");
+ DT->addNewBlock(PostDomSucc, BB);
+ continue;
+ }
+ BasicBlock *InterimSucc = Succs[1];
+ if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
+ PostDomSucc = Succs[1];
+ InterimSucc = Succs[0];
+ }
+ assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
+ "One successor of a basic block does not lead to the other.");
+ assert(InterimSucc->getSinglePredecessor() &&
+ "Interim successor has more than one predecessor.");
+ assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 &&
+ "PostDom successor has more than two predecessors.");
+ DT->addNewBlock(InterimSucc, BB);
+ DT->addNewBlock(PostDomSucc, BB);
+ }
+}
+
+const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+ return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
+ Twine(getOrCreateBID(Block));
+}
+
+const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+ const std::string &Name = Block->getName();
+ if (!Name.empty())
+ return Name;
+ return "VPB" + Twine(getOrCreateBID(Block));
+}
+
+void VPlanPrinter::dump() {
+ Depth = 1;
+ bumpIndent(0);
+ OS << "digraph VPlan {\n";
+ OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
+ if (!Plan.getName().empty())
+ OS << "\\n" << DOT::EscapeString(Plan.getName());
+ if (!Plan.Value2VPValue.empty()) {
+ OS << ", where:";
+ for (auto Entry : Plan.Value2VPValue) {
+ OS << "\\n" << *Entry.second;
+ OS << DOT::EscapeString(" := ");
+ Entry.first->printAsOperand(OS, false);
+ }
+ }
+ OS << "\"]\n";
+ OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
+ OS << "edge [fontname=Courier, fontsize=30]\n";
+ OS << "compound=true\n";
+
+ for (VPBlockBase *Block : depth_first(Plan.getEntry()))
+ dumpBlock(Block);
+
+ OS << "}\n";
+}
+
+void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
+ if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
+ dumpBasicBlock(BasicBlock);
+ else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ dumpRegion(Region);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
+ bool Hidden, const Twine &Label) {
+ // Due to "dot" we print an edge between two regions as an edge between the
+  // exit basic block and the entry basic block of the respective regions.
+ const VPBlockBase *Tail = From->getExitBasicBlock();
+ const VPBlockBase *Head = To->getEntryBasicBlock();
+ OS << Indent << getUID(Tail) << " -> " << getUID(Head);
+ OS << " [ label=\"" << Label << '\"';
+ if (Tail != From)
+ OS << " ltail=" << getUID(From);
+ if (Head != To)
+ OS << " lhead=" << getUID(To);
+ if (Hidden)
+ OS << "; splines=none";
+ OS << "]\n";
+}
+
+void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
+ auto &Successors = Block->getSuccessors();
+ if (Successors.size() == 1)
+ drawEdge(Block, Successors.front(), false, "");
+ else if (Successors.size() == 2) {
+ drawEdge(Block, Successors.front(), false, "T");
+ drawEdge(Block, Successors.back(), false, "F");
+ } else {
+ unsigned SuccessorNumber = 0;
+ for (auto *Successor : Successors)
+ drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
+ }
+}
+
+void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
+ OS << Indent << getUID(BasicBlock) << " [label =\n";
+ bumpIndent(1);
+ OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
+ bumpIndent(1);
+ for (const VPRecipeBase &Recipe : *BasicBlock)
+ Recipe.print(OS, Indent);
+ bumpIndent(-2);
+ OS << "\n" << Indent << "]\n";
+ dumpEdges(BasicBlock);
+}
+
+void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
+ OS << Indent << "subgraph " << getUID(Region) << " {\n";
+ bumpIndent(1);
+ OS << Indent << "fontname=Courier\n"
+ << Indent << "label=\""
+ << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
+ << DOT::EscapeString(Region->getName()) << "\"\n";
+ // Dump the blocks of the region.
+ assert(Region->getEntry() && "Region contains no inner blocks.");
+ for (const VPBlockBase *Block : depth_first(Region->getEntry()))
+ dumpBlock(Block);
+ bumpIndent(-1);
+ OS << Indent << "}\n";
+ dumpEdges(Region);
+}
+
+void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
+ std::string IngredientString;
+ raw_string_ostream RSO(IngredientString);
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ if (!Inst->getType()->isVoidTy()) {
+ Inst->printAsOperand(RSO, false);
+ RSO << " = ";
+ }
+ RSO << Inst->getOpcodeName() << " ";
+ unsigned E = Inst->getNumOperands();
+ if (E > 0) {
+ Inst->getOperand(0)->printAsOperand(RSO, false);
+ for (unsigned I = 1; I < E; ++I)
+ Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
+ }
+ } else // !Inst
+ V->printAsOperand(RSO, false);
+ RSO.flush();
+ O << DOT::EscapeString(IngredientString);
+}
+
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN\\l\"";
+ for (auto &Instr : make_range(Begin, End))
+ O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\"";
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O,
+ const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN-INDUCTION";
+ if (Trunc) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\"";
+ } else
+ O << " " << VPlanIngredient(IV) << "\\l\"";
+}
+
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
+}
+
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n" << Indent << "\"BLEND ";
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (!User) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ Phi->getIncomingValue(0)->printAsOperand(O, false);
+ } else {
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
+ O << " ";
+ Phi->getIncomingValue(I)->printAsOperand(O, false);
+ O << "/";
+ User->getOperand(I)->printAsOperand(O);
+ }
+ }
+ O << "\\l\"";
+}
+
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
+ << VPlanIngredient(Ingredient);
+ if (AlsoPack)
+ O << " (S->V)";
+ O << "\\l\"";
+}
+
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
+ << "\\l\"";
+}
+
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
+ const Twine &Indent) const {
+ O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+ if (User) {
+ O << ", ";
+ User->getOperand(0)->printAsOperand(O);
+ }
+ O << "\\l\"";
+}
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
new file mode 100644
index 000000000000..2ccabfd6af25
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -0,0 +1,1194 @@
+//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations of the Vectorization Plan base classes:
+/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
+/// VPBlockBase, together implementing a Hierarchical CFG;
+/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
+/// treated as proper graphs for generic algorithms;
+/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
+/// within VPBasicBlocks;
+/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
+/// instruction;
+/// 5. The VPlan class holding a candidate for vectorization;
+/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
+/// These are documented in docs/VectorizationPlan.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+#include "VPlanValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/IR/IRBuilder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <map>
+#include <string>
+
+// The (re)use of existing LoopVectorize classes is subject to future VPlan
+// refactoring.
+namespace {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+} // namespace
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class InnerLoopVectorizer;
+class InterleaveGroup;
+class LoopInfo;
+class raw_ostream;
+class Value;
+class VPBasicBlock;
+class VPRegionBlock;
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer whereas the term "output IR" refers to code that is generated by
+/// the vectorizer.
+
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+ /// in [0..UF)
+ unsigned Part;
+
+ /// in [0..VF)
+ unsigned Lane;
+};
+
+/// This is a helper struct for maintaining vectorization state. It's used for
+/// mapping values from the original loop to their corresponding values in
+/// the new loop. Two mappings are maintained: one for vectorized values and
+/// one for scalarized values. Vectorized values are represented with UF
+/// vector values in the new loop, and scalarized values are represented with
+/// UF x VF scalar values in the new loop. UF and VF are the unroll and
+/// vectorization factors, respectively.
+///
+/// Entries can be added to either map with setVectorValue and setScalarValue,
+/// which assert that an entry was not already added before. If an entry is to
+/// replace an existing one, call resetVectorValue and resetScalarValue. This is
+/// currently needed to modify the mapped values during "fix-up" operations that
+/// occur once the first phase of widening is complete. These operations include
+/// type truncation and the second phase of recurrence widening.
+///
+/// Entries from either map can be retrieved using the getVectorValue and
+/// getScalarValue functions, which assert that the desired value exists.
+struct VectorizerValueMap {
+ friend struct VPTransformState;
+
+private:
+ /// The unroll factor. Each entry in the vector map contains UF vector values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
+ unsigned VF;
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ using VectorParts = SmallVector<Value *, 2>;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+
+public:
+ /// Construct an empty map with the given unroll and vectorization factors.
+ VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key and \p Instance.
+ bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
+ assert(Instance.Part < UF && "Queried Scalar Part is too large.");
+ assert(Instance.Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Instance.Part].size() == VF &&
+ "ScalarParts has wrong dimensions.");
+ return Entry[Instance.Part][Instance.Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key and
+ /// \p Instance.
+ Value *getScalarValue(Value *Key, const VPIteration &Instance) {
+ assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
+ /// value is not already set. If it is, use resetScalarValue() instead.
+ void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
+ assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
+ /// This function can be used to update values that have already been
+ /// scalarized. This is the case for "fix-up" operations including scalar phi
+ /// nodes for scalarized and predicated instructions.
+ void resetScalarValue(Value *Key, const VPIteration &Instance,
+ Value *Scalar) {
+ assert(hasScalarValue(Key, Instance) &&
+ "Scalar value not set for part and lane");
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+};
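+
+// A minimal usage sketch of the map above (illustrative only; OrigInst,
+// WideInst, TruncInst and ScalarInst are hypothetical placeholders):
+//
+//   VectorizerValueMap Map(/*UF=*/2, /*VF=*/4);
+//   Map.setVectorValue(OrigInst, /*Part=*/0, WideInst);    // first definition
+//   Value *W = Map.getVectorValue(OrigInst, /*Part=*/0);   // asserts existence
+//   Map.resetVectorValue(OrigInst, /*Part=*/0, TruncInst); // "fix-up" stage
+//   Map.setScalarValue(OrigInst, {/*Part=*/1, /*Lane=*/3}, ScalarInst);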
+
+/// This class is used to enable the VPlan to invoke a method of ILV. This is
+/// needed until the method is refactored out of ILV and becomes reusable.
+struct VPCallback {
+ virtual ~VPCallback() {}
+ virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
+};
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
+ VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT,
+ IRBuilder<> &Builder, VectorizerValueMap &ValueMap,
+ InnerLoopVectorizer *ILV, VPCallback &Callback)
+ : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder),
+ ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
+
+ /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
+ unsigned VF;
+ unsigned UF;
+
+ /// Hold the indices to generate specific scalar instructions. An empty
+ /// (None) Instance indicates that all instances are to be generated, using
+ /// either scalar or vector instructions.
+ Optional<VPIteration> Instance;
+
+ struct DataState {
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in
+ /// the new unrolled loop, where UF is the unroll factor.
+ typedef SmallVector<Value *, 2> PerPartValuesTy;
+
+ DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
+ } Data;
+
+ /// Get the generated Value for a given VPValue and a given Part. Note that
+ /// as some Defs are still created by ILV and managed in its ValueMap, this
+ /// method will delegate the call to ILV in such cases in order to provide
+ /// callers a consistent API.
+ /// \see set.
+ Value *get(VPValue *Def, unsigned Part) {
+ // If Values have been set for this Def return the one relevant for \p Part.
+ if (Data.PerPartOutput.count(Def))
+ return Data.PerPartOutput[Def][Part];
+ // Def is managed by ILV: bring the Values from ValueMap.
+ return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+ }
+
+ /// Set the generated Value for a given VPValue and a given Part.
+ void set(VPValue *Def, Value *V, unsigned Part) {
+ if (!Data.PerPartOutput.count(Def)) {
+ DataState::PerPartValuesTy Entry(UF);
+ Data.PerPartOutput[Def] = Entry;
+ }
+ Data.PerPartOutput[Def][Part] = V;
+ }
+
+ /// Hold state information used when constructing the CFG of the output IR,
+ /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+ struct CFGState {
+ /// The previous VPBasicBlock visited. Initially set to null.
+ VPBasicBlock *PrevVPBB = nullptr;
+
+ /// The previous IR BasicBlock created or used. Initially set to the new
+ /// header BasicBlock.
+ BasicBlock *PrevBB = nullptr;
+
+ /// The last IR BasicBlock in the output IR. Set to the new latch
+ /// BasicBlock, used for placing the newly created BasicBlocks.
+ BasicBlock *LastBB = nullptr;
+
+ /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+ /// of replication, maps the BasicBlock of the last replica created.
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+ CFGState() = default;
+ } CFG;
+
+ /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+ LoopInfo *LI;
+
+ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
+ DominatorTree *DT;
+
+ /// Hold a reference to the IRBuilder used to generate output IR code.
+ IRBuilder<> &Builder;
+
+ /// Hold a reference to the Value state information used when generating the
+ /// Values of the output IR.
+ VectorizerValueMap &ValueMap;
+
+ /// Hold a reference to a mapping between VPValues in VPlan and original
+ /// Values they correspond to.
+ VPValue2ValueTy VPValue2Value;
+
+ /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+ InnerLoopVectorizer *ILV;
+
+ VPCallback &Callback;
+};
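+
+// A minimal sketch of how per-part output Values are recorded and queried
+// while executing a VPlan (Def, V0 and V1 are hypothetical placeholders):
+//
+//   State.set(Def, V0, /*Part=*/0);
+//   State.set(Def, V1, /*Part=*/1);
+//   Value *V = State.get(Def, /*Part=*/1); // returns V1; Defs still managed
+//                                          // by ILV go through the Callback.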
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// An optional name for the block.
+ std::string Name;
+
+ /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+ /// it is a topmost VPBlockBase.
+ VPRegionBlock *Parent = nullptr;
+
+ /// List of predecessor blocks.
+ SmallVector<VPBlockBase *, 1> Predecessors;
+
+ /// List of successor blocks.
+ SmallVector<VPBlockBase *, 1> Successors;
+
+ /// Add \p Successor as the last successor to this block.
+ void appendSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Cannot add nullptr successor!");
+ Successors.push_back(Successor);
+ }
+
+ /// Add \p Predecessor as the last predecessor to this block.
+ void appendPredecessor(VPBlockBase *Predecessor) {
+ assert(Predecessor && "Cannot add nullptr predecessor!");
+ Predecessors.push_back(Predecessor);
+ }
+
+ /// Remove \p Predecessor from the predecessors of this block.
+ void removePredecessor(VPBlockBase *Predecessor) {
+ auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor);
+ assert(Pos != Predecessors.end() && "Predecessor does not exist");
+ Predecessors.erase(Pos);
+ }
+
+ /// Remove \p Successor from the successors of this block.
+ void removeSuccessor(VPBlockBase *Successor) {
+ auto Pos = std::find(Successors.begin(), Successors.end(), Successor);
+ assert(Pos != Successors.end() && "Successor does not exist");
+ Successors.erase(Pos);
+ }
+
+protected:
+ VPBlockBase(const unsigned char SC, const std::string &N)
+ : SubclassID(SC), Name(N) {}
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPBlockBase
+ /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPBlockBase objects. They are used for concrete
+ /// type identification.
+ using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
+
+ using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
+
+ virtual ~VPBlockBase() = default;
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPBlockID() const { return SubclassID; }
+
+ const VPRegionBlock *getParent() const { return Parent; }
+
+ void setParent(VPRegionBlock *P) { Parent = P; }
+
+ /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const;
+ VPBasicBlock *getEntryBasicBlock();
+
+ /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getExitBasicBlock() const;
+ VPBasicBlock *getExitBasicBlock();
+
+ const VPBlocksTy &getSuccessors() const { return Successors; }
+ VPBlocksTy &getSuccessors() { return Successors; }
+
+ const VPBlocksTy &getPredecessors() const { return Predecessors; }
+ VPBlocksTy &getPredecessors() { return Predecessors; }
+
+ /// \return the successor of this VPBlockBase if it has a single successor.
+ /// Otherwise return a null pointer.
+ VPBlockBase *getSingleSuccessor() const {
+ return (Successors.size() == 1 ? *Successors.begin() : nullptr);
+ }
+
+ /// \return the predecessor of this VPBlockBase if it has a single
+ /// predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSinglePredecessor() const {
+ return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
+ }
+
+ /// An Enclosing Block of a block B is any block containing B, including B
+ /// itself. \return the closest enclosing block starting from "this", which
+ /// has successors. \return the root enclosing block if all enclosing blocks
+ /// have no successors.
+ VPBlockBase *getEnclosingBlockWithSuccessors();
+
+ /// \return the closest enclosing block starting from "this", which has
+ /// predecessors. \return the root enclosing block if all enclosing blocks
+ /// have no predecessors.
+ VPBlockBase *getEnclosingBlockWithPredecessors();
+
+ /// \return the successors either attached directly to this VPBlockBase or, if
+ /// this VPBlockBase is the exit block of a VPRegionBlock and has no
+ /// successors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has successors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) successors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalSuccessors() {
+ return getEnclosingBlockWithSuccessors()->getSuccessors();
+ }
+
+ /// \return the hierarchical successor of this VPBlockBase if it has a single
+ /// hierarchical successor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalSuccessor() {
+ return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
+ }
+
+ /// \return the predecessors either attached directly to this VPBlockBase or,
+ /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
+ /// predecessors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has predecessors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) predecessors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalPredecessors() {
+ return getEnclosingBlockWithPredecessors()->getPredecessors();
+ }
+
+ /// \return the hierarchical predecessor of this VPBlockBase if it has a
+ /// single hierarchical predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalPredecessor() {
+ return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
+ }
+
+ /// Sets a given VPBlockBase \p Successor as the single successor and \return
+ /// \p Successor. The parent of this Block is copied to be the parent of
+ /// \p Successor.
+ VPBlockBase *setOneSuccessor(VPBlockBase *Successor) {
+ assert(Successors.empty() && "Setting one successor when others exist.");
+ appendSuccessor(Successor);
+ Successor->appendPredecessor(this);
+ Successor->Parent = Parent;
+ return Successor;
+ }
+
+ /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors. The parent of this Block is copied to be the parent of both
+ /// \p IfTrue and \p IfFalse.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ assert(Successors.empty() && "Setting two successors when others exist.");
+ appendSuccessor(IfTrue);
+ appendSuccessor(IfFalse);
+ IfTrue->appendPredecessor(this);
+ IfFalse->appendPredecessor(this);
+ IfTrue->Parent = Parent;
+ IfFalse->Parent = Parent;
+ }
+
+ void disconnectSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Successor to disconnect is null.");
+ removeSuccessor(Successor);
+ Successor->removePredecessor(this);
+ }
+
+ /// The method which generates the output IR that corresponds to this
+ /// VPBlockBase, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState *State) = 0;
+
+ /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+ static void deleteCFG(VPBlockBase *Entry);
+};
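+
+// A minimal sketch of wiring VPBlockBases into a Hierarchical CFG (the block
+// names are hypothetical):
+//
+//   VPBasicBlock *Entry = new VPBasicBlock("entry");
+//   VPBasicBlock *Then = new VPBasicBlock("if.then");
+//   VPBasicBlock *Else = new VPBasicBlock("if.else");
+//   Entry->setTwoSuccessors(Then, Else); // also registers the predecessors
+//   assert(Then->getSinglePredecessor() == Entry);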
+
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+/// instructions.
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
+ friend VPBasicBlock;
+
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// Each VPRecipe belongs to a single VPBasicBlock.
+ VPBasicBlock *Parent = nullptr;
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
+ /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPRecipeBase objects. They are used for concrete
+ /// type identification.
+ using VPRecipeTy = enum {
+ VPBlendSC,
+ VPBranchOnMaskSC,
+ VPInstructionSC,
+ VPInterleaveSC,
+ VPPredInstPHISC,
+ VPReplicateSC,
+ VPWidenIntOrFpInductionSC,
+ VPWidenMemoryInstructionSC,
+ VPWidenPHISC,
+ VPWidenSC,
+ };
+
+ VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
+ virtual ~VPRecipeBase() = default;
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPRecipeID() const { return SubclassID; }
+
+ /// \return the VPBasicBlock which this VPRecipe belongs to.
+ VPBasicBlock *getParent() { return Parent; }
+ const VPBasicBlock *getParent() const { return Parent; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRecipe, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState &State) = 0;
+
+ /// Each recipe prints itself.
+ virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+};
+
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// Like any Recipe, it may generate a sequence of IR instructions when
+/// executed, but these instructions always form a single-def expression, as
+/// the VPInstruction is itself a single def-use vertex.
+class VPInstruction : public VPUser, public VPRecipeBase {
+public:
+ /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+ enum { Not = Instruction::OtherOpsEnd + 1 };
+
+private:
+ typedef unsigned char OpcodeTy;
+ OpcodeTy Opcode;
+
+ /// Utility method serving execute(): generates a single instance of the
+ /// modeled instruction.
+ void generateInstruction(VPTransformState &State, unsigned Part);
+
+public:
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPUser(VPValue::VPInstructionSC, Operands),
+ VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPInstructionSC;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
+ }
+
+ unsigned getOpcode() const { return Opcode; }
+
+ /// Generate the instruction.
+ /// TODO: We currently execute only per-part unless a specific instance is
+ /// provided.
+ void execute(VPTransformState &State) override;
+
+ /// Print the Recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+
+ /// Print the VPInstruction.
+ void print(raw_ostream &O) const;
+};
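+
+// A minimal sketch of creating a VPlan-level instruction directly (Mask is a
+// hypothetical VPValue and MaskVPBB a hypothetical VPBasicBlock):
+//
+//   VPInstruction *NotMask = new VPInstruction(VPInstruction::Not, {Mask});
+//   MaskVPBB->appendRecipe(NotMask); // a VPInstruction is also a VPRecipeBase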
+
+/// VPWidenRecipe is a recipe for producing a widened (vector-type) copy of
+/// each Instruction in its ingredients, independently and in order. It covers
+/// most of the traditional vectorization cases where each ingredient transforms
+/// into a vectorized version of itself.
+class VPWidenRecipe : public VPRecipeBase {
+private:
+ /// Hold the ingredients by pointing to their original BasicBlock location.
+ BasicBlock::iterator Begin;
+ BasicBlock::iterator End;
+
+public:
+ VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
+ End = I->getIterator();
+ Begin = End++;
+ }
+
+ ~VPWidenRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenSC;
+ }
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override;
+
+ /// Augment the recipe to include Instr, if it lies at its End.
+ bool appendInstruction(Instruction *Instr) {
+ if (End != Instr->getIterator())
+ return false;
+ End++;
+ return true;
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector and scalar values.
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+private:
+ PHINode *IV;
+ TruncInst *Trunc;
+
+public:
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {}
+ ~VPWidenIntOrFpInductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
+ }
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for handling all phi nodes except for integer and FP inductions.
+class VPWidenPHIRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+public:
+ VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
+class VPBlendRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+ /// If not null, the blend operation is a User of the incoming masks.
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks)
+ : VPRecipeBase(VPBlendSC), Phi(Phi) {
+ assert((Phi->getNumIncomingValues() == 1 ||
+ Phi->getNumIncomingValues() == Masks.size()) &&
+ "Expected the same number of incoming values and masks");
+ if (!Masks.empty())
+ User.reset(new VPUser(Masks));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of
+/// loads or stores into one wide load/store and shuffles.
+class VPInterleaveRecipe : public VPRecipeBase {
+private:
+ const InterleaveGroup *IG;
+
+public:
+ VPInterleaveRecipe(const InterleaveGroup *IG)
+ : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+ ~VPInterleaveRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
+ }
+
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+
+ const InterleaveGroup *getInterleaveGroup() { return IG; }
+};
+
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to
+/// be uniform, only a single copy, for lane zero, will be generated.
+class VPReplicateRecipe : public VPRecipeBase {
+private:
+ /// The instruction being replicated.
+ Instruction *Ingredient;
+
+ /// Indicator if only a single replica per lane is needed.
+ bool IsUniform;
+
+ /// Indicator if the replicas are also predicated.
+ bool IsPredicated;
+
+ /// Indicator if the scalar values should also be packed into a vector.
+ bool AlsoPack;
+
+public:
+ VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
+ : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
+ IsPredicated(IsPredicated) {
+ // Retain the previous behavior of predicateInstructions(), where an
+ // insert-element of a predicated instruction got hoisted into the
+ // predicated basic block iff it was its only user. This is achieved by
+ // having predicated instructions also pack their values into a vector by
+ // default unless they have a replicated user which uses their scalar value.
+ AlsoPack = IsPredicated && !I->use_empty();
+ }
+
+ ~VPReplicateRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
+ }
+
+ /// Generate replicas of the desired Ingredient. Replicas will be generated
+ /// for all parts and lanes unless a specific part and lane are specified in
+ /// the \p State.
+ void execute(VPTransformState &State) override;
+
+ void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for generating conditional branches on the bits of a mask.
+class VPBranchOnMaskRecipe : public VPRecipeBase {
+private:
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+ if (BlockInMask) // nullptr means all-one mask.
+ User.reset(new VPUser({BlockInMask}));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
+ }
+
+ /// Generate the extraction of the appropriate bit from the block mask and the
+ /// conditional branch.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+ if (User)
+ O << *User->getOperand(0);
+ else
+ O << " All-One";
+ O << "\\l\"";
+ }
+};
+
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
+class VPPredInstPHIRecipe : public VPRecipeBase {
+private:
+ Instruction *PredInst;
+
+public:
+ /// Construct a VPPredInstPHIRecipe given \p PredInst, whose value needs phi
+ /// nodes after merging back from a Branch-on-Mask.
+ VPPredInstPHIRecipe(Instruction *PredInst)
+ : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+ ~VPPredInstPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+ }
+
+ /// Generates phi nodes for live-outs as needed to retain SSA form.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A Recipe for widening load/store operations.
+/// TODO: We currently execute only per-part unless a specific instance is
+/// provided.
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+private:
+ Instruction &Instr;
+ std::unique_ptr<VPUser> User;
+
+public:
+ VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {
+ if (Mask) // Create a VPUser to register as a user of the mask.
+ User.reset(new VPUser({Mask}));
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+ }
+
+ /// Generate the wide load/store.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipes, each representing a sequence of
+/// output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+ using RecipeListTy = iplist<VPRecipeBase>;
+
+private:
+ /// The VPRecipes held in the order of output instructions to generate.
+ RecipeListTy Recipes;
+
+public:
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+ : VPBlockBase(VPBasicBlockSC, Name.str()) {
+ if (Recipe)
+ appendRecipe(Recipe);
+ }
+
+ ~VPBasicBlock() override { Recipes.clear(); }
+
+ /// Recipe iterators...
+ using iterator = RecipeListTy::iterator;
+ using const_iterator = RecipeListTy::const_iterator;
+ using reverse_iterator = RecipeListTy::reverse_iterator;
+ using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
+
+ //===--------------------------------------------------------------------===//
+ /// Recipe iterator methods
+ ///
+ inline iterator begin() { return Recipes.begin(); }
+ inline const_iterator begin() const { return Recipes.begin(); }
+ inline iterator end() { return Recipes.end(); }
+ inline const_iterator end() const { return Recipes.end(); }
+
+ inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+ inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+ inline reverse_iterator rend() { return Recipes.rend(); }
+ inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+ inline size_t size() const { return Recipes.size(); }
+ inline bool empty() const { return Recipes.empty(); }
+ inline const VPRecipeBase &front() const { return Recipes.front(); }
+ inline VPRecipeBase &front() { return Recipes.front(); }
+ inline const VPRecipeBase &back() const { return Recipes.back(); }
+ inline VPRecipeBase &back() { return Recipes.back(); }
+
+ /// \brief Returns a pointer to a member of the recipe list.
+ static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+ return &VPBasicBlock::Recipes;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+ }
+
+ void insert(VPRecipeBase *Recipe, iterator InsertPt) {
+ assert(Recipe && "No recipe to append.");
+ assert(!Recipe->Parent && "Recipe already in VPlan");
+ Recipe->Parent = this;
+ Recipes.insert(InsertPt, Recipe);
+ }
+
+ /// Augment the existing recipes of a VPBasicBlock with an additional
+ /// \p Recipe as the last recipe.
+ void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPBasicBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+
+private:
+ /// Create an IR BasicBlock to hold the output instructions generated by this
+ /// VPBasicBlock, and return it. Update the CFGState accordingly.
+ BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
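+
+// A minimal sketch of populating a VPBasicBlock with recipes (I is a
+// hypothetical input-IR Instruction and Phi a hypothetical PHINode):
+//
+//   VPBasicBlock *VPBB = new VPBasicBlock("vector.body");
+//   VPBB->appendRecipe(new VPWidenRecipe(I));
+//   VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+//   for (const VPRecipeBase &Recipe : *VPBB)
+//     Recipe.print(dbgs(), "  ");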
+
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VFs. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+private:
+ /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Entry;
+
+ /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Exit;
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool IsReplicator;
+
+public:
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ IsReplicator(IsReplicator) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exit->setParent(this);
+ }
+
+ ~VPRegionBlock() override {
+ if (Entry)
+ deleteCFG(Entry);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+ }
+
+ const VPBlockBase *getEntry() const { return Entry; }
+ VPBlockBase *getEntry() { return Entry; }
+
+ const VPBlockBase *getExit() const { return Exit; }
+ VPBlockBase *getExit() { return Exit; }
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool isReplicator() const { return IsReplicator; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRegionBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+};
+
+/// VPlan models a candidate for vectorization, encoding various decisions taken
+/// to produce efficient output IR, including which branches, basic-blocks and
+/// output IR instructions to generate, and their cost. VPlan holds a
+/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
+/// VPBlock.
+class VPlan {
+ friend class VPlanPrinter;
+
+private:
+ /// Hold the single entry to the Hierarchical CFG of the VPlan.
+ VPBlockBase *Entry;
+
+ /// Holds the VFs applicable to this VPlan.
+ SmallSet<unsigned, 2> VFs;
+
+ /// Holds the name of the VPlan, for printing.
+ std::string Name;
+
+ /// Holds a mapping between Values and their corresponding VPValue inside
+ /// VPlan.
+ Value2VPValueTy Value2VPValue;
+
+public:
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+
+ ~VPlan() {
+ if (Entry)
+ VPBlockBase::deleteCFG(Entry);
+ for (auto &MapEntry : Value2VPValue)
+ delete MapEntry.second;
+ }
+
+ /// Generate the IR code for this VPlan.
+ void execute(struct VPTransformState *State);
+
+ VPBlockBase *getEntry() { return Entry; }
+ const VPBlockBase *getEntry() const { return Entry; }
+
+ VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+
+ void addVF(unsigned VF) { VFs.insert(VF); }
+
+ bool hasVF(unsigned VF) { return VFs.count(VF); }
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ void addVPValue(Value *V) {
+ assert(V && "Trying to add a null Value to VPlan");
+ assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+ Value2VPValue[V] = new VPValue();
+ }
+
+ VPValue *getVPValue(Value *V) {
+ assert(V && "Trying to get the VPValue of a null Value");
+ assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+ return Value2VPValue[V];
+ }
+
+private:
+ /// Add to the given dominator tree the header block and every new basic block
+ /// that was created between it and the latch block, inclusive.
+ static void updateDominatorTree(DominatorTree *DT,
+ BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB);
+};
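+
+// A minimal sketch of assembling a VPlan (TopRegion is a hypothetical
+// VPRegionBlock and LiveIn a hypothetical input-IR Value):
+//
+//   VPlan Plan;
+//   Plan.setEntry(TopRegion);
+//   Plan.setName("Initial VPlan");
+//   Plan.addVF(4);
+//   assert(Plan.hasVF(4));
+//   Plan.addVPValue(LiveIn);
+//   VPValue *VPV = Plan.getVPValue(LiveIn);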
+
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+ friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan);
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const struct VPlanIngredient &I);
+
+private:
+ raw_ostream &OS;
+ VPlan &Plan;
+ unsigned Depth = 0;
+ unsigned TabWidth = 2;
+ std::string Indent;
+ unsigned BID = 0;
+ SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+ VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {}
+
+ /// Handle indentation.
+ void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+ /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block);
+
+ /// Print the information related to the CFG edges going out of a given
+ /// \p Block, followed by printing the successor blocks themselves.
+ void dumpEdges(const VPBlockBase *Block);
+
+ /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+ /// its successor blocks.
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+ /// Print a given \p Region of the Plan.
+ void dumpRegion(const VPRegionBlock *Region);
+
+ unsigned getOrCreateBID(const VPBlockBase *Block) {
+ return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+ }
+
+ const Twine getOrCreateName(const VPBlockBase *Block);
+
+ const Twine getUID(const VPBlockBase *Block);
+
+ /// Print the information related to a CFG edge between two VPBlockBases.
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+ const Twine &Label);
+
+ void dump();
+
+ static void printAsIngredient(raw_ostream &O, Value *V);
+};
+
+struct VPlanIngredient {
+ Value *V;
+
+ VPlanIngredient(Value *V) : V(V) {}
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+ VPlanPrinter::printAsIngredient(OS, I.V);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) {
+ VPlanPrinter Printer(OS, Plan);
+ Printer.dump();
+ return OS;
+}
+
+//===--------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs //
+//===--------------------------------------------------------------------===//
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a
+// graph of VPBlockBase nodes...
+
+template <> struct GraphTraits<VPBlockBase *> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+ using NodeRef = const VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a
+// graph of VPBlockBase nodes... and to walk it in inverse order. Inverse order
+// for a VPBlockBase is considered to be when traversing the predecessors of a
+// VPBlockBase instead of its successors.
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(Inverse<VPBlockBase *> B) { return B.Graph; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h
new file mode 100644
index 000000000000..d6eb3397d044
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanBuilder.h
@@ -0,0 +1,61 @@
+//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+class VPBuilder {
+private:
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+public:
+ VPBuilder() {}
+
+ /// \brief This specifies that created VPInstructions should be appended to
+ /// the end of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+};
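+
+// A minimal sketch of VPBuilder use for forming mask expressions (MaskVPBB,
+// EdgeMaskA and EdgeMaskB are hypothetical):
+//
+//   VPBuilder Builder;
+//   Builder.setInsertPoint(MaskVPBB);              // append at the block end
+//   VPValue *AnyEdge = Builder.createOr(EdgeMaskA, EdgeMaskB);
+//   VPValue *NoEdge = Builder.createNot(AnyEdge);  // VPInstruction::Not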
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h
new file mode 100644
index 000000000000..50966891e0eb
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanValue.h
@@ -0,0 +1,146 @@
+//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the entities induced by Vectorization
+/// Plans, e.g. the instructions the VPlan intends to generate if executed.
+/// VPlan models the following entities:
+/// VPValue
+/// |-- VPUser
+/// | |-- VPInstruction
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+// Forward declarations.
+class VPUser;
+
+// This is the base class of the VPlan Def/Use graph, used for modeling the data
+// flow into, within and out of the VPlan. VPValues can stand for live-ins
+// coming from the input IR, instructions which VPlan will generate if executed
+// and live-outs which the VPlan will need to fix accordingly.
+class VPValue {
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ SmallVector<VPUser *, 1> Users;
+
+protected:
+ VPValue(const unsigned char SC) : SubclassID(SC) {}
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPValue that
+ /// is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPValue objects. They are used for concrete
+ /// type identification.
+ enum { VPValueSC, VPUserSC, VPInstructionSC };
+
+ VPValue() : SubclassID(VPValueSC) {}
+ VPValue(const VPValue &) = delete;
+ VPValue &operator=(const VPValue &) = delete;
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPValueID() const { return SubclassID; }
+
+ void printAsOperand(raw_ostream &OS) const {
+ OS << "%vp" << (unsigned short)(unsigned long long)this;
+ }
+
+ unsigned getNumUsers() const { return Users.size(); }
+ void addUser(VPUser &User) { Users.push_back(&User); }
+
+ typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
+ typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
+ typedef iterator_range<user_iterator> user_range;
+ typedef iterator_range<const_user_iterator> const_user_range;
+
+ user_iterator user_begin() { return Users.begin(); }
+ const_user_iterator user_begin() const { return Users.begin(); }
+ user_iterator user_end() { return Users.end(); }
+ const_user_iterator user_end() const { return Users.end(); }
+ user_range users() { return user_range(user_begin(), user_end()); }
+ const_user_range users() const {
+ return const_user_range(user_begin(), user_end());
+ }
+};
+
+typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
+typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
+
+raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
+
+/// This class augments VPValue with operands which provide the inverse def-use
+/// edges from VPValue's users to their defs.
+class VPUser : public VPValue {
+private:
+ SmallVector<VPValue *, 2> Operands;
+
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
+protected:
+ VPUser(const unsigned char SC) : VPValue(SC) {}
+ VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+public:
+ VPUser() : VPValue(VPValue::VPUserSC) {}
+ VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
+ VPUser(std::initializer_list<VPValue *> Operands)
+ : VPUser(ArrayRef<VPValue *>(Operands)) {}
+ VPUser(const VPUser &) = delete;
+ VPUser &operator=(const VPUser &) = delete;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() >= VPUserSC &&
+ V->getVPValueID() <= VPInstructionSC;
+ }
+
+ unsigned getNumOperands() const { return Operands.size(); }
+ inline VPValue *getOperand(unsigned N) const {
+ assert(N < Operands.size() && "Operand index out of bounds");
+ return Operands[N];
+ }
+
+ typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
+ typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
+ typedef iterator_range<operand_iterator> operand_range;
+ typedef iterator_range<const_operand_iterator> const_operand_range;
+
+ operand_iterator op_begin() { return Operands.begin(); }
+ const_operand_iterator op_begin() const { return Operands.begin(); }
+ operand_iterator op_end() { return Operands.end(); }
+ const_operand_iterator op_end() const { return Operands.end(); }
+ operand_range operands() { return operand_range(op_begin(), op_end()); }
+ const_operand_range operands() const {
+ return const_operand_range(op_begin(), op_end());
+ }
+};
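+
+// A minimal sketch of the def-use edges a VPUser maintains (A and B are
+// hypothetical defs):
+//
+//   VPValue *A = new VPValue(), *B = new VPValue();
+//   VPUser *U = new VPUser({A, B}); // registers U as a user of both operands
+//   assert(U->getOperand(1) == B && A->getNumUsers() == 1);
+//   for (VPUser *UserOfA : A->users())
+//     UserOfA->printAsOperand(dbgs());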
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index fb2f509dcbaa..b04905bfc6fa 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -18,7 +18,6 @@
#include "llvm-c/Transforms/Vectorize.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
using namespace llvm;