Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2577 ++++++++++++++++----------
 1 file changed, 1656 insertions(+), 921 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 012b10c8a9b00..fbcdc0df0f1c4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -47,62 +47,97 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "VPlan.h"
+#include "VPlanBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
-#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <map>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -245,12 +280,12 @@ createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
namespace {
-// Forward declarations.
-class LoopVectorizeHints;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
+} // end anonymous namespace
+
/// Returns true if the given loop body has a cycle, excluding the loop
/// itself.
static bool hasCyclesInLoopBody(const Loop &L) {
@@ -324,7 +359,6 @@ static unsigned getMemInstAddressSpace(Value *I) {
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
-
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
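
A minimal standalone sketch of this check, assuming the rule stated in the comment above; the hunk truncates the committed body, so the reconstruction below is illustrative, not the patch's code:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"

// Hypothetical reconstruction: a type is irregular at VF if an array of
// VF elements is not bitcast-compatible with the <VF x Ty> vector.
static bool hasIrregularTypeSketch(llvm::Type *Ty, const llvm::DataLayout &DL,
                                   unsigned VF) {
  if (VF > 1) {
    auto *VectorTy = llvm::VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }
  // VF == 1: irregular if the scalar type itself needs padding.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}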
@@ -349,7 +383,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
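
For reference, a minimal sketch of the helper after this change, assuming a floating-point value V; setFast() replaces the removed setUnsafeAlgebra() and turns on the full set of fast-math flags:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

static llvm::Value *addFastMathFlagSketch(llvm::Value *V) {
  if (llvm::isa<llvm::FPMathOperator>(V)) {
    llvm::FastMathFlags Flags;
    Flags.setFast(); // enables nnan, ninf, nsz, arcp, contract, afn, reassoc
    llvm::cast<llvm::Instruction>(V)->setFastMathFlags(Flags);
  }
  return V;
}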
@@ -362,6 +396,8 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
: ConstantFP::get(Ty, C);
}
+namespace llvm {
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -387,16 +423,16 @@ public:
LoopVectorizationCostModel *CM)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()), Induction(nullptr),
- OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
- TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
- AddedSafetyChecks(false) {}
+ Builder(PSE.getSE()->getContext()),
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ virtual ~InnerLoopVectorizer() = default;
/// Create a new empty loop. Unlink the old loop and connect the new one.
- void createVectorizedLoopSkeleton();
+ /// Return the pre-header block of the new loop.
+ BasicBlock *createVectorizedLoopSkeleton();
- /// Vectorize a single instruction within the innermost loop.
- void vectorizeInstruction(Instruction &I);
+ /// Widen a single instruction within the innermost loop.
+ void widenInstruction(Instruction &I);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
@@ -404,28 +440,83 @@ public:
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
- virtual ~InnerLoopVectorizer() {}
-
-protected:
- /// A small list of PHINodes.
- typedef SmallVector<PHINode *, 4> PhiVector;
-
/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
- typedef SmallVector<Value *, 2> VectorParts;
+ using VectorParts = SmallVector<Value *, 2>;
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+
+ /// A helper function to scalarize a single Instruction in the innermost loop.
+ /// Generates a sequence of scalar instances for each lane between \p MinLane
+ /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
+ /// inclusive.
+ void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
+ bool IfPredicateInstr);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
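
A hedged sketch of the on-demand contract described above, reduced to the loop-invariant broadcast case; the names below are illustrative and not the class's actual members:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"

struct OnDemandMapSketch {
  unsigned UF = 0, VF = 0; // unroll and vectorization factors

  // One vector value per unroll part for each original value.
  llvm::DenseMap<llvm::Value *, llvm::SmallVector<llvm::Value *, 2>> Map;

  llvm::Value *getOrCreateVectorValue(llvm::Value *V, unsigned Part,
                                      llvm::IRBuilder<> &Builder) {
    auto &Parts = Map[V];
    if (Parts.empty())
      Parts.assign(UF, nullptr);
    if (Parts[Part])
      return Parts[Part]; // already widened when its definition was visited
    // Neither vectorized nor scalarized: assume loop invariance and
    // broadcast the scalar once for this unroll part.
    Parts[Part] = Builder.CreateVectorSplat(VF, V, "broadcast");
    return Parts[Part];
  }
};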
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to.
+ void vectorizeInterleaveGroup(Instruction *Instr);
+
+ /// Vectorize Load and Store instructions, optionally masking the vector
+ /// operations if \p BlockInMask is non-null.
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
+
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+protected:
+ friend class LoopVectorizationPlanner;
+
+ /// A small list of PHINodes.
+ using PhiVector = SmallVector<PHINode *, 4>;
/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// vectorization factor.
- typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
-
- // When we if-convert we need to create edge masks. We have to cache values
- // so that we don't end up with exponential recursion/IR.
- typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
- EdgeMaskCacheTy;
- typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
@@ -457,40 +548,14 @@ protected:
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
- /// Predicate conditional instructions that require predication on their
- /// respective conditions.
- void predicateInstructions();
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VectorParts createBlockInMask(BasicBlock *BB);
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
-
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
- /// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
- /// scalarized instruction behind an if block predicated on the control
- /// dependence of the instruction.
- void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
-
- /// Vectorize Load and Store instructions,
- virtual void vectorizeMemoryInstruction(Instruction *Instr);
-
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -521,11 +586,6 @@ protected:
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Instruction *EntryVal);
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
-
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;
@@ -533,37 +593,19 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part and vector index \p Lane. If the value has
- /// been vectorized but not scalarized, the necessary extractelement
- /// instruction will be generated.
- Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
-
- /// Try to vectorize the interleaved access group that \p Instr belongs to.
- void vectorizeInterleaveGroup(Instruction *Instr);
+ /// If there is a cast involved in the induction variable \p ID, which should
+ /// be ignored in the vectorized loop body, this function records the
+ /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+ /// cast. We had already proved that the casted Phi is equal to the uncasted
+ /// Phi in the vectorized loop (under a runtime guard), and therefore
+ /// there is no need to vectorize the cast - the same value can be used in the
+ /// vector loop for both the Phi and the cast.
+ /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+ /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
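
A source-level example (assumed, for illustration) of the redundant cast this hook records: under a runtime guard proving the trip count fits the narrow type, the cast below has the same SCEV as the induction phi, so the vector loop reuses the phi's widened value for both:

void useNarrowIndex(int *A, long N) {
  for (long i = 0; i < N; ++i)
    A[(int)i] = 0; // (int)i shares i's SCEV under the guard, so the
                   // truncate needs no vector counterpart of its own
}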
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -574,12 +616,19 @@ protected:
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);
+ /// Returns a bitcasted value to the requested vector type.
+ /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+ Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL);
+
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
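
A minimal sketch of the bypass condition this emits, with assumed parameters: the vector loop consumes VF * UF iterations per trip, so it is skipped entirely when the scalar trip count is smaller.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *emitMinItersCheckSketch(llvm::IRBuilder<> &Builder,
                                            llvm::Value *TripCount,
                                            unsigned VF, unsigned UF) {
  llvm::Value *Step =
      llvm::ConstantInt::get(TripCount->getType(), VF * UF);
  // True when the vector loop cannot execute even once and must be
  // bypassed; the branch itself is emitted by the caller.
  return Builder.CreateICmpULT(TripCount, Step, "min.iters.check");
}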
+
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
@@ -601,149 +650,32 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
- /// \brief Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
- /// This is a helper class for maintaining vectorization state. It's used for
- /// mapping values from the original loop to their corresponding values in
- /// the new loop. Two mappings are maintained: one for vectorized values and
- /// one for scalarized values. Vectorized values are represented with UF
- /// vector values in the new loop, and scalarized values are represented with
- /// UF x VF scalar values in the new loop. UF and VF are the unroll and
- /// vectorization factors, respectively.
- ///
- /// Entries can be added to either map with setVectorValue and setScalarValue,
- /// which assert that an entry was not already added before. If an entry is to
- /// replace an existing one, call resetVectorValue. This is currently needed
- /// to modify the mapped values during "fix-up" operations that occur once the
- /// first phase of widening is complete. These operations include type
- /// truncation and the second phase of recurrence widening.
- ///
- /// Entries from either map can be retrieved using the getVectorValue and
- /// getScalarValue functions, which assert that the desired value exists.
-
- struct ValueMap {
-
- /// Construct an empty map with the given unroll and vectorization factors.
- ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
-
- /// \return True if the map has any vector entry for \p Key.
- bool hasAnyVectorValue(Value *Key) const {
- return VectorMapStorage.count(Key);
- }
-
- /// \return True if the map has a vector entry for \p Key and \p Part.
- bool hasVectorValue(Value *Key, unsigned Part) const {
- assert(Part < UF && "Queried Vector Part is too large.");
- if (!hasAnyVectorValue(Key))
- return false;
- const VectorParts &Entry = VectorMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
- return Entry[Part] != nullptr;
- }
-
- /// \return True if the map has any scalar entry for \p Key.
- bool hasAnyScalarValue(Value *Key) const {
- return ScalarMapStorage.count(Key);
- }
-
- /// \return True if the map has a scalar entry for \p Key, \p Part and
- /// \p Part.
- bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
- assert(Part < UF && "Queried Scalar Part is too large.");
- assert(Lane < VF && "Queried Scalar Lane is too large.");
- if (!hasAnyScalarValue(Key))
- return false;
- const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
- assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
- return Entry[Part][Lane] != nullptr;
- }
-
- /// Retrieve the existing vector value that corresponds to \p Key and
- /// \p Part.
- Value *getVectorValue(Value *Key, unsigned Part) {
- assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
- return VectorMapStorage[Key][Part];
- }
-
- /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
- /// and \p Lane.
- Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
- assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
- return ScalarMapStorage[Key][Part][Lane];
- }
-
- /// Set a vector value associated with \p Key and \p Part. Assumes such a
- /// value is not already set. If it is, use resetVectorValue() instead.
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
- if (!VectorMapStorage.count(Key)) {
- VectorParts Entry(UF);
- VectorMapStorage[Key] = Entry;
- }
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Set a scalar value associated with \p Key for \p Part and \p Lane.
- /// Assumes such a value is not already set.
- void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
- Value *Scalar) {
- assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
- if (!ScalarMapStorage.count(Key)) {
- ScalarParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part].resize(VF, nullptr);
- // TODO: Consider storing uniform values only per-part, as they occupy
- // lane 0 only, keeping the other VF-1 redundant entries null.
- ScalarMapStorage[Key] = Entry;
- }
- ScalarMapStorage[Key][Part][Lane] = Scalar;
- }
-
- /// Reset the vector value associated with \p Key for the given \p Part.
- /// This function can be used to update values that have already been
- /// vectorized. This is the case for "fix-up" operations including type
- /// truncation and the second phase of recurrence vectorization.
- void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(hasVectorValue(Key, Part) && "Vector value not set for part");
- VectorMapStorage[Key][Part] = Vector;
- }
-
- private:
- /// The unroll factor. Each entry in the vector map contains UF vector
- /// values.
- unsigned UF;
-
- /// The vectorization factor. Each entry in the scalar map contains UF x VF
- /// scalar values.
- unsigned VF;
-
- /// The vector and scalar map storage. We use std::map and not DenseMap
- /// because insertions to DenseMap invalidate its iterators.
- std::map<Value *, VectorParts> VectorMapStorage;
- std::map<Value *, ScalarParts> ScalarMapStorage;
- };
-
/// The original loop.
Loop *OrigLoop;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// dynamic knowledge to simplify SCEV expressions and converts them to a
/// more usable form.
PredicatedScalarEvolution &PSE;
+
/// Loop Info.
LoopInfo *LI;
+
/// Dominator Tree.
DominatorTree *DT;
+
/// Alias Analysis.
AliasAnalysis *AA;
+
/// Target Library Info.
const TargetLibraryInfo *TLI;
+
/// Target Transform Info.
const TargetTransformInfo *TTI;
+
/// Assumption Cache.
AssumptionCache *AC;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
@@ -758,7 +690,6 @@ protected:
/// vector elements.
unsigned VF;
-protected:
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -770,39 +701,45 @@ protected:
/// The vector-loop preheader.
BasicBlock *LoopVectorPreHeader;
+
/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader;
+
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
+
/// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
+
/// The vector loop body.
BasicBlock *LoopVectorBody;
+
/// The scalar loop body.
BasicBlock *LoopScalarBody;
+
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
- PHINode *Induction;
+ PHINode *Induction = nullptr;
+
/// The induction variable of the old basic block.
- PHINode *OldInduction;
+ PHINode *OldInduction = nullptr;
/// Maps values from the original loop to their corresponding values in the
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
- ValueMap VectorLoopValueMap;
+ VectorizerValueMap VectorLoopValueMap;
+
+ /// Store instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
- /// Store instructions that should be predicated, as a pair
- /// <StoreInst, Predicate>
- SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
/// Trip count of the original loop.
- Value *TripCount;
+ Value *TripCount = nullptr;
+
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount;
+ Value *VectorTripCount = nullptr;
/// The legality analysis.
LoopVectorizationLegality *Legal;
@@ -811,7 +748,7 @@ protected:
LoopVectorizationCostModel *Cost;
// Record whether runtime checks are added.
- bool AddedSafetyChecks;
+ bool AddedSafetyChecks = false;
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
@@ -831,7 +768,6 @@ public:
UnrollFactor, LVL, CM) {}
private:
- void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
@@ -839,6 +775,8 @@ private:
Value *reverseVector(Value *Vec) override;
};
+} // end namespace llvm
+
/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@@ -861,7 +799,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
const DILocation *DIL = Inst->getDebugLoc();
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst))
B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
else
B.SetCurrentDebugLocation(DIL);
@@ -908,6 +847,8 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
}
}
+namespace llvm {
+
/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
@@ -937,7 +878,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
class InterleaveGroup {
public:
InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
- : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+ : Align(Align), InsertPos(Instr) {
assert(Align && "The alignment should be non-zero");
Factor = std::abs(Stride);
@@ -1010,13 +951,26 @@ public:
Instruction *getInsertPos() const { return InsertPos; }
void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+ /// Add metadata (e.g. alias info) from the instructions in this group to \p
+ /// NewInst.
+ ///
+ /// FIXME: this function currently does not add noalias metadata a la
+ /// addNewMetadata. To do that we need to compute the intersection of the
+ /// noalias info from all members.
+ void addMetadata(Instruction *NewInst) const {
+ SmallVector<Value *, 4> VL;
+ std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
+ [](std::pair<int, Instruction *> p) { return p.second; });
+ propagateMetadata(NewInst, VL);
+ }
+
private:
unsigned Factor; // Interleave Factor.
bool Reverse;
unsigned Align;
DenseMap<int, Instruction *> Members;
- int SmallestKey;
- int LargestKey;
+ int SmallestKey = 0;
+ int LargestKey = 0;
// To avoid breaking dependences, vectorized instructions of an interleave
// group should be inserted at either the first load or the last store in
@@ -1031,6 +985,9 @@ private:
// store i32 %odd // Insert Position
Instruction *InsertPos;
};
+} // end namespace llvm
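
For reference, a C-level loop (illustrative only) whose two loads form an interleave group with factor 2; after vectorization they become a single wide load feeding even/odd shuffles, placed at the position tracked by InsertPos:

void sumEvenOdd(const int *A, int *Out, int N) {
  for (int i = 0; i + 1 < N; i += 2) {
    int Even = A[i];    // group member at index 0
    int Odd = A[i + 1]; // group member at index 1
    Out[i / 2] = Even + Odd;
  }
}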
+
+namespace {
/// \brief Drive the analysis of interleaved memory accesses in the loop.
///
@@ -1044,8 +1001,7 @@ class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
DominatorTree *DT, LoopInfo *LI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
- RequiresScalarEpilogue(false) {}
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
@@ -1065,14 +1021,6 @@ public:
return InterleaveGroupMap.count(Instr);
}
- /// \brief Return the maximum interleave factor of all interleaved groups.
- unsigned getMaxInterleaveFactor() const {
- unsigned MaxFactor = 1;
- for (auto &Entry : InterleaveGroupMap)
- MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
- return MaxFactor;
- }
-
/// \brief Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
@@ -1095,15 +1043,16 @@ private:
/// The interleaved access analysis can also add new predicates (for example
/// by versioning strides of pointers).
PredicatedScalarEvolution &PSE;
+
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
- const LoopAccessInfo *LAI;
+ const LoopAccessInfo *LAI = nullptr;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// out-of-bounds by executing at least one scalar epilogue iteration.
- bool RequiresScalarEpilogue;
+ bool RequiresScalarEpilogue = false;
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
@@ -1114,21 +1063,26 @@ private:
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
+ StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
unsigned Align)
: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
- StrideDescriptor() = default;
-
// The access's stride. It is negative for a reverse access.
int64_t Stride = 0;
- const SCEV *Scev = nullptr; // The scalar expression of this access
- uint64_t Size = 0; // The size of the memory object.
- unsigned Align = 0; // The alignment of this access.
+
+ // The scalar expression of this access.
+ const SCEV *Scev = nullptr;
+
+ // The size of the memory object.
+ uint64_t Size = 0;
+
+ // The alignment of this access.
+ unsigned Align = 0;
};
/// \brief A type for holding instructions and their stride descriptors.
- typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+ using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
@@ -1179,7 +1133,6 @@ private:
/// not necessary or is prevented because \p A and \p B may be dependent.
bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
StrideEntry *B) const {
-
// Code motion for interleaved accesses can potentially hoist strided loads
// and sink strided stores. The code below checks the legality of the
// following two conditions:
@@ -1244,7 +1197,7 @@ private:
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
- enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
+ enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
/// Hint - associates name and validation with the hint value.
struct Hint {
@@ -1263,6 +1216,8 @@ class LoopVectorizeHints {
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
case HK_FORCE:
return (Val <= 1);
+ case HK_ISVECTORIZED:
+ return (Val == 0 || Val == 1);
}
return false;
}
@@ -1270,16 +1225,21 @@ class LoopVectorizeHints {
/// Vectorization width.
Hint Width;
+
/// Vectorization interleave factor.
Hint Interleave;
+
/// Vectorization forced
Hint Force;
+ /// Already Vectorized
+ Hint IsVectorized;
+
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
/// True if there is any unsafe math in the loop.
- bool PotentiallyUnsafe;
+ bool PotentiallyUnsafe = false;
public:
enum ForceKind {
@@ -1294,7 +1254,7 @@ public:
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -1302,14 +1262,19 @@ public:
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
/// Mark the loop L as already vectorized by recording the "isvectorized" hint.
void setAlreadyVectorized() {
- Width.Value = Interleave.Value = 1;
- Hint Hints[] = {Width, Interleave};
+ IsVectorized.Value = 1;
+ Hint Hints[] = {IsVectorized};
writeHintsToMetadata(Hints);
}
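
A hedged sketch of the metadata node this now writes, with the shape assumed from the "llvm.loop." hint convention rather than copied from this patch:

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

static llvm::MDNode *makeIsVectorizedHintSketch(llvm::LLVMContext &Ctx) {
  // Encodes {!"llvm.loop.isvectorized", i32 1}.
  llvm::Metadata *Ops[] = {
      llvm::MDString::get(Ctx, "llvm.loop.isvectorized"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  return llvm::MDNode::get(Ctx, Ops);
}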
@@ -1326,19 +1291,19 @@ public:
return false;
}
- if (getWidth() == 1 && getInterleave() == 1) {
- // FIXME: Add a separate metadata to indicate when the loop has already
- // been vectorized instead of setting width and count to 1.
+ if (getIsVectorized() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
// FIXME: Add interleave.disable metadata. This will allow
// vectorize.disable to be used without disabling the pass and errors
// to differentiate between disabled vectorization and a width of 1.
- ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
"AllDisabled", L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or vectorize width and interleave "
- "count are both set to 1");
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
return false;
}
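
The remark emissions throughout this patch move to the callback overload of OptimizationRemarkEmitter::emit, so the remark object is only constructed when emission is actually enabled. A minimal sketch with assumed arguments:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"

static void emitRemarkSketch(llvm::OptimizationRemarkEmitter &ORE,
                             const llvm::Loop *L, const char *PassName) {
  ORE.emit([&]() {
    // Built lazily: this lambda only runs if remarks are requested.
    return llvm::OptimizationRemarkAnalysis(PassName, "Example",
                                            L->getStartLoc(), L->getHeader())
           << "illustrative message";
  });
}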
@@ -1348,29 +1313,35 @@ public:
/// Dumps all the hint information.
void emitRemarkWithHints() const {
using namespace ore;
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
TheLoop->getStartLoc(),
TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled");
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(), TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
- if (Interleave.Value != 0)
- R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
- R << ")";
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count="
+ << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
}
- ORE.emit(R);
- }
+ });
}
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
+ unsigned getIsVectorized() const { return IsVectorized.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
/// \brief If hints are provided that force vectorization, use the AlwaysPrint
@@ -1454,7 +1425,7 @@ private:
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -1489,7 +1460,7 @@ private:
/// Sets current hints into loop metadata, keeping other values intact.
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.size() == 0)
+ if (HintTypes.empty())
return;
// Reserve the first element to LoopID (see below).
@@ -1525,6 +1496,8 @@ private:
OptimizationRemarkEmitter &ORE;
};
+} // end anonymous namespace
+
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
@@ -1546,6 +1519,8 @@ static void emitMissedWarning(Function *F, Loop *L,
}
}
+namespace {
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
@@ -1568,22 +1543,20 @@ public:
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
LoopVectorizeHints *H)
- : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
- GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
- PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
- Requirements(R), Hints(H) {}
+ : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
+ ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
- typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
+ using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
/// InductionList saves induction variables and maps them to the
/// induction descriptor.
- typedef MapVector<PHINode *, InductionDescriptor> InductionList;
+ using InductionList = MapVector<PHINode *, InductionDescriptor>;
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
- typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
+ using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
@@ -1608,7 +1581,17 @@ public:
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
- /// Returns True if V is an induction variable in this loop.
+ /// Returns True if V is a Phi node of an induction variable in this loop.
+ bool isInductionPhi(const Value *V);
+
+ /// Returns True if V is a cast that is part of an induction def-use chain,
+ /// and had been proven to be redundant under a runtime guard (in other
+ /// words, the cast has the same SCEV expression as the induction phi).
+ bool isCastedInductionVariable(const Value *V);
+
+ /// Returns True if V can be considered as an induction variable in this
+ /// loop. V can be the induction phi, or some redundant cast in the def-use
+ /// chain of the induction phi.
bool isInductionVariable(const Value *V);
/// Returns True if PN is a reduction variable in this loop.
@@ -1629,6 +1612,8 @@ public:
/// 0 - Stride is unknown or non-consecutive.
/// 1 - Address is consecutive.
/// -1 - Address is consecutive, and decreasing.
+ /// NOTE: This method must only be used before modifying the original scalar
+ /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
int isConsecutivePtr(Value *Ptr);
/// Returns true if the value V is uniform within the loop.
@@ -1646,11 +1631,6 @@ public:
return InterleaveInfo.isInterleaved(Instr);
}
- /// \brief Return the maximum interleave factor of all interleaved groups.
- unsigned getMaxInterleaveFactor() const {
- return InterleaveInfo.getMaxInterleaveFactor();
- }
-
/// \brief Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
@@ -1664,6 +1644,10 @@ public:
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
+ uint64_t getMaxSafeRegisterWidth() const {
+ return LAI->getDepChecker().getMaxSafeRegisterWidth();
+ }
+
bool hasStride(Value *V) { return LAI->hasStride(V); }
/// Returns true if the target machine supports masked store operation
@@ -1671,21 +1655,25 @@ public:
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
}
+
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
+
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
bool isLegalMaskedScatter(Type *DataType) {
return TTI->isLegalMaskedScatter(DataType);
}
+
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
+
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
bool isLegalGatherOrScatter(Value *V) {
@@ -1701,6 +1689,7 @@ public:
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
+
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
unsigned getNumPredStores() const { return NumPredStores; }
@@ -1766,27 +1755,34 @@ private:
return LAI ? &LAI->getSymbolicStrides() : nullptr;
}
- unsigned NumPredStores;
+ unsigned NumPredStores = 0;
/// The loop that we evaluate.
Loop *TheLoop;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
/// Applies dynamic knowledge to simplify SCEV expressions in the context
/// of existing SCEV assumptions. The analysis will also add a minimal set
/// of new predicates if this is required to enable vectorization and
/// unrolling.
PredicatedScalarEvolution &PSE;
+
/// Target Library Info.
TargetLibraryInfo *TLI;
+
/// Target Transform Info
const TargetTransformInfo *TTI;
+
/// Dominator Tree.
DominatorTree *DT;
+
// LoopAccess analysis.
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+
// And the loop-accesses info corresponding to this loop. This pointer is
// null until canVectorizeMemory sets it up.
- const LoopAccessInfo *LAI;
+ const LoopAccessInfo *LAI = nullptr;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
@@ -1798,27 +1794,38 @@ private:
/// Holds the primary induction variable. This is the counter of the
/// loop.
- PHINode *PrimaryInduction;
+ PHINode *PrimaryInduction = nullptr;
+
/// Holds the reduction variables.
ReductionList Reductions;
+
/// Holds all of the induction variables that we found in the loop.
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
+
+ /// Holds all the casts that participate in the update chain of the induction
+ /// variables, and that have been proven to be redundant (possibly under a
+ /// runtime guard). These casts can be ignored when creating the vectorized
+ /// loop body.
+ SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
+
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
+
/// Holds instructions that need to sink past other instructions to handle
/// first-order recurrences.
DenseMap<Instruction *, Instruction *> SinkAfter;
+
/// Holds the widest induction type encountered.
- Type *WidestIndTy;
+ Type *WidestIndTy = nullptr;
/// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
/// Can we assume the absence of NaNs.
- bool HasFunNoNaNAttr;
+ bool HasFunNoNaNAttr = false;
/// Vectorization requirements that will go through late-evaluation.
LoopVectorizationRequirements *Requirements;
@@ -1856,9 +1863,13 @@ public:
/// Information about vectorization costs
struct VectorizationFactor {
- unsigned Width; // Vector width with best cost
- unsigned Cost; // Cost of the loop with that width
+ // Vector width with best cost
+ unsigned Width;
+
+ // Cost of the loop with that width
+ unsigned Cost;
};
+
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -1897,8 +1908,10 @@ public:
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
unsigned LoopInvariantRegs;
+
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
+
/// Holds the number of instructions in the loop.
unsigned NumInstructions;
};
@@ -1920,6 +1933,7 @@ public:
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
@@ -1954,7 +1968,8 @@ public:
/// Decision that was taken during cost calculation for memory instruction.
enum InstWidening {
CM_Unknown,
- CM_Widen,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
CM_Scalarize
@@ -2010,7 +2025,6 @@ public:
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
-
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
@@ -2030,13 +2044,29 @@ public:
return false;
// If the truncated value is not an induction variable, return false.
- return Legal->isInductionVariable(Op);
+ return Legal->isInductionPhi(Op);
+ }
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
}
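
The cost-based widening decision taken above distinguishes, among other cases, consecutive accesses of stride +1 (CM_Widen) from stride -1 (CM_Widen_Reverse, a wide load or store plus a reverse shuffle). Illustrative C++ loops for the two strides:

void strideExamples(const int *A, int *B, int N) {
  for (int i = 0; i < N; ++i)
    B[i] = A[i];         // stride +1: widened as-is (CM_Widen)
  for (int i = 0; i < N; ++i)
    B[i] = A[N - 1 - i]; // stride -1: widened and reversed (CM_Widen_Reverse)
}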
private:
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
- unsigned computeFeasibleMaxVF(bool OptForSize);
+ unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -2045,7 +2075,7 @@ private:
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
- typedef std::pair<unsigned, bool> VectorizationCostTy;
+ using VectorizationCostTy = std::pair<unsigned, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
@@ -2102,7 +2132,7 @@ private:
/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
- typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+ using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
/// A set containing all BasicBlocks that are known to be present after
/// vectorization as predicated blocks.
@@ -2134,10 +2164,6 @@ private:
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
unsigned VF);
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
- void collectInstsToScalarize(unsigned VF);
-
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
@@ -2156,72 +2182,137 @@ private:
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);
- /// Collect Uniform and Scalar values for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
- void collectUniformsAndScalars(unsigned VF) {
- // Do the analysis once.
- if (VF == 1 || Uniforms.count(VF))
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- collectLoopScalars(VF);
- }
-
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
- typedef DenseMap<std::pair<Instruction *, unsigned>,
- std::pair<InstWidening, unsigned>>
- DecisionList;
+ using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>;
DecisionList WideningDecisions;
public:
/// The loop that we evaluate.
Loop *TheLoop;
+
/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;
+
/// Loop Info analysis.
LoopInfo *LI;
+
/// Vectorization legality.
LoopVectorizationLegality *Legal;
+
/// Vector target information.
const TargetTransformInfo &TTI;
+
/// Target Library Info.
const TargetLibraryInfo *TLI;
+
/// Demanded bits analysis.
DemandedBits *DB;
+
/// Assumption cache.
AssumptionCache *AC;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
const Function *TheFunction;
+
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+} // end anonymous namespace
+
+namespace llvm {
+
/// LoopVectorizationPlanner - drives the vectorization process after having
/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ using VPlanPtr = std::unique_ptr<VPlan>;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
+ return ILV.getOrCreateVectorValue(V, Part);
+ }
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
public:
- LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM)
- : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
-
- ~LoopVectorizationPlanner() {}
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
/// Plan how to best vectorize, return the best VF and its cost.
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
- /// Generate the IR code for the vectorized loop.
- void executePlan(InnerLoopVectorizer &ILV);
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
protected:
/// Collect the instructions from the original loop that would be trivially
@@ -2229,20 +2320,102 @@ protected:
void collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions);
-private:
- /// The loop that we evaluate.
- Loop *OrigLoop;
+ /// A range of powers-of-2 vectorization factors with fixed start and
+ /// adjustable end. The range includes start and excludes end, e.g.:
+ /// [1, 9) = {1, 2, 4, 8}
+ struct VFRange {
+ // A power of 2.
+ const unsigned Start;
- /// Loop Info analysis.
- LoopInfo *LI;
+ // Need not be a power of 2. If End <= Start, the range is empty.
+ unsigned End;
+ };
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
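
A hypothetical use of the helper declared above: the predicate is evaluated at Range.Start, and Range.End is clamped so the same answer holds for every VF left in the range (SmallestTypeBits and MaxSafeBits are assumed inputs):

VFRange Range = {2, 16}; // candidate VFs {2, 4, 8}
bool Widens = getDecisionAndClampRange(
    [&](unsigned VF) { return VF * SmallestTypeBits <= MaxSafeBits; }, Range);
// Afterwards [Range.Start, Range.End) holds only VFs agreeing with the
// decision made at Range.Start.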
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
+ /// \return true in the first returned value if so and false otherwise.
+ /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an IG
+ /// for \p Range.Start, and provide it as the second returned value.
+ /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+ /// \return value is <true, nullptr>, as it is handled by another recipe.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start
+ /// and potentially masked. Such instructions are handled by a recipe that
+ /// takes an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
+ VFRange &Range,
+ VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for
+ /// \p Range.Start, false otherwise. \p Range.End may be decreased to ensure
+ /// same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. \p Range.End may be decreased to ensure same recipe behavior
+ /// from \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range,
+ const SmallPtrSetImpl<Value *> &NeedDef);
};
+} // end namespace llvm
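For orientation, a minimal sketch of how buildVPlans drives the VFRange
clamping declared above (simplified; the NeedDef collection that feeds
buildVPlan is elided here):

    void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
      for (unsigned VF = MinVF; VF < MaxVF + 1;) {
        VFRange SubRange = {VF, MaxVF + 1};
        // buildVPlan may shrink SubRange.End so that a single set of decisions
        // covers the whole sub-range; the next plan starts where this stopped.
        VPlans.push_back(buildVPlan(SubRange, NeedDef));
        VF = SubRange.End;
      }
    }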
+
+namespace {
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
@@ -2257,8 +2430,7 @@ private:
/// followed by a non-expert user.
class LoopVectorizationRequirements {
public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
- : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
+ LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
void addUnsafeAlgebraInst(Instruction *I) {
// First unsafe algebra instruction.
@@ -2272,12 +2444,14 @@ public:
const char *PassName = Hints.vectorizeAnalysisPassName();
bool Failed = false;
if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit(
- OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
- UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations");
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps",
+ UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
Failed = true;
}
@@ -2288,11 +2462,13 @@ public:
NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
if ((ThresholdReached && !Hints.allowReordering()) ||
PragmaThresholdReached) {
- ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations");
+ "memory operations";
+ });
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
@@ -2301,13 +2477,15 @@ public:
}
private:
- unsigned NumRuntimePointerChecks;
- Instruction *UnsafeAlgebraInst;
+ unsigned NumRuntimePointerChecks = 0;
+ Instruction *UnsafeAlgebraInst = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
};
+} // end anonymous namespace
+
static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty()) {
if (!hasCyclesInLoopBody(L))
@@ -2318,11 +2496,15 @@ static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
addAcyclicInnerLoop(*InnerL, V);
}
+namespace {
+
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;
+ LoopVectorizePass Impl;
+
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
: FunctionPass(ID) {
Impl.DisableUnrolling = NoUnrolling;
@@ -2330,8 +2512,6 @@ struct LoopVectorize : public FunctionPass {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
- LoopVectorizePass Impl;
-
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
@@ -2450,6 +2630,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ recordVectorLoopValueForInductionCast(II, LastInduction, Part);
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
LastInduction = cast<Instruction>(addFastMathFlag(
@@ -2480,11 +2661,26 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
auto *I = cast<Instruction>(U);
return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
- return any_of(IV->users(), isScalarInst);
+ return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
+ unsigned Lane) {
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (Casts.empty())
+ return;
+ // Only the first Cast instruction in the Casts vector is of interest.
+ // The rest of the Casts (if they exist) have no uses outside the
+ // induction update chain itself.
+ Instruction *CastInst = *Casts.begin();
+ if (Lane < UINT_MAX)
+ VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ else
+ VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+}
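The Lane parameter acts as a sentinel here: its declaration (not shown in
this hunk) presumably defaults to UINT_MAX, matching the Lane < UINT_MAX
test above. Illustrative call forms, as used by the surrounding functions:

    // Record the whole vector value of this unroll part (Lane defaulted).
    recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
    // Record a single scalar lane of this unroll part.
    recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);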
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -2564,6 +2760,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
}
@@ -2622,7 +2819,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
@@ -2638,7 +2835,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Value *EntryVal,
const InductionDescriptor &ID) {
-
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
@@ -2663,21 +2859,21 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
-
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
+ : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
+ recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
}
}
}
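A small worked example of what this loop materializes: for a scalar IV i
with step s, VF = 4 and UF = 2, the scalar map receives

    // Part 0: i + 0*s, i + 1*s, i + 2*s, i + 3*s   (Lanes 0..3)
    // Part 1: i + 4*s, i + 5*s, i + 6*s, i + 7*s   (StartIdx = VF*Part + Lane)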
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
-
const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
ValueToValueMap();
@@ -2708,8 +2904,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
if (VectorLoopValueMap.hasAnyScalarValue(V)) {
-
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
@@ -2726,8 +2921,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
- auto *LastInst =
- cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@@ -2744,14 +2939,15 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
Value *VectorValue = nullptr;
if (Cost->isUniformAfterVectorization(I, VF)) {
VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
- VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ // Initialize packing with insertelements to start from undef.
+ Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Undef);
for (unsigned Lane = 0; Lane < VF; ++Lane)
- VectorValue = Builder.CreateInsertElement(
- VectorValue, getOrCreateScalarValue(V, Part, Lane),
- Builder.getInt32(Lane));
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
}
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
Builder.restoreIP(OldIP);
return VectorValue;
}
@@ -2763,28 +2959,29 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
return B;
}
-Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
- unsigned Lane) {
-
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
+ assert(Instance.Lane > 0
+ ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
- return VectorLoopValueMap.getScalarValue(V, Part, Lane);
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Part);
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
assert(VF == 1 && "Value not scalarized has non-vector type");
return U;
@@ -2793,7 +2990,20 @@ Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
// Otherwise, the value from the original loop has been vectorized and is
// represented by UF vector values. Extract and return the requested scalar
// value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
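For example (sketch), packing lane 2 of part 0 for a 4-wide i32 value
emits a single insertelement and refreshes the cached vector entry:

    //   %vec.new = insertelement <4 x i32> %vec.old, i32 %scalar, i32 2
    packScalarIntoVectorValue(V, {0, 2});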
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
@@ -2843,6 +3053,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Instr != Group->getInsertPos())
return;
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
@@ -2866,7 +3077,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Index += (VF - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
+ Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2890,13 +3101,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
-
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
auto *NewLoad = Builder.CreateAlignedLoad(
NewPtrs[Part], Group->getAlignment(), "wide.vec");
- addMetadata(NewLoad, Instr);
+ Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
@@ -2917,7 +3127,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
if (Group->isReverse())
@@ -2946,9 +3156,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
- // If this member has different type, cast it to an unified type.
+ // If this member has a different type, cast it to a unified type.
if (StoredVec->getType() != SubVT)
- StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
+ StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
StoredVecs.push_back(StoredVec);
}
@@ -2963,11 +3174,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Instruction *NewStoreInstr =
Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
- addMetadata(NewStoreInstr, Instr);
+ Group->addMetadata(NewStoreInstr);
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ VectorParts *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2992,14 +3205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = getMemInstAddressSpace(Instr);
- // Scalarize the memory instruction if necessary.
- if (Decision == LoopVectorizationCostModel::CM_Scalarize)
- return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
-
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = ConsecutiveStride < 0;
+ bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+ bool ConsecutiveStride =
+ Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
bool CreateGatherScatter =
(Decision == LoopVectorizationCostModel::CM_GatherScatter);
@@ -3010,9 +3220,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// Handle consecutive loads/stores.
if (ConsecutiveStride)
- Ptr = getOrCreateScalarValue(Ptr, 0, 0);
+ Ptr = getOrCreateScalarValue(Ptr, {0, 0});
+
+ VectorParts Mask;
+ bool isMaskRequired = BlockInMask;
+ if (isMaskRequired)
+ Mask = *BlockInMask;
- VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
@@ -3023,7 +3237,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Instruction *NewSI = nullptr;
Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
if (CreateGatherScatter) {
- Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
@@ -3045,13 +3259,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr =
Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- Mask[Part] = reverseVector(Mask[Part]);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr =
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- if (Legal->isMaskRequired(SI))
+ if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
else
@@ -3068,7 +3283,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
- Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
+ Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
@@ -3083,12 +3298,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// wide load needs to start at the last vector element.
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- Mask[Part] = reverseVector(Mask[Part]);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr =
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- if (Legal->isMaskRequired(LI))
+ if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
"wide.masked.load");
@@ -3105,71 +3321,41 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ const VPIteration &Instance,
bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- DEBUG(dbgs() << "LV: Scalarizing"
- << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
- << '\n');
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
-
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- // For each scalar that we create:
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
- // Start if-block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- Cmp = Cond[Part];
- if (Cmp->getType()->isVectorTy())
- Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
- ConstantInt::get(Cmp->getType(), 1));
- }
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
- Cloned->setOperand(op, NewOp);
- }
- addNewMetadata(Cloned, Instr);
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
- // Add the cloned scalar to the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
+ // Add the cloned scalar to the scalar map entry.
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
- }
- }
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
}
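scalarizeInstruction now clones exactly one instance; the per-part,
per-lane loop moves to its callers (the replicating VPlan recipes later in
this patch). Schematically, with illustrative names, under the same
uniformity rule the removed loop used:

    unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
    for (unsigned Part = 0; Part < UF; ++Part)
      for (unsigned Lane = 0; Lane < Lanes; ++Lane)
        scalarizeInstruction(Instr, {Part, Lane}, IsPredicated);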
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -3281,6 +3467,36 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
return VectorTripCount;
}
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL) {
+ // Verify that V is a vector type with same number of elements as DstVTy.
+ unsigned VF = DstVTy->getNumElements();
+ VectorType *SrcVecTy = cast<VectorType>(V->getType());
+ assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ Type *SrcElemTy = SrcVecTy->getElementType();
+ Type *DstElemTy = DstVTy->getElementType();
+ assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+ "Vector elements must have same size");
+
+ // Do a direct cast if element types are castable.
+ if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+ return Builder.CreateBitOrPointerCast(V, DstVTy);
+ }
+ // V cannot be directly cast to the desired vector type.
+ // This may happen when V is a floating point vector but DstVTy is a vector
+ // of pointers, or vice-versa. Handle it with a two-step cast through an
+ // intermediate integer type, i.e. Ptr <-> Int <-> Float.
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+ "Only one type should be a pointer type");
+ assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+ "Only one type should be a floating point type");
+ Type *IntTy =
+ IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+ VectorType *VecIntTy = VectorType::get(IntTy, VF);
+ Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+ return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
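As an illustration (assuming a target with 64-bit pointers), a
<2 x double> to <2 x i64*> conversion is not a single legal bitcast, so
the helper takes the Float <-> Int <-> Ptr route:

    //   %asint = bitcast <2 x double> %v to <2 x i64>
    //   %asptr = inttoptr <2 x i64> %asint to <2 x i64*>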
+
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
@@ -3373,7 +3589,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3435,7 +3651,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
// Create and register the new vector loop.
- Loop *Lp = new Loop();
+ Loop *Lp = LI->AllocateLoop();
Loop *ParentLoop = OrigLoop->getParentLoop();
// Insert the new loop into the loop nest and register the new basic blocks
@@ -3554,6 +3770,8 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();
+
+ return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
@@ -3622,22 +3840,27 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
namespace {
+
struct CSEDenseMapInfo {
static bool canHandle(const Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
+
static inline Instruction *getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
+
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
+
static unsigned getHashValue(const Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
+
static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
@@ -3645,7 +3868,8 @@ struct CSEDenseMapInfo {
return LHS->isIdenticalTo(RHS);
}
};
-}
+
+} // end anonymous namespace
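As used by cse() below, plugging this info into a DenseMap keys lookups on
instruction structure (opcode plus operands) rather than pointer identity:

    SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;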
///\brief Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
@@ -3777,7 +4001,6 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
- //
SmallPtrSet<Value *, 4> Erased;
for (const auto &KV : Cost->getMinimalBitwidths()) {
// If the value wasn't vectorized, we must maintain the original scalar
@@ -3927,7 +4150,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
IVEndValues[Entry.first], LoopMiddleBlock);
fixLCSSAPHIs();
- predicateInstructions();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
// Remove redundant induction instructions.
cse(LoopVectorBody);
@@ -3953,7 +4177,6 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() {
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
-
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4211,7 +4434,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ Builder.SetInsertPoint(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
@@ -4317,7 +4541,6 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
-
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
auto *VectorLoop = LI->getLoopFor(PredBB);
@@ -4346,7 +4569,6 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// through the worklist doesn't sink a single instruction.
bool Changed;
do {
-
// Add the instructions that need to be reanalyzed to the worklist, and
// reset the changed indicator.
Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
@@ -4365,7 +4587,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// It's legal to sink the instruction if all its uses occur in the
// predicated block. Otherwise, there's nothing to do yet, and we may
// need to reanalyze the instruction.
- if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+ if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
InstsToReanalyze.push_back(I);
continue;
}
@@ -4382,200 +4604,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
-void InnerLoopVectorizer::predicateInstructions() {
-
- // For each instruction I marked for predication on value C, split I into its
- // own basic block to form an if-then construct over C. Since I may be fed by
- // an extractelement instruction or other scalar operand, we try to
- // iteratively sink its scalar operands into the predicated block. If I feeds
- // an insertelement instruction, we try to move this instruction into the
- // predicated block as well. For non-void types, a phi node will be created
- // for the resulting value (either vector or scalar).
- //
- // So for some predicated instruction, e.g. the conditional sdiv in:
- //
- // for.body:
- // ...
- // %add = add nsw i32 %mul, %0
- // %cmp5 = icmp sgt i32 %2, 7
- // br i1 %cmp5, label %if.then, label %if.end
- //
- // if.then:
- // %div = sdiv i32 %0, %1
- // br label %if.end
- //
- // if.end:
- // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
- //
- // the sdiv at this point is scalarized and if-converted using a select.
- // The inactive elements in the vector are not used, but the predicated
- // instruction is still executed for all vector elements, essentially:
- //
- // vector.body:
- // ...
- // %17 = add nsw <2 x i32> %16, %wide.load
- // %29 = extractelement <2 x i32> %wide.load, i32 0
- // %30 = extractelement <2 x i32> %wide.load51, i32 0
- // %31 = sdiv i32 %29, %30
- // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
- // %35 = extractelement <2 x i32> %wide.load, i32 1
- // %36 = extractelement <2 x i32> %wide.load51, i32 1
- // %37 = sdiv i32 %35, %36
- // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
- // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
- //
- // Predication will now re-introduce the original control flow to avoid false
- // side-effects by the sdiv instructions on the inactive elements, yielding
- // (after cleanup):
- //
- // vector.body:
- // ...
- // %5 = add nsw <2 x i32> %4, %wide.load
- // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
- // %9 = extractelement <2 x i1> %8, i32 0
- // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
- //
- // pred.sdiv.if:
- // %10 = extractelement <2 x i32> %wide.load, i32 0
- // %11 = extractelement <2 x i32> %wide.load51, i32 0
- // %12 = sdiv i32 %10, %11
- // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
- // br label %pred.sdiv.continue
- //
- // pred.sdiv.continue:
- // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
- // %15 = extractelement <2 x i1> %8, i32 1
- // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
- //
- // pred.sdiv.if54:
- // %16 = extractelement <2 x i32> %wide.load, i32 1
- // %17 = extractelement <2 x i32> %wide.load51, i32 1
- // %18 = sdiv i32 %16, %17
- // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
- // br label %pred.sdiv.continue55
- //
- // pred.sdiv.continue55:
- // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
- // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
-
- for (auto KV : PredicatedInstructions) {
- BasicBlock::iterator I(KV.first);
- BasicBlock *Head = I->getParent();
- auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
- /*BranchWeights=*/nullptr, DT, LI);
- I->moveBefore(T);
- sinkScalarOperands(&*I);
-
- BasicBlock *PredicatedBlock = I->getParent();
- Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
- PredicatedBlock->setName(BBNamePrefix + ".if");
- PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
-
- // If the instruction is non-void create a Phi node at reconvergence point.
- if (!I->getType()->isVoidTy()) {
- Value *IncomingTrue = nullptr;
- Value *IncomingFalse = nullptr;
-
- if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
- // If the predicated instruction is feeding an insert-element, move it
- // into the Then block; Phi node will be created for the vector.
- InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
- IEI->moveBefore(T);
- IncomingTrue = IEI; // the new vector with the inserted element.
- IncomingFalse = IEI->getOperand(0); // the unmodified vector
- } else {
- // Phi node will be created for the scalar predicated instruction.
- IncomingTrue = &*I;
- IncomingFalse = UndefValue::get(I->getType());
- }
-
- BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
- assert(PostDom && "Then block has multiple successors");
- PHINode *Phi =
- PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
- IncomingTrue->replaceAllUsesWith(Phi);
- Phi->addIncoming(IncomingFalse, Head);
- Phi->addIncoming(IncomingTrue, I->getParent());
- }
- }
-
- DEBUG(DT->verifyDomTree());
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- VectorParts SrcMask = createBlockInMask(Src);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
-
- if (BI->isConditional()) {
-
- VectorParts EdgeMask(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
- if (BI->getSuccessor(0) != Dst)
- EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
-
- EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
- EdgeMask[Part] = EdgeMaskPart;
- }
-
- EdgeMaskCache[Edge] = EdgeMask;
- return EdgeMask;
- }
-
- EdgeMaskCache[Edge] = SrcMask;
- return SrcMask;
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
-
- VectorParts BlockMask(UF);
-
- // Loop incoming mask is all-one.
- if (OrigLoop->getHeader() == BB) {
- Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = getOrCreateVectorValue(C, Part);
- BlockMaskCache[BB] = BlockMask;
- return BlockMask;
- }
-
- // This is the block mask. We OR all incoming edges, and with zero.
- Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
-
- // For each pred:
- for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
- VectorParts EM = createEdgeMask(*It, BB);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
- }
-
- BlockMaskCache[BB] = BlockMask;
- return BlockMask;
-}
-
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {
+ assert(PN->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled elsewhere");
+
PHINode *P = cast<PHINode>(PN);
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
@@ -4594,43 +4627,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
}
setDebugLocFromInst(Builder, P);
- // Check for PHI nodes that are lowered to vector selects.
- if (P->getParent() != OrigLoop->getHeader()) {
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = P->getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
- VectorParts Entry(UF);
- for (unsigned In = 0; In < NumIncoming; In++) {
- VectorParts Cond =
- createEdgeMask(P->getIncomingBlock(In), P->getParent());
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- if (In == 0)
- Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
- else
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
- "predphi");
- }
- }
- for (unsigned Part = 0; Part < UF; ++Part)
- VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
- return;
- }
// This PHINode must be an induction variable.
// Make sure that we know about it.
@@ -4646,7 +4642,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:
- return widenIntOrFpInduction(P);
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4665,7 +4661,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
}
return;
@@ -4691,25 +4687,11 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- return;
- }
-
+void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- break;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF);
- break;
- } // End of PHI.
+ case Instruction::PHI:
+ llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::GetElementPtr: {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
@@ -4746,7 +4728,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// values in the vector mapping with initVector, as we do for other
// instructions.
for (unsigned Part = 0; Part < UF; ++Part) {
-
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
auto *Ptr =
@@ -4782,13 +4763,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- break;
- }
- LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -4836,7 +4810,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
@@ -4862,8 +4836,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
Value *C = nullptr;
if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
@@ -4874,10 +4850,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
break;
}
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
- break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -4893,16 +4865,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
auto *CI = dyn_cast<CastInst>(&I);
setDebugLocFromInst(Builder, CI);
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- if (Cost->isOptimizableIVTruncate(CI, VF)) {
- widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
- cast<TruncInst>(CI));
- break;
- }
-
/// Vectorize casts.
Type *DestTy =
(VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
@@ -4933,11 +4895,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
+
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
@@ -4945,10 +4903,8 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
- }
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
@@ -4998,9 +4954,9 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
}
default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
+ // This instruction is not vectorized by simple widening.
+ DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
@@ -5012,14 +4968,11 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
- LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock,
LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-
DEBUG(DT->verifyDomTree());
}
@@ -5094,12 +5047,15 @@ bool LoopVectorizationLegality::canVectorize() {
// Store the result and return it at the end instead of exiting early, in case
// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5112,7 +5068,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->empty()) {
ORE->emit(createMissedAnalysis("NotInnermostLoop")
<< "loop is not the innermost loop");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5122,7 +5078,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (TheLoop->getNumBackEdges() != 1) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5132,7 +5088,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getExitingBlock()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5144,7 +5100,7 @@ bool LoopVectorizationLegality::canVectorize() {
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5158,7 +5114,7 @@ bool LoopVectorizationLegality::canVectorize() {
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5167,7 +5123,7 @@ bool LoopVectorizationLegality::canVectorize() {
// Check if we can vectorize the instructions and CFG in this loop.
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5176,7 +5132,7 @@ bool LoopVectorizationLegality::canVectorize() {
// Go over each instruction and look at memory deps.
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5207,7 +5163,7 @@ bool LoopVectorizationLegality::canVectorize() {
<< "Too many SCEV assumptions need to be made and checked "
<< "at runtime");
DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- if (ORE->allowExtraAnalysis())
+ if (DoExtraAnalysis)
Result = false;
else
return false;
@@ -5263,6 +5219,15 @@ void LoopVectorizationLegality::addInductionPhi(
PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit) {
Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+ // here for ignoring, but it suffices to record only the first (as it is the
+ // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
Type *PhiTy = Phi->getType();
const DataLayout &DL = Phi->getModule()->getDataLayout();
@@ -5300,7 +5265,6 @@ void LoopVectorizationLegality::addInductionPhi(
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
- return;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
@@ -5439,7 +5403,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// operations, shuffles, or casts, as they don't change precision or
// semantics.
} else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.hasUnsafeAlgebra()) {
+ !I.isFast()) {
DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
Hints->setPotentiallyUnsafe();
}
@@ -5451,7 +5415,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
<< "value cannot be used outside the loop");
return false;
}
-
} // next instr.
}
@@ -5474,7 +5437,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
-
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
@@ -5517,7 +5479,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
-
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5532,7 +5493,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// If the use of the pointer will be a scalar use, and all users of the
// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
// place the pointer in PossibleNonScalarPtrs.
- if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
return isa<LoadInst>(U) || isa<StoreInst>(U);
}))
ScalarPtrs.insert(I);
@@ -5604,7 +5565,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
continue;
auto *Src = cast<Instruction>(Dst->getOperand(0));
- if (all_of(Src->users(), [&](User *U) -> bool {
+ if (llvm::all_of(Src->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
@@ -5631,7 +5592,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// Determine if all users of the induction variable are scalar after
// vectorization.
- auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
});
@@ -5640,10 +5601,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// Determine if all users of the induction variable update instruction are
// scalar after vectorization.
- auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
- });
+ auto ScalarIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
if (!ScalarIndUpdate)
continue;
@@ -5703,7 +5665,6 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
}
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
-
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
@@ -5754,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
"Widening decision should be ready at this moment");
return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
WideningDecision == CM_Interleave);
};
// Iterate over the instructions in the loop, and collect all
@@ -5766,7 +5728,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// the getelementptr won't remain uniform.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
-
// If there's no pointer operand, there's nothing to do.
auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
if (!Ptr)
@@ -5774,9 +5735,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// True if all users of Ptr are memory accesses that have Ptr as their
// pointer operand.
- auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
- return getPointerOperand(U) == Ptr;
- });
+ auto UsersAreMemAccesses =
+ llvm::all_of(Ptr->users(), [&](User *U) -> bool {
+ return getPointerOperand(U) == Ptr;
+ });
// Ensure the memory instruction will not be scalarized or used by
// gather/scatter, making its pointer operand non-uniform. If the pointer
@@ -5812,7 +5774,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (isOutOfScope(OV))
continue;
auto *OI = cast<Instruction>(OV);
- if (all_of(OI->users(), [&](User *U) -> bool {
+ if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
(OI == getPointerOperand(J) && isUniformDecision(J, VF));
@@ -5841,7 +5803,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Determine if all users of the induction variable are uniform after
// vectorization.
- auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
isVectorizedMemAccessUse(I, Ind);
@@ -5851,11 +5813,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Determine if all users of the induction variable update instruction are
// uniform after vectorization.
- auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, IndUpdate);
- });
+ auto UniformIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
if (!UniformIndUpdate)
continue;
@@ -5874,9 +5837,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
InterleaveInfo.setLAI(LAI);
const OptimizationRemarkAnalysis *LAR = LAI->getReport();
if (LAR) {
- OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- ORE->emit(VR);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
}
if (!LAI->canVectorizeMemory())
return false;
@@ -5894,7 +5858,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
@@ -5903,6 +5867,15 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
return Inductions.count(PN);
}
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
return FirstOrderRecurrences.count(Phi);
}
@@ -5972,7 +5945,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
-
auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
// Since it's desired that the load/store instructions be maintained in
@@ -6126,7 +6098,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
// but not with (4). If we did, the dependent access (3) would be within
// the boundaries of the (2, 4) group.
if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
-
// If a dependence exists and A is already in a group, we know that A
// must be a store since A precedes B and WAR dependences are allowed.
// Thus, A would be sunk below B. We release A's group to prevent this
@@ -6205,9 +6176,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
- if (Group->getNumMembers() != Group->getFactor())
+ if (Group->getNumMembers() != Group->getFactor()) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
releaseGroup(Group);
-
+ }
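
A hypothetical source pattern (not from this patch) that forms such a gapped
store group: a factor-3 candidate where member 1 is missing, so the wide
store would clobber the untouched slot and the group must be released.

// Candidate factor-3 interleaved store group with a gap at member 1.
void storeWithGap(int *A, int N) {
  for (int i = 0; i < N; ++i) {
    A[3 * i] = i;     // member 0 of the candidate group
    A[3 * i + 2] = i; // member 2; the gap at A[3*i+1] invalidates the group
  }
}
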
// Remove interleaved groups with gaps (currently only loads) whose memory
// accesses may wrap around. We have to revisit the getPtrStride analysis,
// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
@@ -6222,9 +6195,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// This means that we can forcefully peel the loop in order to only have to
// check the first pointer for no-wrap. When we change to use Assume=true,
// we'll only need at most one runtime check per interleaved group.
- //
for (InterleaveGroup *Group : LoadGroups) {
-
// Case 1: A full group. We can skip the checks; for full groups, if the wide
// load would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
@@ -6260,6 +6231,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
@@ -6277,8 +6250,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to do this anyway, since the check is still
+    // likely to be dynamically uniform if the target can skip it.
+    DEBUG(dbgs()
+          << "LV: Not inserting runtime ptr check for divergent target.\n");
+
+ ORE->emit(
+ createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+ << "runtime pointer checks needed. Not enabled for divergent target");
+
+ return None;
+ }
+
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
- return computeFeasibleMaxVF(OptForSize);
+ return computeFeasibleMaxVF(OptForSize, TC);
if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
@@ -6291,7 +6277,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
}
// If we optimize the program for size, avoid creating the tail loop.
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
// If we don't know the precise trip count, don't try to vectorize.
@@ -6303,7 +6288,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
- unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
if (TC % MaxVF != 0) {
// If the trip count that we found modulo the vectorization factor is not
@@ -6324,46 +6309,52 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return MaxVF;
}
-unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
+unsigned
+LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
+ unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
- unsigned MaxSafeDepDist = -1U;
- // Get the maximum safe dependence distance in bits computed by LAA. If the
- // loop contains any interleaved accesses, we divide the dependence distance
- // by the maximum interleave factor of all interleaved groups. Note that
- // although the division ensures correctness, this is a fairly conservative
- // computation because the maximum distance computed by LAA may not involve
- // any of the interleaved accesses.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- MaxSafeDepDist =
- Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+  // the memory access that is most restrictive (i.e. the one involved in
+  // the smallest dependence distance).
+ unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+
+ WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
- WidestRegister =
- ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
<< WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
+ DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
<< " bits.\n");
+ assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
+ return MaxVectorSize;
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+ isPowerOf2_32(ConstTripCount)) {
+ // We need to clamp the VF to be the ConstTripCount. There is no point in
+ // choosing a higher viable VF as done in the loop below.
+ DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
+ MaxVectorSize = ConstTripCount;
+ return MaxVectorSize;
}
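
Worked numbers for the clamping above, as a standalone sketch with made-up
target parameters (512-bit registers, a 256-bit limit from LAA's maximum safe
dependence distance, widest element type i32, known trip count 4):

#include <algorithm>
#include <cstdio>

static bool isPow2(unsigned X) { return X && !(X & (X - 1)); }

int main() {
  unsigned WidestRegister = 512;       // target's widest register, in bits
  unsigned MaxSafeRegisterWidth = 256; // LAA dependence-distance limit
  unsigned WidestType = 32;            // widest loop element type, in bits
  unsigned ConstTripCount = 4;         // known power-of-two trip count

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); // 256
  unsigned MaxVectorSize = WidestRegister / WidestType;            // 8 lanes
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPow2(ConstTripCount))
    MaxVectorSize = ConstTripCount; // no point in a VF above the trip count
  printf("feasible MaxVF = %u\n", MaxVectorSize); // prints 4
  return 0;
}
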
- assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
-
unsigned MaxVF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
- // Collect all viable vectorization factors.
+ // Collect all viable vectorization factors larger than the default MaxVF
+ // (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
- for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
VFs.push_back(VS);
// For each VF calculate its register usage.
@@ -6485,7 +6476,6 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
unsigned VF,
unsigned LoopCost) {
-
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
@@ -6573,7 +6563,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && Legal->getReductionVars()->size()) {
+ if (VF > 1 && !Legal->getReductionVars()->empty()) {
DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -6604,7 +6594,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
+ if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -6623,7 +6613,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
@@ -6661,7 +6651,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
- typedef DenseMap<Instruction *, unsigned> IntervalMap;
+ using IntervalMap = DenseMap<Instruction *, unsigned>;
+
// Maps instruction to its index.
DenseMap<unsigned, Instruction *> IdxToInstr;
// Marks the end of each interval.
@@ -6700,7 +6691,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
// Saves the list of intervals that end with the index in 'key'.
- typedef SmallVector<Instruction *, 2> InstrList;
+ using InstrList = SmallVector<Instruction *, 2>;
DenseMap<unsigned, InstrList> TransposeEnds;
// Transpose the EndPoints to a list of values that end at each index.
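
The interval bookkeeping here can be sketched standalone: each value opens an
interval at its defining index, the transpose records where intervals close,
and the peak number of simultaneously open intervals approximates register
pressure. All data below is illustrative.

#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  // {def index, last-use index} for five hypothetical values.
  std::vector<std::pair<unsigned, unsigned>> Intervals = {
      {0, 4}, {1, 2}, {2, 5}, {3, 3}, {4, 6}};
  std::map<unsigned, unsigned> EndsAt; // transpose: index -> #closing here
  for (auto &I : Intervals)
    ++EndsAt[I.second];
  unsigned Open = 0, MaxOpen = 0;
  for (unsigned Idx = 0; Idx <= 6; ++Idx) {
    for (auto &I : Intervals)
      if (I.first == Idx)
        ++Open; // an interval opens at its def
    MaxOpen = std::max(MaxOpen, Open);
    auto It = EndsAt.find(Idx); // intervals close after their last use
    if (It != EndsAt.end())
      Open -= It->second;
  }
  printf("peak open intervals = %u\n", MaxOpen); // prints 3
  return 0;
}
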
@@ -6795,7 +6786,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
-
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
@@ -6829,7 +6819,6 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {
-
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
@@ -6844,7 +6833,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Returns true if the given instruction can be scalarized.
auto canBeScalarized = [&](Instruction *I) -> bool {
-
// We only attempt to scalarize instructions forming a single-use chain
// from the original predicated block that would otherwise be vectorized.
// Although not strictly necessary, we give up on instructions we know will
@@ -6947,13 +6935,6 @@ LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
- // Collect Uniform and Scalar instructions after vectorization with VF.
- collectUniformsAndScalars(VF);
-
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
- collectInstsToScalarize(VF);
-
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
@@ -6965,7 +6946,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
continue;
// Skip ignored values.
- if (ValuesToIgnore.count(&I))
+ if (ValuesToIgnore.count(&I) ||
+ (VF > 1 && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
@@ -7004,14 +6986,16 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
static const SCEV *getAddressAccessSCEV(
Value *Ptr,
LoopVectorizationLegality *Legal,
- ScalarEvolution *SE,
+ PredicatedScalarEvolution &PSE,
const Loop *TheLoop) {
auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return nullptr;
// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
+ auto SE = PSE.getSE();
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
Value *Opd = Gep->getOperand(i);
@@ -7021,7 +7005,7 @@ static const SCEV *getAddressAccessSCEV(
}
// Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
- return SE->getSCEV(Ptr);
+ return PSE.getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
@@ -7041,7 +7025,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Figure out whether the access is strided and get the stride value
// if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
@@ -7145,7 +7129,6 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
unsigned VF) {
-
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF == 1) {
@@ -7202,12 +7185,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// We assume that widening is the best solution when possible.
if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Widen, Cost);
+ int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Expected consecutive stride.");
+ InstWidening Decision =
+ ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ setWideningDecision(&I, VF, Decision, Cost);
continue;
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- unsigned InterleaveCost = UINT_MAX;
+ unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;
if (Legal->isAccessInterleaved(&I)) {
auto Group = Legal->getInterleavedAccessGroup(&I);
@@ -7224,7 +7212,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
unsigned GatherScatterCost =
Legal->isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : UINT_MAX;
+ : std::numeric_limits<unsigned>::max();
unsigned ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
@@ -7282,7 +7270,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
for (auto &Op : I->operands())
if (auto *InstOp = dyn_cast<Instruction>(Op))
if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
- AddrDefs.insert(InstOp).second == true)
+ AddrDefs.insert(InstOp).second)
Worklist.push_back(InstOp);
}
@@ -7292,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// by cost functions, but since this involves the task of finding out
// if the loaded register is involved in an address computation, it is
// instead changed here when we know this is the case.
- if (getWideningDecision(I, VF) == CM_Widen)
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
// Scalarize a widened load of address.
setWideningDecision(I, VF, CM_Scalarize,
(VF * getMemoryInstructionCost(I, 1)));
@@ -7551,7 +7540,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
char LoopVectorize::ID = 0;
+
static const char lv_name[] = "Loop Vectorization";
+
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
@@ -7568,13 +7559,14 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
+
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
-}
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+} // end namespace llvm
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getPointerOperand(Inst))
@@ -7593,11 +7585,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+ // Ignore type-casting instructions we identified during induction
+ // detection.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
}
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
-
// Width 1 means no vectorize, cost 0 means uncomputed cost.
const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
0U};
@@ -7611,11 +7609,26 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
+ buildVPlans(UserVF, UserVF);
+ DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
unsigned MaxVF = MaybeMaxVF.getValue();
assert(MaxVF != 0 && "MaxVF is zero.");
+
+ for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ if (VF > 1)
+ CM.collectInstsToScalarize(VF);
+ }
+
+ buildVPlans(1, MaxVF);
+ DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
@@ -7623,11 +7636,28 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+ DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ erase_if(VPlans, [VF](const VPlanPtr &Plan) {
+ return !Plan->hasVF(VF);
+ });
+  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- ILV.createVectorizedLoopSkeleton();
+ VPCallbackILV CallbackILV(ILV);
+
+ VPTransformState State{BestVF, BestUF, LI,
+ DT, ILV.Builder, ILV.VectorLoopValueMap,
+ &ILV, CallbackILV};
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
//===------------------------------------------------===//
//
@@ -7638,36 +7668,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
-
- // Move instructions to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
- for (auto &Entry : SinkAfter) {
- Entry.first->removeFromParent();
- Entry.first->insertAfter(Entry.second);
- DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
- << " to vectorize a 1st order recurrence.\n");
- }
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- ILV.vectorizeInstruction(I);
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlans.front()->execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7691,18 +7693,23 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
for (auto &Induction : *Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
- }
-}
-
-void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- auto *SI = dyn_cast<StoreInst>(Instr);
- bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
- return scalarizeInstruction(Instr, IfPredicateInstr);
+    // We also record as "Dead" the type-casting instructions we had
+    // identified during induction analysis. We don't need any handling for
+    // them in the
+ // vectorized loop because we have proven that, under a proper runtime
+ // test guarding the vectorized loop, the value of the phi, and the casted
+ // value of the phi, are the same. The last instruction in this casting chain
+ // will get its scalar/vector/widened def from the scalar/vector/widened def
+ // of the respective phi node. Any other casts in the induction def-use chain
+ // have no other uses outside the phi update chain, and will be ignored.
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ DeadInstructions.insert(Casts.begin(), Casts.end());
+ }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
@@ -7760,6 +7767,722 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
+ const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
+ assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
+ for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
+void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
+
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(buildVPlan(SubRange, NeedDef));
+ VF = SubRange.End;
+ }
+}
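
The two routines combine as follows; a minimal standalone sketch with a toy
predicate (hypothetical names, VF candidates {1,2,4,8}) of how one decision
partitions the range into plans:

#include <cstdio>
#include <functional>
#include <vector>

struct Range { unsigned Start, End; }; // candidate VFs in [Start, End)

// Shrink [Start, End) to the largest prefix over which Pred is invariant.
static bool decideAndClamp(const std::function<bool(unsigned)> &Pred,
                           Range &R) {
  bool AtStart = Pred(R.Start);
  for (unsigned VF = R.Start * 2; VF < R.End; VF *= 2)
    if (Pred(VF) != AtStart) {
      R.End = VF;
      break;
    }
  return AtStart;
}

int main() {
  unsigned MinVF = 1, MaxVF = 8;
  auto Pred = [](unsigned VF) { return VF >= 4; }; // a toy decision
  std::vector<Range> Plans;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    Range Sub = {VF, MaxVF + 1};
    decideAndClamp(Pred, Sub); // building a plan may shrink Sub.End
    Plans.push_back(Sub);
    VF = Sub.End; // resume where the previous sub-range ended
  }
  for (const Range &R : Plans) // prints [1, 4) and [4, 9)
    printf("plan covers VF in [%u, %u)\n", R.Start, R.End);
  return 0;
}
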
+
+VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
+ BasicBlock *Dst,
+ VPlanPtr &Plan) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VPValue *SrcMask = createBlockInMask(Src, Plan);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional())
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask);
+
+ if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
+ EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
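
The convention above can be mimicked per lane with std::optional, where
nullopt stands for the all-one mask; a purely illustrative sketch:

#include <cstdio>
#include <optional>

static std::optional<bool> edgeMask(std::optional<bool> SrcMask, bool Cond,
                                    bool EdgeIsTrueSucc) {
  bool Edge = EdgeIsTrueSucc ? Cond : !Cond; // negate for false successor
  if (!SrcMask)
    return Edge; // source block is unmasked: the mask is just the condition
  return *SrcMask && Edge; // otherwise AND with the source block's mask
}

int main() {
  std::optional<bool> Header; // all-one: the loop header is unmasked
  auto T = edgeMask(Header, /*Cond=*/true, /*EdgeIsTrueSucc=*/true);
  auto F = edgeMask(Header, /*Cond=*/true, /*EdgeIsTrueSucc=*/false);
  printf("true-edge=%d false-edge=%d\n", *T, *F); // prints 1 0
  return 0;
}
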
+
+VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
+ VPlanPtr &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB)
+ return BlockMaskCache[BB] = BlockMask;
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
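
A companion sketch for the block-in mask: OR the incoming edge masks, and
propagate the all-one (nullopt) convention, since one unmasked incoming edge
makes the whole block unconditional. Illustrative only.

#include <cstdio>
#include <optional>
#include <vector>

static std::optional<bool>
blockMask(const std::vector<std::optional<bool>> &EdgeMasks) {
  std::optional<bool> Block; // no mask accumulated yet
  for (const auto &E : EdgeMasks) {
    if (!E)
      return std::nullopt; // an all-one edge makes the block mask all-one
    Block = Block ? (*Block || *E) : *E;
  }
  return Block;
}

int main() {
  auto M = blockMask({std::optional<bool>(false), std::optional<bool>(true)});
  printf("block mask = %s\n", !M ? "all-one" : (*M ? "1" : "0")); // prints 1
  return 0;
}
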
+
+VPInterleaveRecipe *
+LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+ if (!IG)
+ return nullptr;
+
+ // Now check if IG is relevant for VF's in the given range.
+ auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
+ return [=](unsigned VF) -> bool {
+ return (VF >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ };
+ if (!getDecisionAndClampRange(isIGMember(I), Range))
+ return nullptr;
+
+ // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
+ // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
+ // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
+ assert(I == IG->getInsertPos() &&
+ "Generating a recipe for an adjunct member of an interleave group");
+
+ return new VPInterleaveRecipe(IG);
+}
+
+VPWidenMemoryInstructionRecipe *
+LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (VF == 1)
+ return false;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+ "Interleave memory opportunity should be caught earlier.");
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPWidenMemoryInstructionRecipe(*I, Mask);
+}
+
+VPWidenIntOrFpInductionRecipe *
+LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
+ VFRange &Range) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
+
+ return nullptr;
+ }
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
+ [&](Instruction *K) -> std::function<bool(unsigned)> {
+ return
+ [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ };
+
+ if (isa<TruncInst>(I) &&
+ getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ cast<TruncInst>(I));
+ return nullptr;
+}
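
A hypothetical source pattern for the optimizable-truncate case: the truncated
value is itself an induction with the same start and step, so it can be
widened directly in the narrow type instead of truncating a wide vector every
iteration.

// trunc(i) to i16 forms a secondary induction {0,+,1} in the narrow type.
void truncatedIV(short *A, int N) {
  for (int i = 0; i < N; ++i)
    A[i] = (short)i; // an optimizable IV truncate, per the range check above
}
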
+
+VPBlendRecipe *
+LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi || Phi->getParent() == OrigLoop->getHeader())
+ return nullptr;
+
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ SmallVector<VPValue *, 2> Masks;
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VPValue *EdgeMask =
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ assert((EdgeMask || NumIncoming == 1) &&
+ "Multiple predecessors with one having a full mask");
+ if (EdgeMask)
+ Masks.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(Phi, Masks);
+}
+
+bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+ if (Legal->isScalarWithPredication(I))
+ return false;
+
+ auto IsVectorizableOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::Br:
+ case Instruction::Call:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::GetElementPtr:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::Load:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PHI:
+ case Instruction::PtrToInt:
+ case Instruction::SDiv:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::SRem:
+ case Instruction::Store:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UDiv:
+ case Instruction::UIToFP:
+ case Instruction::URem:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ return true;
+ }
+ return false;
+ };
+
+ if (!IsVectorizableOpcode(I->getOpcode()))
+ return false;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ return false;
+ }
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF)))
+ return false;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+      // The flag indicates whether to use an intrinsic or a plain call for
+      // the vectorized version of the instruction, i.e. whether performing
+      // the intrinsic call is more beneficial than the library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ }
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ assert(CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Scalarize &&
+ "Memory widening decisions should have been taken care by now");
+ return false;
+ }
+ return true;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return false;
+
+ // Success: widen this instruction. We optimize the common case where
+ // consecutive instructions can be represented by a single recipe.
+ if (!VPBB->empty()) {
+ VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
+ if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
+ return true;
+ }
+
+ VPBB->appendRecipe(new VPWidenRecipe(I));
+ return true;
+}
+
+VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan) {
+ bool IsUniform = getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ Range);
+
+ bool IsPredicated = Legal->isScalarWithPredication(I);
+ auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+ // Finalize the recipe for Instr, first if it is not predicated.
+ if (!IsPredicated) {
+ DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region =
+ VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
+ return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+}
+
+VPRegionBlock *
+LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Generate recipes to compute the block mask for this region.
+ VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *PHIRecipe =
+ Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ Entry->setTwoSuccessors(Pred, Exit);
+ Pred->setOneSuccessor(Exit);
+
+ return Region;
+}
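
Per-lane semantics of the triangle region built above, sketched as plain
scalar code (hypothetical sdiv ingredient; the phi in the continue block is
what VPPredInstPHIRecipe later materializes):

#include <cstdio>

int main() {
  bool MaskBit = true; // this lane's BlockInMask bit
  int Result = 0;      // stand-in for the undef incoming value
  if (MaskBit)         // pred.sdiv.entry: branch on the mask
    Result = 7 / 2;    // pred.sdiv.if: the replicated, guarded instruction
  // pred.sdiv.continue: a phi merges the guarded value with the other path.
  printf("lane result = %d\n", Result); // prints 3
  return 0;
}
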
+
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlan(VFRange &Range,
+ const SmallPtrSetImpl<Value *> &NeedDef) {
+ EdgeMaskCache.clear();
+ BlockMaskCache.clear();
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+ // scalar instead of vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ auto Plan = llvm::make_unique<VPlan>(VPBB);
+
+ // Represent values that will have defs inside VPlan.
+ for (Value *V : NeedDef)
+ Plan->addVPValue(V);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ // Relevant instructions from basic block BB will be grouped into VPRecipe
+ // ingredients and fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBB = FirstVPBBForBB;
+ Builder.setInsertPoint(VPBB);
+
+ std::vector<Instruction *> Ingredients;
+
+ // Organize the ingredients to vectorize from current basic block in the
+ // right order.
+ for (Instruction &I : *BB) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
+ DeadInstructions.count(Instr))
+ continue;
+
+      // Instr is a member of an InterleaveGroup for Range.Start. If it's an
+      // adjunct member of the IG, do not construct any Recipe for it.
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ if (IG && Instr != IG->getInsertPos() &&
+ Range.Start >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(Instr, Range.Start) ==
+ LoopVectorizationCostModel::CM_Interleave) {
+ if (SinkAfterInverse.count(Instr))
+ Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
+ continue;
+ }
+
+ // Move instructions to handle first-order recurrences, step 1: avoid
+ // handling this instruction until after we've handled the instruction it
+ // should follow.
+ auto SAIt = SinkAfter.find(Instr);
+ if (SAIt != SinkAfter.end()) {
+ DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
+ SinkAfterInverse[SAIt->second] = Instr;
+ continue;
+ }
+
+ Ingredients.push_back(Instr);
+
+ // Move instructions to handle first-order recurrences, step 2: push the
+ // instruction to be sunk at its insertion point.
+ auto SAInvIt = SinkAfterInverse.find(Instr);
+ if (SAInvIt != SinkAfterInverse.end())
+ Ingredients.push_back(SAInvIt->second);
+ }
+
+ // Introduce each ingredient into VPlan.
+ for (Instruction *Instr : Ingredients) {
+ VPRecipeBase *Recipe = nullptr;
+
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ continue;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ continue;
+
+ // Otherwise, if all widening options failed, Instruction is to be
+ // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB =
+ handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+ // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+  // may also be empty, such as the last one (VPBB), reflecting original
+  // basic blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ PreEntry->disconnectSuccessor(Entry);
+ delete PreEntry;
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
+ unsigned VF = Range.Start;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
+ for (VF *= 2; VF < Range.End; VF *= 2) {
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << "\\l\"";
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << " +\n"
+ << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
+ for (auto &Instr : make_range(Begin, End))
+ State.ILV->widenInstruction(Instr);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+ State.ILV->widenIntOrFpInduction(IV, Trunc);
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
+ State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+
+  assert((User || NumIncoming == 1) &&
+         "Multiple predecessors with one having a full mask");
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 =
+ State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(User->getOperand(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
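
The select chain reduces to the following standalone scalar sketch
(illustrative values; exactly one incoming edge mask is set per lane):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> In = {10, 20, 30};            // incoming values
  std::vector<bool> Mask = {false, true, false}; // per-edge masks
  int Blend = In[0]; // the first operand seeds the chain with no select
  for (size_t I = 1; I < In.size(); ++I)
    Blend = Mask[I] ? In[I] : Blend; // select(mask_i, in_i, previous)
  printf("blended = %d\n", Blend); // prints 20
  return 0;
}
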
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Interleave group being replicated.");
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+ if (State.Instance) { // Generate a single instance.
+ State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ // Insert scalar instance packing it into a vector.
+ if (AlsoPack && State.VF > 1) {
+ // If we're constructing lane 0, initialize to start from undef.
+ if (State.Instance->Lane == 0) {
+ Value *Undef =
+ UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ }
+ State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ }
+ return;
+ }
+
+  // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for
+  // each of the UF parts.
+ unsigned EndLane = IsUniform ? 1 : State.VF;
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+}
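
A worked count for the replication loop above (hypothetical UF=2, VF=4): a
uniform ingredient is scalarized once per unroll part, a non-uniform one once
per lane of every part.

#include <cstdio>

int main() {
  unsigned UF = 2, VF = 4;
  for (int IsUniform = 1; IsUniform >= 0; --IsUniform) {
    unsigned EndLane = IsUniform ? 1 : VF;
    unsigned Copies = 0;
    for (unsigned Part = 0; Part < UF; ++Part)
      for (unsigned Lane = 0; Lane < EndLane; ++Lane)
        ++Copies; // one scalar instance per (Part, Lane)
    printf("%s: %u scalar copies\n", IsUniform ? "uniform" : "non-uniform",
           Copies); // prints 2, then 8
  }
  return 0;
}
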
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ Value *ConditionBit = nullptr;
+ if (!User) // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+ else {
+ VPValue *BlockInMask = User->getOperand(0);
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ }
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
+ Instruction *ScalarPredInst = cast<Instruction>(
+ State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+ if (!User)
+ return State.ILV->vectorizeMemoryInstruction(&Instr);
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
@@ -7878,7 +8601,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7941,48 +8664,61 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
// Do not vectorize or interleaving the loop.
- ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second);
- ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
} else if (VectorizeLoop && !InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
- ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second);
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
} else if (VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ LVP.setBestPlan(VF.Width, IC);
+
using namespace ore;
+
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- LVP.executePlan(Unroller);
+ LVP.executePlan(Unroller, DT);
- ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")");
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LVP.executePlan(LB);
+ LVP.executePlan(LB, DT);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
@@ -7992,11 +8728,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -8012,7 +8750,6 @@ bool LoopVectorizePass::runImpl(
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_) {
-
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
@@ -8068,10 +8805,8 @@ bool LoopVectorizePass::runImpl(
// Process each loop nest in the function.
return Changed;
-
}
-
PreservedAnalyses LoopVectorizePass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
@@ -8088,7 +8823,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
bool Changed =