author    Dimitry Andric <dim@FreeBSD.org>  2017-05-16 19:46:52 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2017-05-16 19:46:52 +0000
commit    6b3f41ed88e8e440e11a4fbf20b6600529f80049 (patch)
tree      928b056f24a634d628c80238dbbf10d41b1a71d5 /lib/Transforms/Vectorize
parent    c46e6a5940c50058e00c0c5f9123fd82e338d29a (diff)
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |   2
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp       | 241
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp       | 112
3 files changed, 185 insertions(+), 170 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d721..9cf66382b5817 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
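
The new call is a semantic no-op: countMaxTrailingOnes() is a named accessor for exactly the expression the old code spelled out by hand. A minimal sketch of the equivalence, paraphrasing llvm/Support/KnownBits.h:

    // Wherever a bit is not known to be zero it may still be one, so the
    // maximum possible number of trailing ones equals the number of
    // trailing zeros in the Zero mask.
    unsigned countMaxTrailingOnes() const {
      return Zero.countTrailingZeros();
    }
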
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a4539623..516ab7d03a88e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header phis, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
EdgeMaskCacheTy;
typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
-
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// \brief The Loop exit block may have single value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop and we should have one value for
+ /// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single instruction within the innermost
- /// loop.
- void vectorizeInstruction(Instruction &I);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
/// passed Legality checks.
class LoopVectorizationPlanner {
public:
- LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
~LoopVectorizationPlanner() {}
@@ -2193,7 +2186,25 @@ public:
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
/// The profitability analysis.
LoopVectorizationCostModel &CM;
};
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+ isa<PHINode>(PreviousParts[UF - 1]))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
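
For reference, the deleted shuffle ladder reduced the vector in log2(VF) rounds; for a hypothetical VF = 4 add reduction over <a, b, c, d> it behaved as follows, with createTargetReduction now expected to emit an equivalent (possibly target-specific) sequence, honoring the NoNaN flag for floating-point min/max reductions:

    // Round i = 4: mask [2, 3, u, u] moves the upper half down.
    //   shuffle -> <c, d, u, u>;   add -> <a+c, b+d, u, u>
    // Round i = 2: mask [1, u, u, u]
    //   shuffle -> <b+d, u, u, u>; add -> <a+b+c+d, u, u, u>
    // The scalar result was then read out of lane 0 via extractelement.
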
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
if (!LCSSAPhi)
break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
-
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phis, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(CM);
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
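
Taken together, the LoopVectorize.cpp changes leave processLoop with a planner-driven flow along these lines (a condensed paraphrase of the hunks above; argument lists abbreviated):

    LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
    LoopVectorizationCostModel::VectorizationFactor VF =
        LVP.plan(OptForSize, UserVF);   // pick a width via the cost model
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB);                // skeleton -> widen -> fix
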
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c3..64013d6d687df 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
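
Although its local definition is deleted here, propagateIRFlags is still called further down (see the emitReduction hunk); the definition presumably now lives behind the llvm/Transforms/Utils/LoopUtils.h include this patch adds.
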
@@ -315,10 +300,10 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,7 +316,10 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
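
The getNumOccurrences() check is the usual cl::opt idiom for letting an explicit command-line value win over a computed default. A self-contained sketch of the pattern, using a hypothetical flag name (the real option is declared earlier in SLPVectorizer.cpp, outside this hunk):

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<unsigned> MinRegSizeFlag(
        "example-min-reg-size", llvm::cl::Hidden,
        llvm::cl::desc("Minimum vector register size in bits"));

    // getNumOccurrences() is nonzero only if the flag appeared on the
    // command line, so an unset flag falls through to the target query.
    unsigned MinVecRegSize;
    if (MinRegSizeFlag.getNumOccurrences())
      MinVecRegSize = MinRegSizeFlag;   // user forced a size
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); // target default
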
@@ -377,6 +365,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -415,6 +405,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -944,6 +936,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
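
The KnownBits replacement folds the old two-flag ComputeSignBit interface into one struct; isNonNegative() is just a sign-bit query, roughly:

    // The sign bit being known zero is exactly "known non-negative".
    bool isNonNegative() const {
      return Zero.isSignBitSet();
    }
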
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
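
These remark emissions are inert unless a consumer is enabled; assuming SV_NAME expands to the pass name (as used in INITIALIZE_PASS_END below), they can be surfaced with clang's -Rpass=<that name> or opt's -pass-remarks filter.
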
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}
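
Only the pairwise ladder remains hand-emitted after this change; for a hypothetical ReduxWidth = 4 add reduction over <a, b, c, d>, the masks from createRdxShuffleMask combine lanes like so:

    // Step i = 2: left mask [0, 2, u, u], right mask [1, 3, u, u]
    //   LeftShuf = <a, c, u, u>, RightShuf = <b, d, u, u>
    //   TmpVec   = <a+b, c+d, u, u>
    // Step i = 1: left mask [0, u, u, u], right mask [1, u, u, u]
    //   TmpVec   = <a+b+c+d, u, u, u>   // scalar result in lane 0
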
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {