diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000 |
commit | 6b3f41ed88e8e440e11a4fbf20b6600529f80049 (patch) | |
tree | 928b056f24a634d628c80238dbbf10d41b1a71d5 /lib/Transforms/Vectorize | |
parent | c46e6a5940c50058e00c0c5f9123fd82e338d29a (diff) |
Notes
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r-- | lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 241 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 112 |
3 files changed, 185 insertions, 170 deletions
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 97dcb40a1d721..9cf66382b5817 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { if (!Safe) { KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.Zero.countTrailingZeros() < (BitWidth - 1)) + if (Known.countMaxTrailingOnes() < (BitWidth - 1)) Safe = true; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a4539623..516ab7d03a88e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -391,13 +391,14 @@ public: TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), AddedSafetyChecks(false) {} - // Perform the actual loop widening (vectorization). - void vectorize() { - // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); - // Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); - } + /// Create a new empty loop. Unlink the old loop and connect the new one. + void createVectorizedLoopSkeleton(); + + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); + + /// Fix the vectorized code, taking care of header phi's, live-outs, and more. + void fixVectorizedLoop(); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -425,9 +426,6 @@ protected: EdgeMaskCacheTy; typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy; - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, @@ -436,8 +434,6 @@ protected: /// Create a new induction variable inside L. PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL); - /// Copy and widen the instructions from the old loop. - virtual void vectorizeLoop(); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(); @@ -450,10 +446,10 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); - /// \brief The Loop exit block may have single value PHI nodes where the - /// incoming value is 'Undef'. While vectorizing we only handled real values - /// that were defined inside the loop. Here we fix the 'undef case'. - /// See PR14725. + /// \brief The Loop exit block may have single value PHI nodes with some + /// incoming value. While vectorizing we only handled real values + /// that were defined inside the loop and we should have one value for + /// each predecessor of its parent basic block. See PR14725. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -464,11 +460,6 @@ protected: /// respective conditions. void predicateInstructions(); - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); @@ -481,10 +472,6 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single instruction within the innermost - /// loop. - void vectorizeInstruction(Instruction &I); - /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -1700,6 +1687,9 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -2185,7 +2175,10 @@ public: /// passed Legality checks. class LoopVectorizationPlanner { public: - LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM) + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} ~LoopVectorizationPlanner() {} @@ -2193,7 +2186,25 @@ public: LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); + +protected: + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. + void collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions); + private: + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + /// The profitablity analysis. LoopVectorizationCostModel &CM; }; @@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } } -void InnerLoopVectorizer::vectorizeLoop() { - //===------------------------------------------------===// - // - // Notice: any optimization or new instruction that go - // into the code below should be also be implemented in - // the cost-model. - // - //===------------------------------------------------===// - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - vectorizeInstruction(I); - +void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF > 1) @@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + // guaranteed to be an instruction in the vector loop. Also, if the previous + // value is a phi node, we should insert after all the phi nodes to avoid + // breaking basic block verification. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || + isa<PHINode>(PreviousParts[UF - 1])) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( @@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. + bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); if (!LCSSAPhi) break; - if (LCSSAPhi->getNumIncomingValues() == 1) - LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), - LoopMiddleBlock); - } -} - -void InnerLoopVectorizer::collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - if (all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); - })) - DeadInstructions.insert(IndUpdate); + if (LCSSAPhi->getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + "Incoming value isn't loop invariant"); + LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + } } } @@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { + // Perform the actual loop transformation. + + // 1. Create a new empty loop. Unlink the old loop and connect the new one. + ILV.createVectorizedLoopSkeleton(); + + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should also be implemented in + // the cost-model. + // + //===------------------------------------------------===// + + // 2. Copy and widen instructions from the old loop into the new loop. + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); + + // 3. Fix the vectorized code: take care of header phi's, live-outs, + // predication, updating analyses. + ILV.fixVectorizedLoop(); +} + +void LoopVectorizationPlanner::collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions) { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast<StoreInst>(Instr); bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); @@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - Unroller.vectorize(); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LB.vectorize(); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index f112c555205c3..64013d6d687df 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -40,7 +40,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <memory> @@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { return Opcode; } -/// Get the intersection (logical and) of all of the potential IR flags -/// of each scalar operation (VL) that will be converted into a vector (I). -/// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { - if (auto *VecOp = dyn_cast<Instruction>(I)) { - if (auto *I0 = dyn_cast<Instruction>(VL[0])) { - // VecOVp is initialized to the 0th scalar, so start counting from index - // '1'. - VecOp->copyIRFlags(I0); - for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast<Instruction>(VL[i])) - VecOp->andIRFlags(Scalar); - } - } - } -} - /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef<Value *> VL) { @@ -315,10 +300,10 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, - const DataLayout *DL) + const DataLayout *DL, OptimizationRemarkEmitter *ORE) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), - DL(DL), Builder(Se->getContext()) { + DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -331,7 +316,10 @@ public: else MaxVecRegSize = TTI->getRegisterBitWidth(true); - MinVecRegSize = MinVectorRegSizeOption; + if (MinVectorRegSizeOption.getNumOccurrences()) + MinVecRegSize = MinVectorRegSizeOption; + else + MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } /// \brief Vectorize the tree that starts with the elements in \p VL. @@ -377,6 +365,8 @@ public: MinBWs.clear(); } + unsigned getTreeSize() const { return VectorizableTree.size(); } + /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -415,6 +405,8 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + OptimizationRemarkEmitter *getORE() { return ORE; } + private: struct TreeEntry; @@ -944,6 +936,8 @@ private: AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; + OptimizationRemarkEmitter *ORE; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. @@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, - Op2VK, Op1VP, Op2VP); + SmallVector<const Value *, 4> Operands(VL0->operand_values()); + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP); + Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. IsKnownPositive = all_of(TreeRoot, [&](Value *R) { - bool KnownZero = false; - bool KnownOne = false; - ComputeSignBit(R, KnownZero, KnownOne, *DL); - return KnownZero; + KnownBits Known = computeKnownBits(R, *DL); + return Known.isNonNegative(); }); // Determine the maximum number of bits required to store the scalar @@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<DemandedBitsWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); @@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); auto *AC = &AM.getResult<AssumptionAnalysis>(F); auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); + auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); if (!Changed) return PreservedAnalyses::all(); @@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_) { + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_) { SE = SE_; TTI = TTI_; TLI = TLI_; @@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. @@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + using namespace ore; + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[i])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + R.vectorizeTree(); // Move to the next bundle. @@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", + cast<Instruction>(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); + Value *VectorizedRoot = R.vectorizeTree(); // Reconstruct the build vector by extracting the vectorized root. This @@ -4506,6 +4517,12 @@ public: DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); + auto *I0 = cast<Instruction>(VL[0]); + V.getORE()->emit( + OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize())); // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); @@ -4513,7 +4530,7 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, @@ -4583,33 +4600,31 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef<Value *> RedOps) { + unsigned ReduxWidth, ArrayRef<Value *> RedOps, + const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + if (!IsPairwiseReduction) + return createSimpleTargetReduction( + Builder, TTI, ReductionOpcode, VectorizedValue, + TargetTransformInfo::ReductionFlags(), RedOps); + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - if (IsPairwiseReduction) { - Value *LeftMask = + Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = + Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); - Value *LeftShuf = Builder.CreateShuffleVector( + Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( + Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); - } else { - Value *UpperHalf = - createRdxShuffleMask(ReduxWidth, i, false, false, Builder); - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); - } + TmpVec = + Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); propagateIRFlags(TmpVec, RedOps); } @@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { |