diff options
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 241 |
1 files changed, 120 insertions, 121 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a453962..516ab7d03a88 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -391,13 +391,14 @@ public: TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), AddedSafetyChecks(false) {} - // Perform the actual loop widening (vectorization). - void vectorize() { - // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); - // Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); - } + /// Create a new empty loop. Unlink the old loop and connect the new one. + void createVectorizedLoopSkeleton(); + + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); + + /// Fix the vectorized code, taking care of header phi's, live-outs, and more. + void fixVectorizedLoop(); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -425,9 +426,6 @@ protected: EdgeMaskCacheTy; typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy; - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, @@ -436,8 +434,6 @@ protected: /// Create a new induction variable inside L. PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL); - /// Copy and widen the instructions from the old loop. - virtual void vectorizeLoop(); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(); @@ -450,10 +446,10 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); - /// \brief The Loop exit block may have single value PHI nodes where the - /// incoming value is 'Undef'. While vectorizing we only handled real values - /// that were defined inside the loop. Here we fix the 'undef case'. - /// See PR14725. + /// \brief The Loop exit block may have single value PHI nodes with some + /// incoming value. While vectorizing we only handled real values + /// that were defined inside the loop and we should have one value for + /// each predecessor of its parent basic block. See PR14725. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -464,11 +460,6 @@ protected: /// respective conditions. void predicateInstructions(); - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); @@ -481,10 +472,6 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single instruction within the innermost - /// loop. - void vectorizeInstruction(Instruction &I); - /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -1700,6 +1687,9 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -2185,7 +2175,10 @@ public: /// passed Legality checks. class LoopVectorizationPlanner { public: - LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM) + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} ~LoopVectorizationPlanner() {} @@ -2193,7 +2186,25 @@ public: LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); + +protected: + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. + void collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions); + private: + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + /// The profitablity analysis. LoopVectorizationCostModel &CM; }; @@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } } -void InnerLoopVectorizer::vectorizeLoop() { - //===------------------------------------------------===// - // - // Notice: any optimization or new instruction that go - // into the code below should be also be implemented in - // the cost-model. - // - //===------------------------------------------------===// - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - vectorizeInstruction(I); - +void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF > 1) @@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + // guaranteed to be an instruction in the vector loop. Also, if the previous + // value is a phi node, we should insert after all the phi nodes to avoid + // breaking basic block verification. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || + isa<PHINode>(PreviousParts[UF - 1])) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( @@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. + bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); if (!LCSSAPhi) break; - if (LCSSAPhi->getNumIncomingValues() == 1) - LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), - LoopMiddleBlock); - } -} - -void InnerLoopVectorizer::collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - if (all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); - })) - DeadInstructions.insert(IndUpdate); + if (LCSSAPhi->getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + "Incoming value isn't loop invariant"); + LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + } } } @@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { + // Perform the actual loop transformation. + + // 1. Create a new empty loop. Unlink the old loop and connect the new one. + ILV.createVectorizedLoopSkeleton(); + + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should also be implemented in + // the cost-model. + // + //===------------------------------------------------===// + + // 2. Copy and widen instructions from the old loop into the new loop. + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); + + // 3. Fix the vectorized code: take care of header phi's, live-outs, + // predication, updating analyses. + ILV.fixVectorizedLoop(); +} + +void LoopVectorizationPlanner::collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions) { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast<StoreInst>(Instr); bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); @@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - Unroller.vectorize(); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LB.vectorize(); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are |