diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 314 |
1 files changed, 174 insertions, 140 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37ae13666f7a..99c265fc5101 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -435,7 +435,7 @@ struct InstructionsState { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + bool isAltShuffle() const { return AltOp != MainOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -581,7 +581,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. -static MemoryLocation getLocation(Instruction *I, AAResults *AA) { +static MemoryLocation getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -1417,7 +1417,11 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - ++HashMap[NumFreeOpsHash.Hash].first; + auto It = HashMap.find(NumFreeOpsHash.Hash); + if (It == HashMap.end()) + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + else + ++It->second.first; } } // Select the lane with the minimum counter. @@ -2019,9 +2023,7 @@ private: } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { - return getOpcode() != getAltOpcode(); - } + bool isAltShuffle() const { return MainOp != AltOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -2519,12 +2521,11 @@ private: SD->IsScheduled = true; LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ScheduleData *BundleMember = SD; - while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { + if (BundleMember->Inst != BundleMember->OpValue) continue; - } + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2589,7 +2590,6 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } - BundleMember = BundleMember->NextInBundle; } } @@ -2618,6 +2618,10 @@ private: } } + /// Build a bundle from the ScheduleData nodes corresponding to the + /// scalar instruction for each lane. + ScheduleData *buildBundle(ArrayRef<Value *> VL); + /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. @@ -3040,7 +3044,7 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. - DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; + DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; @@ -3051,6 +3055,29 @@ void BoUpSLP::reorderTopToBottom() { const std::unique_ptr<TreeEntry> &TE) { if (Optional<OrdersType> CurrentOrder = getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + // Do not include ordering for nodes used in the alt opcode vectorization, + // better to reorder them during bottom-to-top stage. If follow the order + // here, it causes reordering of the whole graph though actually it is + // profitable just to reorder the subgraph that starts from the alternate + // opcode vectorization node. Such nodes already end-up with the shuffle + // instruction and it is just enough to change this shuffle rather than + // rotate the scalars for the whole graph. + unsigned Cnt = 0; + const TreeEntry *UserTE = TE.get(); + while (UserTE && Cnt < RecursionMaxDepth) { + if (UserTE->UserTreeIndices.size() != 1) + break; + if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { + return EI.UserTE->State == TreeEntry::Vectorize && + EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; + })) + return; + if (UserTE->UserTreeIndices.empty()) + UserTE = nullptr; + else + UserTE = UserTE->UserTreeIndices.back().UserTE; + ++Cnt; + } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -3066,7 +3093,7 @@ void BoUpSLP::reorderTopToBottom() { // Try to find the most profitable order. We just are looking for the most // used order and reorder scalar elements in the nodes according to this // mostly used order. - const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond(); + ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); // All operands are reordered and used only in this node - propagate the // most used order to the user node. MapVector<OrdersType, unsigned, @@ -4459,6 +4486,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, CurrentOrder.clear(); return false; } + if (ShouldKeepOrder) + CurrentOrder.clear(); return ShouldKeepOrder; } @@ -7202,6 +7231,33 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleSeq.clear(); } +BoUpSLP::ScheduleData * +BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { + ScheduleData *Bundle = nullptr; + ScheduleData *PrevInBundle = nullptr; + for (Value *V : VL) { + ScheduleData *BundleMember = getScheduleData(V); + assert(BundleMember && + "no ScheduleData for bundle member " + "(maybe not in same basic block)"); + assert(BundleMember->isSchedulingEntity() && + "bundle member already part of other bundle"); + if (PrevInBundle) { + PrevInBundle->NextInBundle = BundleMember; + } else { + Bundle = BundleMember; + } + BundleMember->UnscheduledDepsInBundle = 0; + Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; + + // Group the instructions to a bundle. + BundleMember->FirstInBundle = Bundle; + PrevInBundle = BundleMember; + } + assert(Bundle && "Failed to find schedule bundle"); + return Bundle; +} + // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. Optional<BoUpSLP::ScheduleData *> @@ -7214,12 +7270,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; - ScheduleData *PrevInBundle = nullptr; - ScheduleData *Bundle = nullptr; - bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); - auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, + auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, ScheduleData *Bundle) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to @@ -7263,39 +7316,28 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Otherwise the compiler may crash trying to incorrectly calculate // dependencies and emit instruction in the wrong order at the actual // scheduling. - TryScheduleBundle(/*ReSchedule=*/false, nullptr); + TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr); return None; } } + bool ReSchedule = false; for (Value *V : VL) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); - if (BundleMember->IsScheduled) { - // A bundle member was scheduled as single instruction before and now - // needs to be scheduled as part of the bundle. We just get rid of the - // existing schedule. - LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember - << " was already scheduled\n"); - ReSchedule = true; - } - assert(BundleMember->isSchedulingEntity() && - "bundle member already part of other bundle"); - if (PrevInBundle) { - PrevInBundle->NextInBundle = BundleMember; - } else { - Bundle = BundleMember; - } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; - - // Group the instructions to a bundle. - BundleMember->FirstInBundle = Bundle; - PrevInBundle = BundleMember; + if (!BundleMember->IsScheduled) + continue; + // A bundle member was scheduled as single instruction before and now + // needs to be scheduled as part of the bundle. We just get rid of the + // existing schedule. + LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember + << " was already scheduled\n"); + ReSchedule = true; } - assert(Bundle && "Failed to find schedule bundle"); - TryScheduleBundle(ReSchedule, Bundle); + + auto *Bundle = buildBundle(VL); + TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return None; @@ -7464,20 +7506,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, while (!WorkList.empty()) { ScheduleData *SD = WorkList.pop_back_val(); - - ScheduleData *BundleMember = SD; - while (BundleMember) { + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { assert(isInSchedulingRegion(BundleMember)); - if (!BundleMember->hasValidDependencies()) { + if (BundleMember->hasValidDependencies()) + continue; - LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember - << "\n"); - BundleMember->Dependencies = 0; - BundleMember->resetUnscheduledDeps(); + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember + << "\n"); + BundleMember->Dependencies = 0; + BundleMember->resetUnscheduledDeps(); - // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + // Handle def-use chain dependencies. + if (BundleMember->OpValue != BundleMember->Inst) { + ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + for (User *U : BundleMember->Inst->users()) { + assert(isa<Instruction>(U) && + "user of instruction must be instruction"); + ScheduleData *UseSD = getScheduleData(U); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; @@ -7486,89 +7541,69 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa<Instruction>(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. - BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } - } } + } - // Handle the memory dependencies. - ScheduleData *DepDest = BundleMember->NextLoadStore; - if (DepDest) { - Instruction *SrcInst = BundleMember->Inst; - MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; - unsigned DistToSrc = 1; - - while (DepDest) { - assert(isInSchedulingRegion(DepDest)); + // Handle the memory dependencies (if any). + ScheduleData *DepDest = BundleMember->NextLoadStore; + if (!DepDest) + continue; + Instruction *SrcInst = BundleMember->Inst; + assert(SrcInst->mayReadOrWriteMemory() && + "NextLoadStore list for non memory effecting bundle?"); + MemoryLocation SrcLoc = getLocation(SrcInst); + bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; - // We have two limits to reduce the complexity: - // 1) AliasedCheckLimit: It's a small limit to reduce calls to - // SLP->isAliased (which is the expensive part in this loop). - // 2) MaxMemDepDistance: It's for very large blocks and it aborts - // the whole loop (even if the loop is fast, it's quadratic). - // It's important for the loop break condition (see below) to - // check this limit even between two read-only instructions. - if (DistToSrc >= MaxMemDepDistance || - ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || - SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + for ( ; DepDest; DepDest = DepDest->NextLoadStore) { + assert(isInSchedulingRegion(DepDest)); - // We increment the counter only if the locations are aliased - // (instead of counting all alias checks). This gives a better - // balance between reduced runtime and accurate dependencies. - numAliased++; + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } - } - DepDest = DepDest->NextLoadStore; + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; - // Example, explaining the loop break condition: Let's assume our - // starting instruction is i0 and MaxMemDepDistance = 3. - // - // +--------v--v--v - // i0,i1,i2,i3,i4,i5,i6,i7,i8 - // +--------^--^--^ - // - // MaxMemDepDistance let us stop alias-checking at i3 and we add - // dependencies from i0 to i3,i4,.. (even if they are not aliased). - // Previously we already added dependencies from i3 to i6,i7,i8 - // (because of MaxMemDepDistance). As we added a dependency from - // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 - // and we can abort this loop at i6. - if (DistToSrc >= 2 * MaxMemDepDistance) - break; - DistToSrc++; + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } - BundleMember = BundleMember->NextInBundle; } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); @@ -7638,8 +7673,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. - ScheduleData *BundleMember = picked; - while (BundleMember) { + for (ScheduleData *BundleMember = picked; BundleMember; + BundleMember = BundleMember->NextInBundle) { Instruction *pickedInst = BundleMember->Inst; if (pickedInst->getNextNode() != LastScheduledInst) { BS->BB->getInstList().remove(pickedInst); @@ -7647,7 +7682,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { pickedInst); } LastScheduledInst = pickedInst; - BundleMember = BundleMember->NextInBundle; } BS->schedule(picked, ReadyInsts); @@ -8045,8 +8079,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { + LLVM_DEBUG( + dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); return false; + } // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) @@ -8693,7 +8730,6 @@ class HorizontalReduction { static RecurKind getRdxKind(Instruction *I) { assert(I && "Expected instruction for reduction matching"); - TargetTransformInfo::ReductionFlags RdxFlags; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -8767,7 +8803,6 @@ class HorizontalReduction { return RecurKind::None; } - TargetTransformInfo::ReductionFlags RdxFlags; switch (Pred) { default: return RecurKind::None; @@ -9206,7 +9241,7 @@ private: auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false, CostKind); + /*IsUnsigned=*/false, CostKind); CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9571,8 +9606,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); - // Aggregate value is unlikely to be processed in vector register, we need to - // extract scalars into scalar registers, so NeedExtraction is set true. + // Aggregate value is unlikely to be processed in vector register. return tryToVectorizeList(BuildVectorOpds, R); } @@ -9598,7 +9632,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, function_ref<unsigned(T *)> Limit, function_ref<bool(T *, T *)> Comparator, function_ref<bool(T *, T *)> AreCompatible, - function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool LimitForRegisterSize) { bool Changed = false; // Sort by type, parent, operands. @@ -9627,7 +9661,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // same/alternate ops only, this may result in some extra final // vectorization. if (NumElts > 1 && - TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { // Success start over because instructions might have been changed. Changed = true; } else if (NumElts < Limit(*IncIt) && @@ -9638,7 +9672,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // Final attempt to vectorize instructions with the same types. if (Candidates.size() > 1 && (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) { + if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) { // Success start over because instructions might have been changed. Changed = true; } else if (LimitForRegisterSize) { @@ -9649,7 +9683,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) ++SameTypeIt; unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts), + if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts), /*LimitForRegisterSize=*/false)) Changed = true; It = SameTypeIt; |
