diff options
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 298 |
1 files changed, 259 insertions, 39 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 30b98ec88c243..783d22fafee9d 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -346,7 +346,7 @@ class MachineBlockPlacement : public MachineFunctionPass { const MachineBranchProbabilityInfo *MBPI; /// A handle to the function-wide block frequency pass. - std::unique_ptr<BranchFolder::MBFIWrapper> MBFI; + std::unique_ptr<MBFIWrapper> MBFI; /// A handle to the loop info. MachineLoopInfo *MLI; @@ -374,6 +374,9 @@ class MachineBlockPlacement : public MachineFunctionPass { /// must be done inline. TailDuplicator TailDup; + /// Partial tail duplication threshold. + BlockFrequency DupThreshold; + /// Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -399,6 +402,10 @@ class MachineBlockPlacement : public MachineFunctionPass { SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits; #endif + /// Scale the DupThreshold according to basic block size. + BlockFrequency scaleThreshold(MachineBasicBlock *BB); + void initDupThreshold(); + /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. void markChainSuccessors( @@ -421,6 +428,11 @@ class MachineBlockPlacement : public MachineFunctionPass { const MachineBasicBlock *BB, const MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter); + void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, const MachineBasicBlock *LoopHeaderBB, @@ -1141,6 +1153,11 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( if (NumDup == 0) return false; + // If profile information is available, findDuplicateCandidates can do more + // precise benefit analysis. + if (F->getFunction().hasProfileData()) + return true; + // This is mainly for function exit BB. // The integrated tail duplication is really designed for increasing // fallthrough from predecessors from Succ to its successors. We may need @@ -1169,9 +1186,6 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( // // A small number of extra duplication may not hurt too much. We need a better // heuristic to handle it. - // - // FIXME: we should selectively tail duplicate a BB into part of its - // predecessors. if ((NumDup > Succ->succ_size()) || !Duplicate) return false; @@ -1556,7 +1570,7 @@ MachineBlockPlacement::selectBestSuccessor( // For blocks with CFG violations, we may be able to lay them out anyway with // tail-duplication. We keep this vector so we can perform the probability // calculations the minimum number of times. - SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> + SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4> DupCandidates; for (MachineBasicBlock *Succ : Successors) { auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); @@ -1570,7 +1584,7 @@ MachineBlockPlacement::selectBestSuccessor( Chain, BlockFilter)) { // If tail duplication would make Succ profitable, place it. if (allowTailDupPlacement() && shouldTailDuplicate(Succ)) - DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); + DupCandidates.emplace_back(SuccProb, Succ); continue; } @@ -1799,11 +1813,11 @@ void MachineBlockPlacement::buildChain( // Placement may have changed tail duplication opportunities. // Check for that now. if (allowTailDupPlacement() && BestSucc && ShouldTailDup) { - // If the chosen successor was duplicated into all its predecessors, - // don't bother laying it out, just go round the loop again with BB as - // the chain end. - if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, - BlockFilter, PrevUnplacedBlockIt)) + repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, + BlockFilter, PrevUnplacedBlockIt); + // If the chosen successor was duplicated into BB, don't bother laying + // it out, just go round the loop again with BB as the chain end. + if (!BB->isSuccessor(BestSucc)) continue; } @@ -2082,8 +2096,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, // In practice this never happens though: there always seems to be a preheader // that can fallthrough and that is also placed before the header. bool OptForSize = F->getFunction().hasOptSize() || - llvm::shouldOptimizeForSize(L.getHeader(), PSI, - &MBFI->getMBFI()); + llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()); if (OptForSize) return L.getHeader(); @@ -2616,7 +2629,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. - SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. + SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI) { MachineBasicBlock *BB = &*FI; @@ -2626,7 +2639,7 @@ void MachineBlockPlacement::buildCFGChains() { // the exact fallthrough behavior for. while (true) { Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) break; @@ -2690,6 +2703,20 @@ void MachineBlockPlacement::buildCFGChains() { assert(!BadFunc && "Detected problems with the block placement."); }); + // Remember original layout ordering, so we can update terminators after + // reordering to point to the original layout successor. + SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors( + F->getNumBlockIDs()); + { + MachineBasicBlock *LastMBB = nullptr; + for (auto &MBB : *F) { + if (LastMBB != nullptr) + OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB; + LastMBB = &MBB; + } + OriginalLayoutSuccessors[F->back().getNumber()] = nullptr; + } + // Splice the blocks into place. MachineFunction::iterator InsertPos = F->begin(); LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n"); @@ -2711,7 +2738,7 @@ void MachineBlockPlacement::buildCFGChains() { // than assert when the branch cannot be analyzed in order to remove this // boiler plate. Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. #ifndef NDEBUG if (!BlocksWithUnanalyzableExits.count(PrevBB)) { @@ -2747,15 +2774,18 @@ void MachineBlockPlacement::buildCFGChains() { // TBB = FBB = nullptr; // } // } - if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) - PrevBB->updateTerminator(); + if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) { + PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); + } } // Fixup the last block. Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. - if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) - F->back().updateTerminator(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. + if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) { + MachineBasicBlock *PrevBB = &F->back(); + PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); + } BlockWorkList.clear(); EHPadWorkList.clear(); @@ -2763,17 +2793,17 @@ void MachineBlockPlacement::buildCFGChains() { void MachineBlockPlacement::optimizeBranches() { BlockChain &FunctionChain = *BlockToChain[&F->front()]; - SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. + SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. // Now that all the basic blocks in the chain have the proper layout, - // make a final call to AnalyzeBranch with AllowModify set. + // make a final call to analyzeBranch with AllowModify set. // Indeed, the target may be able to optimize the branches in a way we // cannot because all branches may not be analyzable. // E.g., the target may be able to remove an unconditional branch to // a fallthrough when it occurs after predicated terminators. for (MachineBasicBlock *ChainBB : FunctionChain) { Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) { // If PrevBB has a two-way branch, try to re-order the branches // such that we branch to the successor with higher probability first. @@ -2789,7 +2819,6 @@ void MachineBlockPlacement::optimizeBranches() { DebugLoc dl; // FIXME: this is nowhere TII->removeBranch(*ChainBB); TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl); - ChainBB->updateTerminator(); } } } @@ -2841,7 +2870,7 @@ void MachineBlockPlacement::alignBlocks() { continue; // If the global profiles indicates so, don't align it. - if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) && + if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) && !TLI->alignLoopsWithOptSize()) continue; @@ -2901,10 +2930,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( // duplicated into is still small enough to be duplicated again. // No need to call markBlockSuccessors in this case, as the blocks being // duplicated from here on are already scheduled. - // Note that DuplicatedToLPred always implies Removed. - while (DuplicatedToLPred) { - assert(Removed && "Block must have been removed to be duplicated into its " - "layout predecessor."); + while (DuplicatedToLPred && Removed) { MachineBasicBlock *DupBB, *DupPred; // The removal callback causes Chain.end() to be updated when a block is // removed. On the first pass through the loop, the chain end should be the @@ -2943,8 +2969,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( /// chosen in the given order due to unnatural CFG /// only needed if \p BB is removed and /// \p PrevUnplacedBlockIt pointed to \p BB. -/// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will -/// only be true if the block was removed. +/// \p DuplicatedToLPred - True if the block was duplicated into LPred. /// \return - True if the block was duplicated into all preds and removed. bool MachineBlockPlacement::maybeTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *LPred, @@ -3012,8 +3037,18 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; bool IsSimple = TailDup.isSimpleBB(BB); - TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, - &DuplicatedPreds, &RemovalCallbackRef); + SmallVector<MachineBasicBlock *, 8> CandidatePreds; + SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr; + if (F->getFunction().hasProfileData()) { + // We can do partial duplication with precise profile information. + findDuplicateCandidates(CandidatePreds, BB, BlockFilter); + if (CandidatePreds.size() == 0) + return false; + if (CandidatePreds.size() < BB->pred_size()) + CandidatePtr = &CandidatePreds; + } + TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, + &RemovalCallbackRef, CandidatePtr); // Update UnscheduledPredecessors to reflect tail-duplication. DuplicatedToLPred = false; @@ -3036,6 +3071,191 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( return Removed; } +// Count the number of actual machine instructions. +static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { + uint64_t InstrCount = 0; + for (MachineInstr &MI : *MBB) { + if (!MI.isPHI() && !MI.isMetaInstruction()) + InstrCount += 1; + } + return InstrCount; +} + +// The size cost of duplication is the instruction size of the duplicated block. +// So we should scale the threshold accordingly. But the instruction size is not +// available on all targets, so we use the number of instructions instead. +BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) { + return DupThreshold.getFrequency() * countMBBInstruction(BB); +} + +// Returns true if BB is Pred's best successor. +bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, + MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter) { + if (BB == Pred) + return false; + if (BlockFilter && !BlockFilter->count(Pred)) + return false; + BlockChain *PredChain = BlockToChain[Pred]; + if (PredChain && (Pred != *std::prev(PredChain->end()))) + return false; + + // Find the successor with largest probability excluding BB. + BranchProbability BestProb = BranchProbability::getZero(); + for (MachineBasicBlock *Succ : Pred->successors()) + if (Succ != BB) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain && (Succ != *SuccChain->begin())) + continue; + BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ); + if (SuccProb > BestProb) + BestProb = SuccProb; + } + + BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB); + if (BBProb <= BestProb) + return false; + + // Compute the number of reduced taken branches if Pred falls through to BB + // instead of another successor. Then compare it with threshold. + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + BlockFrequency Gain = PredFreq * (BBProb - BestProb); + return Gain > scaleThreshold(BB); +} + +// Find out the predecessors of BB and BB can be beneficially duplicated into +// them. +void MachineBlockPlacement::findDuplicateCandidates( + SmallVectorImpl<MachineBasicBlock *> &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter) { + MachineBasicBlock *Fallthrough = nullptr; + BranchProbability DefaultBranchProb = BranchProbability::getZero(); + BlockFrequency BBDupThreshold(scaleThreshold(BB)); + SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end()); + SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end()); + + // Sort for highest frequency. + auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B); + }; + auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); + }; + llvm::stable_sort(Succs, CmpSucc); + llvm::stable_sort(Preds, CmpPred); + + auto SuccIt = Succs.begin(); + if (SuccIt != Succs.end()) { + DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl(); + } + + // For each predecessors of BB, compute the benefit of duplicating BB, + // if it is larger than the threshold, add it into Candidates. + // + // If we have following control flow. + // + // PB1 PB2 PB3 PB4 + // \ | / /\ + // \ | / / \ + // \ |/ / \ + // BB----/ OB + // /\ + // / \ + // SB1 SB2 + // + // And it can be partially duplicated as + // + // PB2+BB + // | PB1 PB3 PB4 + // | | / /\ + // | | / / \ + // | |/ / \ + // | BB----/ OB + // |\ /| + // | X | + // |/ \| + // SB2 SB1 + // + // The benefit of duplicating into a predecessor is defined as + // Orig_taken_branch - Duplicated_taken_branch + // + // The Orig_taken_branch is computed with the assumption that predecessor + // jumps to BB and the most possible successor is laid out after BB. + // + // The Duplicated_taken_branch is computed with the assumption that BB is + // duplicated into PB, and one successor is layout after it (SB1 for PB1 and + // SB2 for PB2 in our case). If there is no available successor, the combined + // block jumps to all BB's successor, like PB3 in this example. + // + // If a predecessor has multiple successors, so BB can't be duplicated into + // it. But it can beneficially fall through to BB, and duplicate BB into other + // predecessors. + for (MachineBasicBlock *Pred : Preds) { + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + + if (!TailDup.canTailDuplicate(BB, Pred)) { + // BB can't be duplicated into Pred, but it is possible to be layout + // below Pred. + if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) { + Fallthrough = Pred; + if (SuccIt != Succs.end()) + SuccIt++; + } + continue; + } + + BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb; + BlockFrequency DupCost; + if (SuccIt == Succs.end()) { + // Jump to all successors; + if (Succs.size() > 0) + DupCost += PredFreq; + } else { + // Fallthrough to *SuccIt, jump to all other successors; + DupCost += PredFreq; + DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt); + } + + assert(OrigCost >= DupCost); + OrigCost -= DupCost; + if (OrigCost > BBDupThreshold) { + Candidates.push_back(Pred); + if (SuccIt != Succs.end()) + SuccIt++; + } + } + + // No predecessors can optimally fallthrough to BB. + // So we can change one duplication into fallthrough. + if (!Fallthrough) { + if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) { + Candidates[0] = Candidates.back(); + Candidates.pop_back(); + } + } +} + +void MachineBlockPlacement::initDupThreshold() { + DupThreshold = 0; + if (!F->getFunction().hasProfileData()) + return; + + BlockFrequency MaxFreq = 0; + for (MachineBasicBlock &MBB : *F) { + BlockFrequency Freq = MBFI->getBlockFreq(&MBB); + if (Freq > MaxFreq) + MaxFreq = Freq; + } + + // FIXME: we may use profile count instead of frequency, + // and need more fine tuning. + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + DupThreshold = MaxFreq * ThresholdProb; +} + bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -3046,7 +3266,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { F = &MF; MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); - MBFI = std::make_unique<BranchFolder::MBFIWrapper>( + MBFI = std::make_unique<MBFIWrapper>( getAnalysis<MachineBlockFrequencyInfo>()); MLI = &getAnalysis<MachineLoopInfo>(); TII = MF.getSubtarget().getInstrInfo(); @@ -3054,6 +3274,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MPDT = nullptr; PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + initDupThreshold(); + // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. PreferredLoopExit = nullptr; @@ -3088,7 +3310,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (OptForSize) TailDupSize = 1; bool PreRegAlloc = false; - TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI, + TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI, /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); } @@ -3107,9 +3329,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, PSI, TailMergeSize); - auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>(); - if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), - MMIWP ? &MMIWP->getMMI() : nullptr, MLI, + if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI, /*AfterPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); |